diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..3e4e96171cb122fd7de3ef9c6ba4e92c40bc1414 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,17 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/22/f22et4hxfdbezzlil53lu2pcyq6hgd3sgpjtfwxqwvo3nhcfcvx3/iqgyrwyxmlbu22glkzz24rsbjiieww3sllxux2ttk4c6gtoiuba filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/6v/f6v2dym5xl4l4b2xlv35ic4ajld4mxcvhsvdsiwx2uug77q36cad/nhugjvtrt6cm53zizhnw673hj52m5m3kegaqvkjzkf4qh6d6rhm filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/ah/fahbtdmoejcqs352pnbnedqns63nbnu6hdbrwzvf6chptnsannjh/fhpbiokcxxh7ksbfgiljcvh7erywotuv4ddvlfcb4fk2ef7dd5c filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/bn/fbnlruhvmagcngqd5is2xjbucjaq7uf3sgsbdahfi6ovtehbhzyo/62gizmmmqz43cclymnr7ftyo5qt7ux4og6bc2xazrwstkjpsy2e filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/e2/fe2tjoiexjbavh5sakfaxvga43vsvwn5ev5bzhfjg76jvmtjqtbn/ejg3u4qymaxsvvl2vdequli7pwsrdjf5zdgqjkgbrxdsvgfv3h4 filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/fr/ffrx7clryowwzulnhruopihutvaxlycymqopsyoha6yecifyw2m2/g3wh462wylribiwz4th3gnlt5rtnrcb5bkad6w3yucxodv3q5ks filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/k6/fk6cfyjfeiu7xe6ebkapsnixuplqczgfc5534mitqsfkssbzjyak/4xid4w6sg2yg7xaseouf2vwhp2fyff56a2t6z6ownb3yw3g25rk filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/kj/fkjh2kykxecmnv6oe3zzwtjpek77nmrm35vgv2daxfgkim6xfk4u/gigbnwpmixz5epksvvrh4mtg3nxlpy724eojb3ajzvtepgvx7y4 filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/n6/fn6x7m44e35jdmh6iqj3eqiyrz7tbhzd3rqartt67myyrnickjmp/um3sgirsxogup4murdiaoy7dxu4ogolqsa343kh56kq24zd53fb filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/te/fte5y7bccssideiluepvpscj6srf7orxnfgql6to32ni27zf2uv2/vmrpdvw3meiqsf22oras6imorrybogkgp6jjr3ddtreaiuutais filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/tz/ftzd5ordyehsowurkwjjpkso24gayhyplcd6wz7xdv53fad276l6/36luuy7klcmb7554z63umrezysn6xbat5wxfaadxh4clxhcn2j7 filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/u4/fu47dchf76mmiajgnawm3xgek4ysnnmqaavupgy4cddyxygid6iq/725pjrxppjygb6kbca6zklwmn7iunv65thw23b5s4im6zi27j3i filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/uy/fuygegwmldon4qz3wvjs3cld4hnjz6yxh6aa2cmfsal4u3xxws43/l4w2mroymid3qdffvzt4wffavpm5it6rzi6lmbvxzzezfkbavuo filter=lfs diff=lfs merge=lfs -text
+torchinductor/fxgraph/w5/fw5vzdkweh3kv3fm3mnal4wu63gxhw2anwx2pzuved4acfz4fdzm/n5dfreyro3slkufuydw2d54nm7bfiskxjjasoyg2z5yept5c3rf filter=lfs diff=lfs merge=lfs -text
diff --git a/meta.json b/meta.json
new file mode 100644
index 0000000000000000000000000000000000000000..e7ad09fa1fd47d6e160658e55a91e1da64044e8a
--- /dev/null
+++ b/meta.json
@@ -0,0 +1,44 @@
+{
+  "cache_layout_version": 1,
+  "created_at": "2026-01-23T07:13:39Z",
+  "model_path": "/root/.cache/huggingface/hub/models--black-forest-labs--FLUX.2-klein-9B/snapshots/cd1bba5810fe2aba6666d9cf7352e25436426039",
+  "compile_command": [
+    "/usr/bin/python",
+    "/app/tensorrt_llm/visual_gen/examples/flux2_klein_9b.py",
+    "--model_path",
+    "/root/.cache/huggingface/hub/models--black-forest-labs--FLUX.2-klein-9B/snapshots/cd1bba5810fe2aba6666d9cf7352e25436426039",
+    "--height",
+    "512",
+    "--width",
+    "1024",
+    "--num_inference_steps",
+    "4",
+    "--num_images",
+    "6",
+    "--linear_type",
+    "te-fp8-per-tensor",
+    "--fallback_linear_type",
+    "default",
+    "--torch_compile_mode",
+    "default",
+    "--offload_text_encoder"
+  ],
+  "height": 512,
+  "width": 1024,
+  "num_inference_steps": 4,
+  "num_images": 6,
+  "linear_type": "te-fp8-per-tensor",
+  "fallback_linear_type": "default",
+  "torch_compile_mode": "default",
+  "offload_text_encoder": true,
+  "offload_vae": false,
+  "disable_cuda_graph": false,
+  "disable_teacache": false,
+  "torch_version": "2.10.0a0+b4e4ee81d3.nv25.12",
+  "cuda_version": "13.1",
+  "device_name": "NVIDIA GeForce RTX 4090",
+  "device_capability": [
+    8,
+    9
+  ]
+}
\ No newline at end of file
diff --git a/torchinductor/2h/a581feca05a976cd76073f2f954a7641097b9c5775b12cf6831b3149d528a8b4.best_config b/torchinductor/2h/a581feca05a976cd76073f2f954a7641097b9c5775b12cf6831b3149d528a8b4.best_config
new file mode 100644
index 0000000000000000000000000000000000000000..d289cec8e8a34a7e59e27d318f3de362d687b99b
--- /dev/null
+++ b/torchinductor/2h/a581feca05a976cd76073f2f954a7641097b9c5775b12cf6831b3149d528a8b4.best_config
@@ -0,0 +1 @@
+{"XBLOCK": 64, "YBLOCK": 64, "num_warps": 8, "num_stages": 1, "configs_hash": "1ce421918d79ed0f7edb09d0ee64f016daf650a007a21866fe52d592be55380c", "found_by_coordesc": false, "time_taken_ms": 143, "triton_cache_hash": "RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA"}
\ No newline at end of file
diff --git a/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py b/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py
new file mode 100644
index 0000000000000000000000000000000000000000..8fbb9cd5967793d73bf715bc65ea3cb93bdb0d48
--- /dev/null
+++ b/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py
@@ -0,0 +1,70 @@
+
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'y': 131072, 'x': 128}, tile_hint=TileHint.DEFAULT,
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*fp32', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*fp32', 'in_ptr5': '*bf16', 'out_ptr0': '*bf16', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid2DWithYZOverflow', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__fused_rms_norm_cat_view_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 6, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'y': 589824, 'x': 75497984}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused__fused_rms_norm_cat_view_2(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
+    ynumel = 73728
+    xnumel = 128
+    yoffset = (tl.program_id(1) + tl.program_id(2) * tl.num_programs(1)) * YBLOCK
+    yindex = yoffset + tl.arange(0, YBLOCK)[:, None]
+    ymask = yindex < ynumel
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[None, :]
+    xmask = xindex < xnumel
+    y1 = yindex // 32
+    x2 = xindex
+    y0 = (yindex % 32)
+    y3 = yindex
+    tmp0 = y1
+    tmp1 = tl.full([1, 1], 0, tl.int64)
+    tmp2 = tmp0 >= tmp1
+    tmp3 = tl.full([1, 1], 256, tl.int64)
+    tmp4 = tmp0 < tmp3
+    tmp5 = tl.load(in_ptr0 + (x2 + 128*y0 + 12288*(y1)), tmp4 & xmask & ymask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+    tmp6 = tmp5.to(tl.float32)
+    tmp7 = tl.load(in_ptr1 + (tl.broadcast_to(y0 + 32*(y1), [YBLOCK, XBLOCK])), tmp4 & xmask & ymask, eviction_policy='evict_last', other=0.0)
+    tmp8 = 128.0
+    tmp9 = (tmp7 / tmp8)
+    tmp10 = 1e-06
+    tmp11 = tmp9 + tmp10
+    tmp12 = libdevice.rsqrt(tmp11)
+    tmp13 = tmp6 * tmp12
+    tmp14 = tl.load(in_ptr2 + (tl.broadcast_to(x2, [YBLOCK, XBLOCK])), tmp4 & xmask & ymask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+    tmp15 = tmp14.to(tl.float32)
+    tmp16 = tmp13 * tmp15
+    tmp17 = tmp16.to(tl.float32)
+    tmp18 = tl.full(tmp17.shape, 0.0, tmp17.dtype)
+    tmp19 = tl.where(tmp4, tmp17, tmp18)
+    tmp20 = tmp0 >= tmp3
+    tmp21 = tl.full([1, 1], 2304, tl.int64)
+    tmp22 = tmp0 < tmp21
+    tmp23 = tl.load(in_ptr3 + (x2 + 128*y0 + 12288*((-256) + y1)), tmp20 & xmask & ymask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+    tmp24 = tmp23.to(tl.float32)
+    tmp25 = tl.load(in_ptr4 + (tl.broadcast_to(y0 + 32*((-256) + y1), [YBLOCK, XBLOCK])), tmp20 & xmask & ymask, eviction_policy='evict_last', other=0.0)
+    tmp26 = 128.0
+    tmp27 = (tmp25 / tmp26)
+    tmp28 = 1e-06
+    tmp29 = tmp27 + tmp28
+    tmp30 = libdevice.rsqrt(tmp29)
+    tmp31 = tmp24 * tmp30
+    tmp32 = tl.load(in_ptr5 + (tl.broadcast_to(x2, [YBLOCK, XBLOCK])), tmp20 & xmask & ymask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+    tmp33 = tmp32.to(tl.float32)
+    tmp34 = tmp31 * tmp33
+    tmp35 = tmp34.to(tl.float32)
+    tmp36 = tl.full(tmp35.shape, 0.0, tmp35.dtype)
+    tmp37 = tl.where(tmp20, tmp35, tmp36)
+    tmp38 = tl.where(tmp4, tmp19, tmp37)
+    tl.store(out_ptr0 + (x2 + 128*y3), tmp38, xmask & ymask)
diff --git a/torchinductor/2o/c2oduffhka4c52657rppatcdtgtnibm42qywfo2spmul2dpsj6jj.py b/torchinductor/2o/c2oduffhka4c52657rppatcdtgtnibm42qywfo2spmul2dpsj6jj.py
new file mode 100644
index 0000000000000000000000000000000000000000..10b6da72848e57b5999f4a1a9747babb3ae470d2
--- /dev/null
+++ b/torchinductor/2o/c2oduffhka4c52657rppatcdtgtnibm42qywfo2spmul2dpsj6jj.py
@@ -0,0 +1,297 @@
+# AOT ID: ['0_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+import triton
+import triton.language as tl
+from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+
+aten = torch.ops.aten
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py
+# Topologically Sorted Source Nodes: [norm_hidden_states, add, mul, norm_hidden_states_1], Original ATen: [aten.native_layer_norm, aten.add, aten.mul]
+# Source node to ATen node mapping:
+#   add => add_1
+#   mul => mul_1
+#   norm_hidden_states => add, convert_element_type, convert_element_type_1, mul, rsqrt, sub, var_mean
+#   norm_hidden_states_1 => add_2
+# Graph fragment:
+#   %arg0_1 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0" = PlaceHolder[target=arg0_1]
+#   %arg1_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg1_1]
+#   %getitem_1 : Tensor "f32[1, 2048, 1][2048, 1, 2048]cuda:0" = PlaceHolder[target=getitem_1]
+#   %buf1 : Tensor "f32[1, 2048, 1][2048, 1, 2048]cuda:0" = PlaceHolder[target=buf1]
+#   %arg2_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg2_1]
+#   %convert_element_type : Tensor "f32[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%arg0_1, torch.float32), kwargs = {})
+#   %var_mean : [num_users=2] = call_function[target=torch.ops.aten.var_mean.correction](args = (%convert_element_type, [2]), kwargs = {correction: 0, keepdim: True})
+#   %add_1 : Tensor "bf16[1, 1, 4096][4096, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg1_1, 1), kwargs = {})
+#   %sub : Tensor "f32[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%convert_element_type, %getitem_1), kwargs = {})
+#   %add : Tensor "f32[1, 2048, 1][2048, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%getitem, 1e-06), kwargs = {})
+#   %rsqrt : Tensor "f32[1, 2048, 1][2048, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add,), kwargs = {})
+#   %mul : Tensor "f32[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub, %rsqrt), kwargs = {})
+#   %convert_element_type_1 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul, torch.bfloat16), kwargs = {})
+#   %mul_1 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_1, %convert_element_type_1), kwargs = {})
+#   %add_2 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_1, %arg2_1), kwargs = {})
+#   return %getitem_1,%buf1,%add_2
+triton_red_fused_add_mul_native_layer_norm_0 = async_compile.triton('triton_red_fused_add_mul_native_layer_norm_0', '''
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.reduction(
+    size_hints={'x': 2048, 'r0_': 4096},
+    reduction_hint=ReductionHint.INNER,
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr2': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_add_mul_native_layer_norm_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 4, 'num_store': 1, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'add_persistent_rblock': True, 'tiling_scores': {'x': 0, 'r0_': 50348032}}
+)
+@triton.jit
+def triton_red_fused_add_mul_native_layer_norm_0(in_ptr0, in_ptr1, in_ptr2, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
+    xnumel = 2048
+    r0_numel = 4096
+    rnumel = r0_numel
+    RBLOCK: tl.constexpr = R0_BLOCK
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+    xmask = xindex < xnumel
+    r0_base = tl.arange(0, R0_BLOCK)[None, :]
+    rbase = r0_base
+    x0 = xindex
+    tmp3_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp3_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp3_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp1 = tmp0.to(tl.float32)
+        tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK])
+        tmp3_mean_next, tmp3_m2_next, tmp3_weight_next = triton_helpers.welford_reduce(
+            tmp2, tmp3_mean, tmp3_m2, tmp3_weight, roffset == 0
+        )
+        tmp3_mean = tl.where(r0_mask & xmask, tmp3_mean_next, tmp3_mean)
+        tmp3_m2 = tl.where(r0_mask & xmask, tmp3_m2_next, tmp3_m2)
+        tmp3_weight = tl.where(r0_mask & xmask, tmp3_weight_next, tmp3_weight)
+    tmp4, tmp5, tmp6 = triton_helpers.welford(tmp3_mean, tmp3_m2, tmp3_weight, 1)
+    tmp3 = tmp4[:, None]
+    tmp7 = tmp5[:, None]
+    tmp8 = tmp6[:, None]
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp9 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp12 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp23 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp10 = 1.0
+        tmp11 = tmp9 + tmp10
+        tmp13 = tmp12.to(tl.float32)
+        tmp14 = tmp13 - tmp3
+        tmp15 = 4096.0
+        tmp16 = (tmp7 / tmp15)
+        tmp17 = 1e-06
+        tmp18 = tmp16 + tmp17
+        tmp19 = libdevice.rsqrt(tmp18)
+        tmp20 = tmp14 * tmp19
+        tmp21 = tmp20.to(tl.float32)
+        tmp22 = tmp11 * tmp21
+        tmp24 = tmp22 + tmp23
+        tl.store(out_ptr2 + (r0_1 + 4096*x0), tmp24, r0_mask & xmask)
+''', device_str='cuda')
+
+
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py
+# Topologically Sorted Source Nodes: [norm_encoder_hidden_states, add_2, mul_1, norm_encoder_hidden_states_1], Original ATen: [aten.native_layer_norm, aten.add, aten.mul]
+# Source node to ATen node mapping:
+#   add_2 => add_4
+#   mul_1 => mul_3
+#   norm_encoder_hidden_states => add_3, convert_element_type_2, convert_element_type_3, mul_2, rsqrt_1, sub_1, var_mean_1
+#   norm_encoder_hidden_states_1 => add_5
+# Graph fragment:
+#   %arg3_1 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0" = PlaceHolder[target=arg3_1]
+#   %arg4_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg4_1]
+#   %getitem_3 : Tensor "f32[1, 256, 1][256, 1, 256]cuda:0" = PlaceHolder[target=getitem_3]
+#   %buf4 : Tensor "f32[1, 256, 1][256, 1, 256]cuda:0" = PlaceHolder[target=buf4]
+#   %arg5_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg5_1]
+#   %convert_element_type_2 : Tensor "f32[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%arg3_1, torch.float32), kwargs = {})
+#   %var_mean_1 : [num_users=2] = call_function[target=torch.ops.aten.var_mean.correction](args = (%convert_element_type_2, [2]), kwargs = {correction: 0, keepdim: True})
+#   %add_4 : Tensor "bf16[1, 1, 4096][4096, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg4_1, 1), kwargs = {})
+#   %sub_1 : Tensor "f32[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%convert_element_type_2, %getitem_3), kwargs = {})
+#   %add_3 : Tensor "f32[1, 256, 1][256, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%getitem_2, 1e-06), kwargs = {})
+#   %rsqrt_1 : Tensor "f32[1, 256, 1][256, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add_3,), kwargs = {})
+#   %mul_2 : Tensor "f32[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_1, %rsqrt_1), kwargs = {})
+#   %convert_element_type_3 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_2, torch.bfloat16), kwargs = {})
+#   %mul_3 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_4, %convert_element_type_3), kwargs = {})
+#   %add_5 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_3, %arg5_1), kwargs = {})
+#   return %getitem_3,%buf4,%add_5
+triton_red_fused_add_mul_native_layer_norm_1 = async_compile.triton('triton_red_fused_add_mul_native_layer_norm_1', '''
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.reduction(
+    size_hints={'x': 256, 'r0_': 4096},
+    reduction_hint=ReductionHint.INNER,
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr2': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_add_mul_native_layer_norm_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 4, 'num_store': 1, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'add_persistent_rblock': True, 'tiling_scores': {'x': 0, 'r0_': 6307840}}
+)
+@triton.jit
+def triton_red_fused_add_mul_native_layer_norm_1(in_ptr0, in_ptr1, in_ptr2, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
+    xnumel = 256
+    r0_numel = 4096
+    rnumel = r0_numel
+    RBLOCK: tl.constexpr = R0_BLOCK
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+    xmask = xindex < xnumel
+    r0_base = tl.arange(0, R0_BLOCK)[None, :]
+    rbase = r0_base
+    x0 = xindex
+    tmp3_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp3_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp3_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp1 = tmp0.to(tl.float32)
+        tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK])
+        tmp3_mean_next, tmp3_m2_next, tmp3_weight_next = triton_helpers.welford_reduce(
+            tmp2, tmp3_mean, tmp3_m2, tmp3_weight, roffset == 0
+        )
+        tmp3_mean = tl.where(r0_mask & xmask, tmp3_mean_next, tmp3_mean)
+        tmp3_m2 = tl.where(r0_mask & xmask, tmp3_m2_next, tmp3_m2)
+        tmp3_weight = tl.where(r0_mask & xmask, tmp3_weight_next, tmp3_weight)
+    tmp4, tmp5, tmp6 = triton_helpers.welford(tmp3_mean, tmp3_m2, tmp3_weight, 1)
+    tmp3 = tmp4[:, None]
+    tmp7 = tmp5[:, None]
+    tmp8 = tmp6[:, None]
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp9 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp12 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp23 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp10 = 1.0
+        tmp11 = tmp9 + tmp10
+        tmp13 = tmp12.to(tl.float32)
+        tmp14 = tmp13 - tmp3
+        tmp15 = 4096.0
+        tmp16 = (tmp7 / tmp15)
+        tmp17 = 1e-06
+        tmp18 = tmp16 + tmp17
+        tmp19 = libdevice.rsqrt(tmp18)
+        tmp20 = tmp14 * tmp19
+        tmp21 = tmp20.to(tl.float32)
+        tmp22 = tmp11 * tmp21
+        tmp24 = tmp22 + tmp23
+        tl.store(out_ptr2 + (r0_1 + 4096*x0), tmp24, r0_mask & xmask)
+''', device_str='cuda')
+
+
+async_compile.wait(globals())
+del async_compile
+
+class Runner:
+    def __init__(self, partitions):
+        self.partitions = partitions
+
+    def recursively_apply_fns(self, fns):
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):
+        arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1 = args
+        args.clear()
+        assert_size_stride(arg0_1, (1, 2048, 4096), (8388608, 4096, 1))
+        assert_size_stride(arg1_1, (1, 1, 4096), (24576, 24576, 1))
+        assert_size_stride(arg2_1, (1, 1, 4096), (24576, 24576, 1))
+        assert_size_stride(arg3_1, (1, 256, 4096), (1048576, 4096, 1))
+        assert_size_stride(arg4_1, (1, 1, 4096), (24576, 24576, 1))
+        assert_size_stride(arg5_1, (1, 1, 4096), (24576, 24576, 1))
+        with torch.cuda._DeviceGuard(0):
+            torch.cuda.set_device(0)
+            buf6 = empty_strided_cuda((1, 2048, 4096), (8388608, 4096, 1), torch.bfloat16)
+            # Topologically Sorted Source Nodes: [norm_hidden_states, add, mul, norm_hidden_states_1], Original ATen: [aten.native_layer_norm, aten.add, aten.mul]
+            stream0 = get_raw_stream(0)
+            triton_red_fused_add_mul_native_layer_norm_0.run(arg0_1, arg1_1, arg2_1, buf6, 2048, 4096, stream=stream0)
+            del arg0_1
+            del arg1_1
+            del arg2_1
+            buf7 = empty_strided_cuda((1, 256, 4096), (1048576, 4096, 1), torch.bfloat16)
+            # Topologically Sorted Source Nodes: [norm_encoder_hidden_states, add_2, mul_1, norm_encoder_hidden_states_1], Original ATen: [aten.native_layer_norm, aten.add, aten.mul]
+            stream0 = get_raw_stream(0)
+            triton_red_fused_add_mul_native_layer_norm_1.run(arg3_1, arg4_1, arg5_1, buf7, 256, 4096, stream=stream0)
+            del arg3_1
+            del arg4_1
+            del arg5_1
+        return (buf6, buf7, )
+
+runner = Runner(partitions=[])
+call = runner.call
+recursively_apply_fns = runner.recursively_apply_fns
+
+
+def benchmark_compiled_module(times=10, repeat=10):
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = rand_strided((1, 2048, 4096), (8388608, 4096, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg1_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg2_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg3_1 = rand_strided((1, 256, 4096), (1048576, 4096, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg4_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg5_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16)
+    fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1])
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)
diff --git a/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py b/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e42f89ff78e62b3218fe7765bb4da20c0715404
--- /dev/null
+++ b/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py
@@ -0,0 +1,45 @@
+
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 67108864},  # 2**26; the kernel body itself fixes xnumel to 37748736
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_cat_mul_silu_split_view_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 3, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 377487360}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_cat_mul_silu_split_view_0(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):  # each 16384-wide output row = 4096 values from in_ptr0, then 12288 values of silu(a) * b from in_ptr1
+    xnumel = 37748736  # 2304 * 16384; overwrites the xnumel argument with a constant
+    xoffset = tl.program_id(0) * XBLOCK  # first linear element handled by this program
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)  # all-true and not used below: the store passes None as its mask; NOTE(review): this presumably relies on the grid covering exactly xnumel elements
+    x0 = (xindex % 16384)  # column inside the output row
+    x1 = xindex // 16384  # output row
+    x2 = xindex  # linear output offset
+    tmp0 = x0
+    tmp1 = tl.full([1], 0, tl.int64)
+    tmp2 = tmp0 >= tmp1  # lower bound of the first segment; not used below
+    tmp3 = tl.full([1], 4096, tl.int64)
+    tmp4 = tmp0 < tmp3  # True for columns 0..4095 (first segment)
+    tmp5 = tl.load(in_ptr0 + (4096*x1 + (x0)), tmp4, eviction_policy='evict_last', other=0.0).to(tl.float32)  # in_ptr0 is read with a row stride of 4096; masked-off lanes yield 0.0
+    tmp6 = tmp0 >= tmp3  # True for columns 4096..16383 (second segment)
+    tmp7 = tl.full([1], 16384, tl.int64)
+    tmp8 = tmp0 < tmp7  # upper bound of the second segment; not used below
+    tmp9 = tl.load(in_ptr1 + (36864*x1 + ((-4096) + x0)), tmp6, eviction_policy='evict_last', other=0.0).to(tl.float32)  # in_ptr1 columns 0..12287 of a 36864-strided row
+    tmp10 = tmp9.to(tl.float32)
+    tmp11 = tl.sigmoid(tmp10)
+    tmp12 = tmp10 * tmp11  # x * sigmoid(x), i.e. SiLU
+    tmp13 = tmp12.to(tl.float32)
+    tmp14 = tl.load(in_ptr1 + (12288 + 36864*x1 + ((-4096) + x0)), tmp6, eviction_policy='evict_last', other=0.0).to(tl.float32)  # in_ptr1 columns 12288..24575 of the same row
+    tmp15 = tmp13 * tmp14  # SiLU result multiplied elementwise by the second slice
+    tmp16 = tl.full(tmp15.shape, 0.0, tmp15.dtype)
+    tmp17 = tl.where(tmp6, tmp15, tmp16)  # zero outside the second segment
+    tmp18 = tl.where(tmp4, tmp5, tmp17)  # first segment takes the in_ptr0 value, the rest takes the product
+    tl.store(out_ptr0 + (x2), tmp18, None)  # unmasked store at the linear offset
diff --git a/torchinductor/3i/cf1587a2fd240ce39177274973308f6fd100d746bf6716a8d96ed4fd12c89d55.best_config b/torchinductor/3i/cf1587a2fd240ce39177274973308f6fd100d746bf6716a8d96ed4fd12c89d55.best_config
new file mode 100644
index 0000000000000000000000000000000000000000..a8b7d3123d4d086103b46178b97f09b296ed11b8
--- /dev/null
+++ b/torchinductor/3i/cf1587a2fd240ce39177274973308f6fd100d746bf6716a8d96ed4fd12c89d55.best_config
@@ -0,0 +1 @@
+{"XBLOCK": 512, "num_warps": 8, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 81, "triton_cache_hash": "PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ"}
\ No newline at end of file
diff --git a/torchinductor/3v/4a00da1b5d4ce251d2cb392c24118fc2e6c3818f25b8457665f0d53e12234277.best_config b/torchinductor/3v/4a00da1b5d4ce251d2cb392c24118fc2e6c3818f25b8457665f0d53e12234277.best_config
new file mode 100644
index 0000000000000000000000000000000000000000..67290006b21855bf0166f929dde8fe79bbdc46d9
--- /dev/null
+++ b/torchinductor/3v/4a00da1b5d4ce251d2cb392c24118fc2e6c3818f25b8457665f0d53e12234277.best_config
@@ -0,0 +1 @@
+{"XBLOCK": 1024, "num_warps": 4, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 43, "triton_cache_hash": "SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A"}
\ No newline at end of file
diff --git a/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py b/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py
new file mode 100644
index 0000000000000000000000000000000000000000..65ddf1168e44a098e3e59f5d2bb8c3ca7d867482
--- /dev/null
+++ b/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py
@@ -0,0 +1,28 @@
+
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 16777216},  # 2**24; the kernel body itself fixes xnumel to 9437184
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'ks0': 'i64', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 37748736}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1(in_ptr0, out_ptr0, ks0, xnumel, XBLOCK : tl.constexpr):  # copies in_ptr0 (read at x0 + 128*x2 + ks0*x1) into out_ptr0 at consecutive offsets
+    xnumel = 9437184  # 32 * 2304 * 128; overwrites the xnumel argument with a constant
+    xoffset = tl.program_id(0) * XBLOCK  # first linear element handled by this program
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)  # all-true and not used below: load and store both pass None as the mask
+    x0 = (xindex % 128)  # fastest-varying output index, 0..127
+    x1 = ((xindex // 128) % 2304)  # middle output index, 0..2303
+    x2 = xindex // 294912  # slowest output index; 294912 = 2304 * 128
+    x3 = xindex  # linear output offset
+    tmp0 = tl.load(in_ptr0 + (x0 + 128*x2 + ks0*x1), None).to(tl.float32)  # input side: x2 advances by 128 elements, x1 by the runtime stride ks0
+    tl.store(out_ptr0 + (x3), tmp0, None)  # output side: x1 and x2 swap nesting order relative to the input addressing
diff --git a/torchinductor/4y/c4ykjyk6fv6enet6mgkj5bsan42tc6rsdfs7aaskpjgv5rzw7tbr.py b/torchinductor/4y/c4ykjyk6fv6enet6mgkj5bsan42tc6rsdfs7aaskpjgv5rzw7tbr.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a225ba4fb5bfa2323ec7e00316fe01a313d660c
--- /dev/null
+++ b/torchinductor/4y/c4ykjyk6fv6enet6mgkj5bsan42tc6rsdfs7aaskpjgv5rzw7tbr.py
@@ -0,0 +1,357 @@
+# AOT ID: ['25_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+import triton
+import triton.language as tl
+from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+
+aten = torch.ops.aten
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py
+# Topologically Sorted Source Nodes: [split, chunk, query_1, query_2, reshape, unbind, key_1, key_2, reshape_1, unbind_1, float_1, cos, mul, neg, stack, x_rotated, float_2, sin, mul_1, add, out, float_3, cos_2, mul_2, neg_1, stack_1, x_rotated_1, float_4, sin_2, mul_3, add_1, out_1], Original ATen: [aten.split_with_sizes, aten.split, aten.view, aten._fused_rms_norm, aten.unbind, aten._to_copy, aten.unsqueeze, aten.mul, aten.neg, aten.stack, aten.add]
+# Source node to ATen node mapping:
+#   add => add_2
+#   add_1 => add_3
+#   chunk => split
+#   cos => unsqueeze, unsqueeze_1
+#   cos_2 => unsqueeze_6, unsqueeze_7
+#   float_1 => convert_element_type_4
+#   float_2 => convert_element_type_5
+#   float_3 => convert_element_type_7
+#   float_4 => convert_element_type_8
+#   key_1 => view_1
+#   key_2 => add_1, convert_element_type_2, convert_element_type_3, mean_1, mul_2, mul_3, pow_2, rsqrt_1
+#   mul => mul_4
+#   mul_1 => mul_5
+#   mul_2 => mul_6
+#   mul_3 => mul_7
+#   neg => neg
+#   neg_1 => neg_1
+#   out => convert_element_type_6
+#   out_1 => convert_element_type_9
+#   query_1 => view
+#   query_2 => add, convert_element_type, convert_element_type_1, mean, mul, mul_1, pow_1, rsqrt
+#   reshape => view_3
+#   reshape_1 => view_5
+#   sin => unsqueeze_2, unsqueeze_3
+#   sin_2 => unsqueeze_8, unsqueeze_9
+#   split => split_with_sizes
+#   stack => cat, unsqueeze_4, unsqueeze_5
+#   stack_1 => cat_1, unsqueeze_10, unsqueeze_11
+#   unbind => unbind
+#   unbind_1 => unbind_1
+#   x_rotated => view_4
+#   x_rotated_1 => view_6
+# Graph fragment:
+#   %arg0_1 : Tensor "bf16[1, 2304, 36864][84934656, 36864, 1]cuda:0" = PlaceHolder[target=arg0_1]
+#   %buf0 : Tensor "f32[1, 2304, 32, 1][73728, 32, 1, 73728]cuda:0" = PlaceHolder[target=buf0]
+#   %arg1_1 : Tensor "bf16[128][1]cuda:0" = PlaceHolder[target=arg1_1]
+#   %arg3_1 : Tensor "f32[2304, 128][128, 1]cuda:0" = PlaceHolder[target=arg3_1]
+#   %cat : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0" = PlaceHolder[target=cat]
+#   %arg4_1 : Tensor "f32[2304, 128][128, 1]cuda:0" = PlaceHolder[target=arg4_1]
+#   %buf1 : Tensor "f32[1, 2304, 32, 1][73728, 32, 1, 73728]cuda:0" = PlaceHolder[target=buf1]
+#   %arg2_1 : Tensor "bf16[128][1]cuda:0" = PlaceHolder[target=arg2_1]
+#   %cat_1 : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0" = PlaceHolder[target=cat_1]
+#   %split_with_sizes : [num_users=2] = call_function[target=torch.ops.aten.split_with_sizes.default](args = (%arg0_1, [12288, 24576], -1), kwargs = {})
+#   %split : [num_users=3] = call_function[target=torch.ops.aten.split.Tensor](args = (%getitem, 4096, -1), kwargs = {})
+#   %view : Tensor "bf16[1, 2304, 32, 128][84934656, 36864, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%getitem_2, [1, 2304, 32, 128]), kwargs = {})
+#   %convert_element_type : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view, torch.float32), kwargs = {})
+#   %pow_1 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.pow.Tensor_Scalar](args = (%convert_element_type, 2), kwargs = {})
+#   %mean : Tensor "f32[1, 2304, 32, 1][73728, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mean.dim](args = (%pow_1, [3], True), kwargs = {})
+#   %add : Tensor "f32[1, 2304, 32, 1][73728, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Scalar](args = (%mean, 1e-06), kwargs = {})
+#   %rsqrt : Tensor "f32[1, 2304, 32, 1][73728, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add,), kwargs = {})
+#   %mul : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type, %rsqrt), kwargs = {})
+#   %mul_1 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul, %arg1_1), kwargs = {})
+#   %convert_element_type_1 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_1, torch.bfloat16), kwargs = {})
+#   %view_3 : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%convert_element_type_1, [1, 2304, 32, -1, 2]), kwargs = {})
+#   %unbind : [num_users=2] = call_function[target=torch.ops.aten.unbind.int](args = (%view_3, -1), kwargs = {})
+#   %view_1 : Tensor "bf16[1, 2304, 32, 128][84934656, 36864, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%getitem_3, [1, 2304, 32, 128]), kwargs = {})
+#   %convert_element_type_2 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_1, torch.float32), kwargs = {})
+#   %pow_2 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.pow.Tensor_Scalar](args = (%convert_element_type_2, 2), kwargs = {})
+#   %mean_1 : Tensor "f32[1, 2304, 32, 1][73728, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mean.dim](args = (%pow_2, [3], True), kwargs = {})
+#   %add_1 : Tensor "f32[1, 2304, 32, 1][73728, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Scalar](args = (%mean_1, 1e-06), kwargs = {})
+#   %rsqrt_1 : Tensor "f32[1, 2304, 32, 1][73728, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add_1,), kwargs = {})
+#   %mul_2 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_2, %rsqrt_1), kwargs = {})
+#   %mul_3 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_2, %arg2_1), kwargs = {})
+#   %convert_element_type_3 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_3, torch.bfloat16), kwargs = {})
+#   %view_5 : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%convert_element_type_3, [1, 2304, 32, -1, 2]), kwargs = {})
+#   %unbind_1 : [num_users=2] = call_function[target=torch.ops.aten.unbind.int](args = (%view_5, -1), kwargs = {})
+#   %convert_element_type_4 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%convert_element_type_1, torch.float32), kwargs = {})
+#   %unsqueeze : Tensor "f32[1, 2304, 128][294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%arg3_1, 0), kwargs = {})
+#   %unsqueeze_1 : Tensor "f32[1, 2304, 1, 128][294912, 128, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze, 2), kwargs = {})
+#   %mul_4 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_4, %unsqueeze_1), kwargs = {})
+#   %neg : Tensor "bf16[1, 2304, 32, 64][4718592, 2048, 64, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.neg.default](args = (%getitem_6,), kwargs = {})
+#   %unsqueeze_4 : Tensor "bf16[1, 2304, 32, 64, 1][4718592, 2048, 64, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%neg, 4), kwargs = {})
+#   %unsqueeze_5 : Tensor "bf16[1, 2304, 32, 64, 1][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%getitem_5, 4), kwargs = {})
+#   %cat : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%unsqueeze_4, %unsqueeze_5], -1), kwargs = {})
+#   %view_4 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat, [1, 2304, 32, 128]), kwargs = {})
+#   %convert_element_type_5 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_4, torch.float32), kwargs = {})
+#   %unsqueeze_2 : Tensor "f32[1, 2304, 128][294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%arg4_1, 0), kwargs = {})
+#   %unsqueeze_3 : Tensor "f32[1, 2304, 1, 128][294912, 128, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze_2, 2), kwargs = {})
+#   %mul_5 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_5, %unsqueeze_3), kwargs = {})
+#   %add_2 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_4, %mul_5), kwargs = {})
+#   %convert_element_type_6 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%add_2, torch.bfloat16), kwargs = {})
+#   %convert_element_type_7 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%convert_element_type_3, torch.float32), kwargs = {})
+#   %unsqueeze_6 : Tensor "f32[1, 2304, 128][294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%arg3_1, 0), kwargs = {})
+#   %unsqueeze_7 : Tensor "f32[1, 2304, 1, 128][294912, 128, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze_6, 2), kwargs = {})
+#   %mul_6 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_7, %unsqueeze_7), kwargs = {})
+#   %neg_1 : Tensor "bf16[1, 2304, 32, 64][4718592, 2048, 64, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.neg.default](args = (%getitem_8,), kwargs = {})
+#   %unsqueeze_10 : Tensor "bf16[1, 2304, 32, 64, 1][4718592, 2048, 64, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%neg_1, 4), kwargs = {})
+#   %unsqueeze_11 : Tensor "bf16[1, 2304, 32, 64, 1][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%getitem_7, 4), kwargs = {})
+#   %cat_1 : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%unsqueeze_10, %unsqueeze_11], -1), kwargs = {})
+#   %view_6 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat_1, [1, 2304, 32, 128]), kwargs = {})
+#   %convert_element_type_8 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_6, torch.float32), kwargs = {})
+#   %unsqueeze_8 : Tensor "f32[1, 2304, 128][294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%arg4_1, 0), kwargs = {})
+#   %unsqueeze_9 : Tensor "f32[1, 2304, 1, 128][294912, 128, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze_8, 2), kwargs = {})
+#   %mul_7 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_8, %unsqueeze_9), kwargs = {})
+#   %add_3 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_6, %mul_7), kwargs = {})
+#   %convert_element_type_9 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%add_3, torch.bfloat16), kwargs = {})
+#   return %buf1,%buf0,%cat,%convert_element_type_6,%cat_1,%convert_element_type_9
+triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 = async_compile.triton('triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0', '''
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.reduction(
+    size_hints={'x': 131072, 'r0_': 128},
+    reduction_hint=ReductionHint.DEFAULT,
+    filename=__file__,
+    triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_out_ptr1': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0', 'mutated_arg_names': ['in_out_ptr0', 'in_out_ptr1'], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 16, 'num_store': 2, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 0, 'r0_': 115606016}}
+)
+@triton.jit
+def triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(in_out_ptr0, in_out_ptr1, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
+    xnumel = 73728
+    r0_numel = 128
+    rnumel = r0_numel
+    RBLOCK: tl.constexpr = R0_BLOCK
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+    xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
+    r0_base = tl.arange(0, R0_BLOCK)[None, :]
+    rbase = r0_base
+    x0 = (xindex % 32)
+    x1 = xindex // 32
+    _tmp4 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
+    x5 = xindex
+    _tmp10 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_2 = r0_index
+        tmp0 = tl.load(in_ptr0 + (4096 + r0_2 + 128*x0 + 36864*x1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp6 = tl.load(in_ptr0 + (r0_2 + 128*x0 + 36864*x1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp1 = tmp0.to(tl.float32)
+        tmp2 = tmp1 * tmp1
+        tmp3 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK])
+        tmp5 = _tmp4 + tmp3
+        _tmp4 = tl.where(r0_mask, tmp5, _tmp4)
+        tmp7 = tmp6.to(tl.float32)
+        tmp8 = tmp7 * tmp7
+        tmp9 = tl.broadcast_to(tmp8, [XBLOCK, R0_BLOCK])
+        tmp11 = _tmp10 + tmp9
+        _tmp10 = tl.where(r0_mask, tmp11, _tmp10)
+    tmp4 = tl.sum(_tmp4, 1)[:, None]
+    tmp10 = tl.sum(_tmp10, 1)[:, None]
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_3 = (r0_index % 2)
+        r0_4 = r0_index // 2
+        r0_2 = r0_index
+        tmp50 = tl.load(in_ptr0 + (r0_2 + 128*x0 + 36864*x1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp58 = tl.load(in_ptr1 + (r0_2), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp63 = tl.load(in_ptr2 + (r0_2 + 128*x1), r0_mask, eviction_policy='evict_last', other=0.0)
+        tmp66 = tl.load(in_ptr3 + (r0_2 + 128*x1), r0_mask, eviction_policy='evict_last', other=0.0)
+        tmp96 = tl.load(in_ptr0 + (4096 + r0_2 + 128*x0 + 36864*x1), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp102 = tl.load(in_ptr4 + (r0_2), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp12 = r0_3
+        tmp13 = tl.full([1, 1], 0, tl.int64)
+        tmp14 = tmp12 >= tmp13
+        tmp15 = tl.full([1, 1], 1, tl.int64)
+        tmp16 = tmp12 < tmp15
+        tmp17 = tl.load(in_ptr0 + (1 + 2*r0_4 + 128*x0 + 36864*x1), r0_mask & tmp16, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp18 = tmp17.to(tl.float32)
+        tmp19 = 128.0
+        tmp20 = (tmp10 / tmp19)
+        tmp21 = 1e-06
+        tmp22 = tmp20 + tmp21
+        tmp23 = libdevice.rsqrt(tmp22)
+        tmp24 = tmp18 * tmp23
+        tmp25 = tl.load(in_ptr1 + (tl.broadcast_to(1 + 2*r0_4, [XBLOCK, R0_BLOCK])), r0_mask & tmp16, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp26 = tmp25.to(tl.float32)
+        tmp27 = tmp24 * tmp26
+        tmp28 = tmp27.to(tl.float32)
+        tmp29 = -tmp28
+        tmp30 = tl.full(tmp29.shape, 0.0, tmp29.dtype)
+        tmp31 = tl.where(tmp16, tmp29, tmp30)
+        tmp32 = tmp12 >= tmp15
+        tmp33 = tl.full([1, 1], 2, tl.int64)
+        tmp34 = tmp12 < tmp33
+        tmp35 = tl.load(in_ptr0 + (2*r0_4 + 128*x0 + 36864*x1), r0_mask & tmp32, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp36 = tmp35.to(tl.float32)
+        tmp37 = 128.0
+        tmp38 = (tmp10 / tmp37)
+        tmp39 = 1e-06
+        tmp40 = tmp38 + tmp39
+        tmp41 = libdevice.rsqrt(tmp40)
+        tmp42 = tmp36 * tmp41
+        tmp43 = tl.load(in_ptr1 + (tl.broadcast_to(2*r0_4, [XBLOCK, R0_BLOCK])), r0_mask & tmp32, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp44 = tmp43.to(tl.float32)
+        tmp45 = tmp42 * tmp44
+        tmp46 = tmp45.to(tl.float32)
+        tmp47 = tl.full(tmp46.shape, 0.0, tmp46.dtype)
+        tmp48 = tl.where(tmp32, tmp46, tmp47)
+        tmp49 = tl.where(tmp16, tmp31, tmp48)
+        tmp51 = tmp50.to(tl.float32)
+        tmp52 = 128.0
+        tmp53 = (tmp10 / tmp52)
+        tmp54 = 1e-06
+        tmp55 = tmp53 + tmp54
+        tmp56 = libdevice.rsqrt(tmp55)
+        tmp57 = tmp51 * tmp56
+        tmp59 = tmp58.to(tl.float32)
+        tmp60 = tmp57 * tmp59
+        tmp61 = tmp60.to(tl.float32)
+        tmp62 = tmp61.to(tl.float32)
+        tmp64 = tmp62 * tmp63
+        tmp65 = tmp49.to(tl.float32)
+        tmp67 = tmp65 * tmp66
+        tmp68 = tmp64 + tmp67
+        tmp69 = tmp68.to(tl.float32)
+        tmp70 = tl.load(in_ptr0 + (4097 + 2*r0_4 + 128*x0 + 36864*x1), r0_mask & tmp16, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp71 = tmp70.to(tl.float32)
+        tmp72 = (tmp4 / tmp19)
+        tmp73 = tmp72 + tmp21
+        tmp74 = libdevice.rsqrt(tmp73)
+        tmp75 = tmp71 * tmp74
+        tmp76 = tl.load(in_ptr4 + (tl.broadcast_to(1 + 2*r0_4, [XBLOCK, R0_BLOCK])), r0_mask & tmp16, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp77 = tmp76.to(tl.float32)
+        tmp78 = tmp75 * tmp77
+        tmp79 = tmp78.to(tl.float32)
+        tmp80 = -tmp79
+        tmp81 = tl.full(tmp80.shape, 0.0, tmp80.dtype)
+        tmp82 = tl.where(tmp16, tmp80, tmp81)
+        tmp83 = tl.load(in_ptr0 + (4096 + 2*r0_4 + 128*x0 + 36864*x1), r0_mask & tmp32, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp84 = tmp83.to(tl.float32)
+        tmp85 = (tmp4 / tmp37)
+        tmp86 = tmp85 + tmp39
+        tmp87 = libdevice.rsqrt(tmp86)
+        tmp88 = tmp84 * tmp87
+        tmp89 = tl.load(in_ptr4 + (tl.broadcast_to(2*r0_4, [XBLOCK, R0_BLOCK])), r0_mask & tmp32, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp90 = tmp89.to(tl.float32)
+        tmp91 = tmp88 * tmp90
+        tmp92 = tmp91.to(tl.float32)
+        tmp93 = tl.full(tmp92.shape, 0.0, tmp92.dtype)
+        tmp94 = tl.where(tmp32, tmp92, tmp93)
+        tmp95 = tl.where(tmp16, tmp82, tmp94)
+        tmp97 = tmp96.to(tl.float32)
+        tmp98 = (tmp4 / tmp52)
+        tmp99 = tmp98 + tmp54
+        tmp100 = libdevice.rsqrt(tmp99)
+        tmp101 = tmp97 * tmp100
+        tmp103 = tmp102.to(tl.float32)
+        tmp104 = tmp101 * tmp103
+        tmp105 = tmp104.to(tl.float32)
+        tmp106 = tmp105.to(tl.float32)
+        tmp107 = tmp106 * tmp63
+        tmp108 = tmp95.to(tl.float32)
+        tmp109 = tmp108 * tmp66
+        tmp110 = tmp107 + tmp109
+        tmp111 = tmp110.to(tl.float32)
+        tl.store(in_out_ptr0 + (r0_2 + 128*x5), tmp69, r0_mask)
+        tl.store(in_out_ptr1 + (r0_2 + 128*x5), tmp111, r0_mask)
+''', device_str='cuda')  # end of the Triton source string handed to async_compile.triton for the 'cuda' device; the string is kept verbatim
+
+
+async_compile.wait(globals())  # waits on the async_compile.triton(...) submission above; presumably resolves the entry in the globals dict passed here — TODO confirm
+del async_compile  # the compile helper is not referenced again in this module
+
+class Runner:  # holds a list of partitions plus the generated entry point `call` for this compiled graph
+    def __init__(self, partitions):
+        self.partitions = partitions  # stored as given; replaced wholesale by recursively_apply_fns
+
+    def recursively_apply_fns(self, fns):  # replaces each partition c with fn(c), pairing fns and partitions by position
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):  # zip stops at the shorter sequence, so unpaired entries are dropped
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):  # args: list of exactly five inputs; returns a 4-tuple of two new buffers and two views of the first input
+        arg0_1, arg1_1, arg2_1, arg3_1, arg4_1 = args
+        args.clear()  # empties the caller's list in place, so that list no longer references the inputs
+        assert_size_stride(arg0_1, (1, 2304, 36864), (84934656, 36864, 1))  # each check pins the (size, stride) the kernel's hard-coded indexing expects
+        assert_size_stride(arg1_1, (128, ), (1, ))
+        assert_size_stride(arg2_1, (128, ), (1, ))
+        assert_size_stride(arg3_1, (2304, 128), (128, 1))
+        assert_size_stride(arg4_1, (2304, 128), (128, 1))
+        with torch.cuda._DeviceGuard(0):  # everything below runs with CUDA device 0 selected
+            torch.cuda.set_device(0)
+            buf2 = empty_strided_cuda((1, 2304, 32, 64, 2), (9437184, 4096, 128, 2, 1), torch.bfloat16)  # first output buffer, bfloat16
+            buf3 = reinterpret_tensor(buf2, (1, 2304, 32, 128), (9437184, 4096, 128, 1), 0); del buf2  # reuse: the same allocation viewed as (1, 2304, 32, 128)
+            buf4 = empty_strided_cuda((1, 2304, 32, 64, 2), (9437184, 4096, 128, 2, 1), torch.bfloat16)  # second output buffer, bfloat16
+            buf5 = reinterpret_tensor(buf4, (1, 2304, 32, 128), (9437184, 4096, 128, 1), 0); del buf4  # reuse: the same allocation viewed as (1, 2304, 32, 128)
+            # Topologically Sorted Source Nodes: [split, chunk, query_1, query_2, reshape, unbind, key_1, key_2, reshape_1, unbind_1, float_1, cos, mul, neg, stack, x_rotated, float_2, sin, mul_1, add, out, float_3, cos_2, mul_2, neg_1, stack_1, x_rotated_1, float_4, sin_2, mul_3, add_1, out_1], Original ATen: [aten.split_with_sizes, aten.split, aten.view, aten._fused_rms_norm, aten.unbind, aten._to_copy, aten.unsqueeze, aten.mul, aten.neg, aten.stack, aten.add]
+            stream0 = get_raw_stream(0)
+            triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.run(buf3, buf5, arg0_1, arg1_1, arg3_1, arg4_1, arg2_1, 73728, 128, stream=stream0)  # positional order follows the kernel signature: in_out_ptr0=buf3, in_out_ptr1=buf5, in_ptr0..in_ptr4=arg0_1, arg1_1, arg3_1, arg4_1, arg2_1; 73728 = 2304 * 32 rows of 128 elements
+            del arg1_1  # local references to the inputs that are not returned are dropped after the launch
+            del arg2_1
+            del arg3_1
+            del arg4_1
+        return (buf3, buf5, reinterpret_tensor(arg0_1, (1, 2304, 32, 128), (84934656, 36864, 128, 1), 8192), reinterpret_tensor(arg0_1, (1, 2304, 24576), (84934656, 36864, 1), 12288), )  # the last two items are views of arg0_1 at storage offsets 8192 and 12288, which is why arg0_1 is not deleted above
+
+runner = Runner(partitions=[])  # module-level Runner instance, created with an empty partition list
+call = runner.call  # module-level alias of the bound method; benchmark_compiled_module invokes it
+recursively_apply_fns = runner.recursively_apply_fns  # module-level alias of the bound method
+
+
+def benchmark_compiled_module(times=10, repeat=10):
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = rand_strided((1, 2304, 36864), (84934656, 36864, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg1_1 = rand_strided((128, ), (1, ), device='cuda:0', dtype=torch.bfloat16)
+    arg2_1 = rand_strided((128, ), (1, ), device='cuda:0', dtype=torch.bfloat16)
+    arg3_1 = rand_strided((2304, 128), (128, 1), device='cuda:0', dtype=torch.float32)
+    arg4_1 = rand_strided((2304, 128), (128, 1), device='cuda:0', dtype=torch.float32)
+    fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1])
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":  # only when this file is executed as a script
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)  # passes the literal string 'None' and the benchmark function
diff --git a/torchinductor/6k/abd9e26dfce6bf628201c09f1f90f4340fdaab3cc2dd99f7186afe82fe013d1a.best_config b/torchinductor/6k/abd9e26dfce6bf628201c09f1f90f4340fdaab3cc2dd99f7186afe82fe013d1a.best_config
new file mode 100644
index 0000000000000000000000000000000000000000..971d380017dc765c34827c697ce9956bb1e05fb0
--- /dev/null
+++ b/torchinductor/6k/abd9e26dfce6bf628201c09f1f90f4340fdaab3cc2dd99f7186afe82fe013d1a.best_config
@@ -0,0 +1 @@
+{"XBLOCK": 1024, "num_warps": 4, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 35, "triton_cache_hash": "Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q"}
\ No newline at end of file
diff --git a/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py b/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py
new file mode 100644
index 0000000000000000000000000000000000000000..04685fb8a7084ed414e55edb2df1b2916f279909
--- /dev/null
+++ b/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py
@@ -0,0 +1,30 @@
+
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 1048576}, 
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_mul_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 3, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 8396800}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_add_mul_0(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+    xnumel = 1048576
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)
+    x2 = xindex
+    x0 = (xindex % 4096)
+    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
+    tmp1 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last').to(tl.float32)
+    tmp2 = tl.load(in_ptr2 + (x2), None).to(tl.float32)
+    tmp3 = tmp1 * tmp2
+    tmp4 = tmp0 + tmp3
+    tl.store(out_ptr0 + (x2), tmp4, None)
diff --git a/torchinductor/6w/4fb0f9adeff50e9452e8fd238a1808052c095c59a0b2f1d9f3f7d7106bd1ede5.best_config b/torchinductor/6w/4fb0f9adeff50e9452e8fd238a1808052c095c59a0b2f1d9f3f7d7106bd1ede5.best_config
new file mode 100644
index 0000000000000000000000000000000000000000..b70899f8b8985c3a02e12ab5d4b5a15f2ca15b04
--- /dev/null
+++ b/torchinductor/6w/4fb0f9adeff50e9452e8fd238a1808052c095c59a0b2f1d9f3f7d7106bd1ede5.best_config
@@ -0,0 +1 @@
+{"XBLOCK": 1024, "num_warps": 4, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 68, "triton_cache_hash": "6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA"}
\ No newline at end of file
diff --git a/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py b/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py
new file mode 100644
index 0000000000000000000000000000000000000000..cad5a337f915a1c559798a385ba04a0072936100
--- /dev/null
+++ b/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py
@@ -0,0 +1,33 @@
+
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 33554432}, 
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_split_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 2, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 201326592}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_mul_silu_split_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+    xnumel = 25165824
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)
+    x0 = (xindex % 12288)
+    x1 = xindex // 12288
+    x2 = xindex
+    tmp0 = tl.load(in_ptr0 + (x0 + 24576*x1), None).to(tl.float32)
+    tmp5 = tl.load(in_ptr0 + (12288 + x0 + 24576*x1), None).to(tl.float32)
+    tmp1 = tmp0.to(tl.float32)
+    tmp2 = tl.sigmoid(tmp1)
+    tmp3 = tmp1 * tmp2
+    tmp4 = tmp3.to(tl.float32)
+    tmp6 = tmp4 * tmp5
+    tl.store(out_ptr0 + (x2), tmp6, None)
diff --git a/torchinductor/7f/be95397d0c18f43f4314e0cac66d456d9d3e2b12116963a4bf988016e97f7a5e.best_config b/torchinductor/7f/be95397d0c18f43f4314e0cac66d456d9d3e2b12116963a4bf988016e97f7a5e.best_config
new file mode 100644
index 0000000000000000000000000000000000000000..d376bfab6dd203c3be2ed98dc83eac3fdac99cd4
--- /dev/null
+++ b/torchinductor/7f/be95397d0c18f43f4314e0cac66d456d9d3e2b12116963a4bf988016e97f7a5e.best_config
@@ -0,0 +1 @@
+{"XBLOCK": 1024, "num_warps": 4, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 45, "triton_cache_hash": "EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ"}
\ No newline at end of file
diff --git a/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py b/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py
new file mode 100644
index 0000000000000000000000000000000000000000..a55f1d57e8e29166b759b911bd95c270d13cace7
--- /dev/null
+++ b/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py
@@ -0,0 +1,30 @@
+
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 8388608}, 
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_mul_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 3, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 67117056}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_add_mul_1(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+    xnumel = 8388608
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)
+    x2 = xindex
+    x0 = (xindex % 4096)
+    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
+    tmp1 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last').to(tl.float32)
+    tmp2 = tl.load(in_ptr2 + (x2), None).to(tl.float32)
+    tmp3 = tmp1 * tmp2
+    tmp4 = tmp0 + tmp3
+    tl.store(out_ptr0 + (x2), tmp4, None)
diff --git a/torchinductor/a3/94dc88253134d772dc28ed260760d9a0059b054d472700be3c22dd06b228f22f.best_config b/torchinductor/a3/94dc88253134d772dc28ed260760d9a0059b054d472700be3c22dd06b228f22f.best_config
new file mode 100644
index 0000000000000000000000000000000000000000..47c3098c0f7e55d5446b3abf3c618707c676e6b9
--- /dev/null
+++ b/torchinductor/a3/94dc88253134d772dc28ed260760d9a0059b054d472700be3c22dd06b228f22f.best_config
@@ -0,0 +1 @@
+{"XBLOCK": 1, "R0_BLOCK": 2048, "num_warps": 16, "num_stages": 1, "configs_hash": "ba27f374f6982634f1ab959ad1e63f726920cfc2c7c821f8e68ec55c3d4d94fc", "found_by_coordesc": false, "time_taken_ms": 35, "triton_cache_hash": "H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ"}
\ No newline at end of file
diff --git a/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py b/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py
new file mode 100644
index 0000000000000000000000000000000000000000..f16537278f5cbb5b38ec2e7a1dfef3c34c40ef4c
--- /dev/null
+++ b/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py
@@ -0,0 +1,78 @@
+
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.reduction(
+    size_hints={'x': 256, 'r0_': 4096},
+    reduction_hint=ReductionHint.INNER,
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*bf16', 'out_ptr0': '*bf16', 'out_ptr3': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_add_mul_native_layer_norm_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 6, 'num_store': 2, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'add_persistent_rblock': True, 'tiling_scores': {'x': 0, 'r0_': 12607488}}
+)
+@triton.jit
+def triton_red_fused_add_mul_native_layer_norm_0(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr3, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
+    xnumel = 256
+    r0_numel = 4096
+    rnumel = r0_numel
+    RBLOCK: tl.constexpr = R0_BLOCK
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+    xmask = xindex < xnumel
+    r0_base = tl.arange(0, R0_BLOCK)[None, :]
+    rbase = r0_base
+    x0 = xindex
+    tmp7_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp7_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp7_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp1 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp2 = tl.load(in_ptr2 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp3 = tmp1 * tmp2
+        tmp4 = tmp0 + tmp3
+        tmp5 = tmp4.to(tl.float32)
+        tmp6 = tl.broadcast_to(tmp5, [XBLOCK, R0_BLOCK])
+        tmp7_mean_next, tmp7_m2_next, tmp7_weight_next = triton_helpers.welford_reduce(
+            tmp6, tmp7_mean, tmp7_m2, tmp7_weight, roffset == 0
+        )
+        tmp7_mean = tl.where(r0_mask & xmask, tmp7_mean_next, tmp7_mean)
+        tmp7_m2 = tl.where(r0_mask & xmask, tmp7_m2_next, tmp7_m2)
+        tmp7_weight = tl.where(r0_mask & xmask, tmp7_weight_next, tmp7_weight)
+        tl.store(out_ptr0 + (r0_1 + 4096*x0), tmp4, r0_mask & xmask)
+    tmp8, tmp9, tmp10 = triton_helpers.welford(tmp7_mean, tmp7_m2, tmp7_weight, 1)
+    tmp7 = tmp8[:, None]
+    tmp11 = tmp9[:, None]
+    tmp12 = tmp10[:, None]
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp13 = tl.load(out_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp23 = tl.load(in_ptr3 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp27 = tl.load(in_ptr4 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp14 = tmp13.to(tl.float32)
+        tmp15 = tmp14 - tmp7
+        tmp16 = 4096.0
+        tmp17 = (tmp11 / tmp16)
+        tmp18 = 1e-06
+        tmp19 = tmp17 + tmp18
+        tmp20 = libdevice.rsqrt(tmp19)
+        tmp21 = tmp15 * tmp20
+        tmp22 = tmp21.to(tl.float32)
+        tmp24 = 1.0
+        tmp25 = tmp23 + tmp24
+        tmp26 = tmp22 * tmp25
+        tmp28 = tmp26 + tmp27
+        tl.store(out_ptr3 + (r0_1 + 4096*x0), tmp28, r0_mask & xmask)
diff --git a/torchinductor/aotautograd/a27rkqg32yfaub3aygtms2gl3oet2qxfcnp4zxa3zy5h6c3risxz/aw5eda3h36wpnnltujgkb4mvobznersd4fuvo2p7vy2quujasos b/torchinductor/aotautograd/a27rkqg32yfaub3aygtms2gl3oet2qxfcnp4zxa3zy5h6c3risxz/aw5eda3h36wpnnltujgkb4mvobznersd4fuvo2p7vy2quujasos
new file mode 100644
index 0000000000000000000000000000000000000000..a75c75963a7dd128def3a7cc44c38642f52132b6
Binary files /dev/null and b/torchinductor/aotautograd/a27rkqg32yfaub3aygtms2gl3oet2qxfcnp4zxa3zy5h6c3risxz/aw5eda3h36wpnnltujgkb4mvobznersd4fuvo2p7vy2quujasos differ
diff --git a/torchinductor/aotautograd/a3443o3ywoehrda4trn5q47mauudwcinftvd52hitdnfmakyhqc4/lw6yvpbd45y77sg6fh5v4otbinchkwbf7b56u3rh3wgq3x2wkhq b/torchinductor/aotautograd/a3443o3ywoehrda4trn5q47mauudwcinftvd52hitdnfmakyhqc4/lw6yvpbd45y77sg6fh5v4otbinchkwbf7b56u3rh3wgq3x2wkhq
new file mode 100644
index 0000000000000000000000000000000000000000..85f549011b728e4dd378ceb11df78ff9df78b4f2
Binary files /dev/null and b/torchinductor/aotautograd/a3443o3ywoehrda4trn5q47mauudwcinftvd52hitdnfmakyhqc4/lw6yvpbd45y77sg6fh5v4otbinchkwbf7b56u3rh3wgq3x2wkhq differ
diff --git a/torchinductor/aotautograd/a3554ihbxq57jan4ib74iqo5mnaqevqume4yzewukzkm6ehpsilz/eubahghkef62rmchvnle5v6h3ddip4av5qqjxomdlm7ura45qve b/torchinductor/aotautograd/a3554ihbxq57jan4ib74iqo5mnaqevqume4yzewukzkm6ehpsilz/eubahghkef62rmchvnle5v6h3ddip4av5qqjxomdlm7ura45qve
new file mode 100644
index 0000000000000000000000000000000000000000..b344ce096a4759a5fc1c7c9751de14002b25b079
Binary files /dev/null and b/torchinductor/aotautograd/a3554ihbxq57jan4ib74iqo5mnaqevqume4yzewukzkm6ehpsilz/eubahghkef62rmchvnle5v6h3ddip4av5qqjxomdlm7ura45qve differ
diff --git a/torchinductor/aotautograd/a3hojixb5fzn7f7jfco3ddoohdsuggk4qbop3lcg7rjy3e7fkgfz/o7wvolbgborwtoofbovayor23y4ubooymfcvv6jeqm2wbx3n2cs b/torchinductor/aotautograd/a3hojixb5fzn7f7jfco3ddoohdsuggk4qbop3lcg7rjy3e7fkgfz/o7wvolbgborwtoofbovayor23y4ubooymfcvv6jeqm2wbx3n2cs
new file mode 100644
index 0000000000000000000000000000000000000000..c25ad76a5497cbb250d6f704688a1c333ae61aaf
Binary files /dev/null and b/torchinductor/aotautograd/a3hojixb5fzn7f7jfco3ddoohdsuggk4qbop3lcg7rjy3e7fkgfz/o7wvolbgborwtoofbovayor23y4ubooymfcvv6jeqm2wbx3n2cs differ
diff --git a/torchinductor/aotautograd/a54twb2qknddjxnxtmkoagy3umo5y3ptsesm2pdhy7nkefklf6wx/emxzj524wmpvifsxw4dsnnkzemqpzfgkenbo5obwmksvlhsr354 b/torchinductor/aotautograd/a54twb2qknddjxnxtmkoagy3umo5y3ptsesm2pdhy7nkefklf6wx/emxzj524wmpvifsxw4dsnnkzemqpzfgkenbo5obwmksvlhsr354
new file mode 100644
index 0000000000000000000000000000000000000000..22b825fa8a9a866f1ef9dd4c024a3c0d4c2f99a3
Binary files /dev/null and b/torchinductor/aotautograd/a54twb2qknddjxnxtmkoagy3umo5y3ptsesm2pdhy7nkefklf6wx/emxzj524wmpvifsxw4dsnnkzemqpzfgkenbo5obwmksvlhsr354 differ
diff --git a/torchinductor/aotautograd/a5ksywxhfabbequvxwstheyyj5w3sinuubxcrypqjwbqsyw5la3l/ew4fxjyfoflznyuws2w2ylu4p7owpjuqoshsef75w43w2vvwejd b/torchinductor/aotautograd/a5ksywxhfabbequvxwstheyyj5w3sinuubxcrypqjwbqsyw5la3l/ew4fxjyfoflznyuws2w2ylu4p7owpjuqoshsef75w43w2vvwejd
new file mode 100644
index 0000000000000000000000000000000000000000..0cbab3e3bda3aaf0f712cadbf3bee91f8624c693
Binary files /dev/null and b/torchinductor/aotautograd/a5ksywxhfabbequvxwstheyyj5w3sinuubxcrypqjwbqsyw5la3l/ew4fxjyfoflznyuws2w2ylu4p7owpjuqoshsef75w43w2vvwejd differ
diff --git a/torchinductor/aotautograd/a7ptufzlocphh5n5o5u63gfzkf74tjb3l5is45u5hqjspv32qda6/an4kgppgf4vt5yfvvghrnmho6jc3qnj4l6c75zrsiotr5d4u5gv b/torchinductor/aotautograd/a7ptufzlocphh5n5o5u63gfzkf74tjb3l5is45u5hqjspv32qda6/an4kgppgf4vt5yfvvghrnmho6jc3qnj4l6c75zrsiotr5d4u5gv
new file mode 100644
index 0000000000000000000000000000000000000000..8cb686adafd1e0881524b4544c1117acc700794d
Binary files /dev/null and b/torchinductor/aotautograd/a7ptufzlocphh5n5o5u63gfzkf74tjb3l5is45u5hqjspv32qda6/an4kgppgf4vt5yfvvghrnmho6jc3qnj4l6c75zrsiotr5d4u5gv differ
diff --git a/torchinductor/aotautograd/aal6kceyfi7eazavxzpgcec5hzt32bkwo7p4doeyc56ubzlwuvx4/nkoni3ckgbheucucq64bmrta4lhz7x237lalaqcrejvdc3supg4 b/torchinductor/aotautograd/aal6kceyfi7eazavxzpgcec5hzt32bkwo7p4doeyc56ubzlwuvx4/nkoni3ckgbheucucq64bmrta4lhz7x237lalaqcrejvdc3supg4
new file mode 100644
index 0000000000000000000000000000000000000000..0a851d7123d5e434f42c8fb9c164ee8c4dcb33ac
Binary files /dev/null and b/torchinductor/aotautograd/aal6kceyfi7eazavxzpgcec5hzt32bkwo7p4doeyc56ubzlwuvx4/nkoni3ckgbheucucq64bmrta4lhz7x237lalaqcrejvdc3supg4 differ
diff --git a/torchinductor/aotautograd/aan5kpy6i54rnpeu5vlzbx6i6blimsvhducl7futzdjr4xciy472/a35s4usnkzmh6ybhedo3b6zehfepmwdv2gxscayjeeuucr3zat7 b/torchinductor/aotautograd/aan5kpy6i54rnpeu5vlzbx6i6blimsvhducl7futzdjr4xciy472/a35s4usnkzmh6ybhedo3b6zehfepmwdv2gxscayjeeuucr3zat7
new file mode 100644
index 0000000000000000000000000000000000000000..a875cbae71a6435797a6678651cad0a75d4a4615
Binary files /dev/null and b/torchinductor/aotautograd/aan5kpy6i54rnpeu5vlzbx6i6blimsvhducl7futzdjr4xciy472/a35s4usnkzmh6ybhedo3b6zehfepmwdv2gxscayjeeuucr3zat7 differ
diff --git a/torchinductor/aotautograd/aesonb7djseswkbtu2qzhvg6ikd5rewxnqlt6pwuytadpxxmjcod/lap2sypphhofd6d5rhojruk2vfyvw2olc7gtulmom4i5y7ix2cp b/torchinductor/aotautograd/aesonb7djseswkbtu2qzhvg6ikd5rewxnqlt6pwuytadpxxmjcod/lap2sypphhofd6d5rhojruk2vfyvw2olc7gtulmom4i5y7ix2cp
new file mode 100644
index 0000000000000000000000000000000000000000..c8d86999d1eed7e32dbdbe87172016ef36de3d11
Binary files /dev/null and b/torchinductor/aotautograd/aesonb7djseswkbtu2qzhvg6ikd5rewxnqlt6pwuytadpxxmjcod/lap2sypphhofd6d5rhojruk2vfyvw2olc7gtulmom4i5y7ix2cp differ
diff --git a/torchinductor/aotautograd/age65c4dyk2rxcqufpxd6bsafzao7tacrsvejbf3pjbsngnoashv/upzttal3jaj233iyzyps7mjpq75jt6qi6rzramvgyyewfg76h6s b/torchinductor/aotautograd/age65c4dyk2rxcqufpxd6bsafzao7tacrsvejbf3pjbsngnoashv/upzttal3jaj233iyzyps7mjpq75jt6qi6rzramvgyyewfg76h6s
new file mode 100644
index 0000000000000000000000000000000000000000..c4d65e6647fc42a8a82752e219d65675a3bc294c
Binary files /dev/null and b/torchinductor/aotautograd/age65c4dyk2rxcqufpxd6bsafzao7tacrsvejbf3pjbsngnoashv/upzttal3jaj233iyzyps7mjpq75jt6qi6rzramvgyyewfg76h6s differ
diff --git a/torchinductor/aotautograd/ahkpwjcp2qqyj6wu2ckjqlrit2pbb3ig3ddi75hgbkgngvvipwyq/ha76p7wv3nimmrgvx6kdiqikd6adbw7nlnaiars5ey4anx46mwn b/torchinductor/aotautograd/ahkpwjcp2qqyj6wu2ckjqlrit2pbb3ig3ddi75hgbkgngvvipwyq/ha76p7wv3nimmrgvx6kdiqikd6adbw7nlnaiars5ey4anx46mwn
new file mode 100644
index 0000000000000000000000000000000000000000..920ab76a22529e945f9034f1743b80ab496da06f
Binary files /dev/null and b/torchinductor/aotautograd/ahkpwjcp2qqyj6wu2ckjqlrit2pbb3ig3ddi75hgbkgngvvipwyq/ha76p7wv3nimmrgvx6kdiqikd6adbw7nlnaiars5ey4anx46mwn differ
diff --git a/torchinductor/aotautograd/aiojzczi5txclvaydkrk5g3qlf33pdkkhxtefkhfphkpc3o6rr4p/w3n37k3qhqfhuewneurnairyblp3h7nrak6oyp2p3um7uwnfcz5 b/torchinductor/aotautograd/aiojzczi5txclvaydkrk5g3qlf33pdkkhxtefkhfphkpc3o6rr4p/w3n37k3qhqfhuewneurnairyblp3h7nrak6oyp2p3um7uwnfcz5
new file mode 100644
index 0000000000000000000000000000000000000000..31443751b6c080f9631942333d1ddbdce5c45484
Binary files /dev/null and b/torchinductor/aotautograd/aiojzczi5txclvaydkrk5g3qlf33pdkkhxtefkhfphkpc3o6rr4p/w3n37k3qhqfhuewneurnairyblp3h7nrak6oyp2p3um7uwnfcz5 differ
diff --git a/torchinductor/aotautograd/ajdkg3gacw25klanvqotc3mkab3mi23jtjpagxrosdmqv3d4yg7v/ejzrqbsrchqzxfppkzo4ep7edhv7lrjjbcdxkxvodbk4vvk3b62 b/torchinductor/aotautograd/ajdkg3gacw25klanvqotc3mkab3mi23jtjpagxrosdmqv3d4yg7v/ejzrqbsrchqzxfppkzo4ep7edhv7lrjjbcdxkxvodbk4vvk3b62
new file mode 100644
index 0000000000000000000000000000000000000000..887f90362faa59ade6fd988b632f6d95c7b035e9
Binary files /dev/null and b/torchinductor/aotautograd/ajdkg3gacw25klanvqotc3mkab3mi23jtjpagxrosdmqv3d4yg7v/ejzrqbsrchqzxfppkzo4ep7edhv7lrjjbcdxkxvodbk4vvk3b62 differ
diff --git a/torchinductor/aotautograd/amb262dx57ptj6gg2ch6skr372w6arsr3i7i4ed5pljhiycuxduw/fntav2w4z5lvr443jxseqalau2vuzp7x7ljd3hanoqubtutjkvp b/torchinductor/aotautograd/amb262dx57ptj6gg2ch6skr372w6arsr3i7i4ed5pljhiycuxduw/fntav2w4z5lvr443jxseqalau2vuzp7x7ljd3hanoqubtutjkvp
new file mode 100644
index 0000000000000000000000000000000000000000..f4347151aed10d5cb40f2b46b2695063148b8e89
Binary files /dev/null and b/torchinductor/aotautograd/amb262dx57ptj6gg2ch6skr372w6arsr3i7i4ed5pljhiycuxduw/fntav2w4z5lvr443jxseqalau2vuzp7x7ljd3hanoqubtutjkvp differ
diff --git a/torchinductor/aotautograd/amjjivi2p6firai3idkjgfxyy6z4prevujsjdno2uuchwvd7xqll/enc6ruqcyggs4mnt54tjdd2lvexcvipd5vhhamxwcj77g5fpyof b/torchinductor/aotautograd/amjjivi2p6firai3idkjgfxyy6z4prevujsjdno2uuchwvd7xqll/enc6ruqcyggs4mnt54tjdd2lvexcvipd5vhhamxwcj77g5fpyof
new file mode 100644
index 0000000000000000000000000000000000000000..cc09fbd335287f63644161ab26d5d9b11c1aaf44
Binary files /dev/null and b/torchinductor/aotautograd/amjjivi2p6firai3idkjgfxyy6z4prevujsjdno2uuchwvd7xqll/enc6ruqcyggs4mnt54tjdd2lvexcvipd5vhhamxwcj77g5fpyof differ
diff --git a/torchinductor/aotautograd/apfaqlwe555qd2zoz575w5mvoxoiasmcomkv76mhz5zvnm5jok66/epmli5r46rzrqf73pqrnb5tratdg3mbbwdf5vyzqr6ejyhnooye b/torchinductor/aotautograd/apfaqlwe555qd2zoz575w5mvoxoiasmcomkv76mhz5zvnm5jok66/epmli5r46rzrqf73pqrnb5tratdg3mbbwdf5vyzqr6ejyhnooye
new file mode 100644
index 0000000000000000000000000000000000000000..d17c574375648b55e711b6e3f3c095ccebe5e310
Binary files /dev/null and b/torchinductor/aotautograd/apfaqlwe555qd2zoz575w5mvoxoiasmcomkv76mhz5zvnm5jok66/epmli5r46rzrqf73pqrnb5tratdg3mbbwdf5vyzqr6ejyhnooye differ
diff --git a/torchinductor/aotautograd/asjbg7f735jw54kcldmvv5uost22wzpy3hkxgaihos4rllvagheu/lwqpsnp52rszp2nlwkgi33embno5st2u5bxfm4rpyoy6fql5aor b/torchinductor/aotautograd/asjbg7f735jw54kcldmvv5uost22wzpy3hkxgaihos4rllvagheu/lwqpsnp52rszp2nlwkgi33embno5st2u5bxfm4rpyoy6fql5aor
new file mode 100644
index 0000000000000000000000000000000000000000..b0703d77e52c8559c1d465e355c9224829b81166
Binary files /dev/null and b/torchinductor/aotautograd/asjbg7f735jw54kcldmvv5uost22wzpy3hkxgaihos4rllvagheu/lwqpsnp52rszp2nlwkgi33embno5st2u5bxfm4rpyoy6fql5aor differ
diff --git a/torchinductor/aotautograd/atc2ggqhejcse5aydwh2wjakijsc2dyhqjxwdqrwpra3mgjwe4st/xwy7lzraqocjillvk4s2yc2qhpkx43s2nbkxmeb2wpph3sgyc7n b/torchinductor/aotautograd/atc2ggqhejcse5aydwh2wjakijsc2dyhqjxwdqrwpra3mgjwe4st/xwy7lzraqocjillvk4s2yc2qhpkx43s2nbkxmeb2wpph3sgyc7n
new file mode 100644
index 0000000000000000000000000000000000000000..a8c5090c898a09145b0b84a61f8ba426a01716fe
Binary files /dev/null and b/torchinductor/aotautograd/atc2ggqhejcse5aydwh2wjakijsc2dyhqjxwdqrwpra3mgjwe4st/xwy7lzraqocjillvk4s2yc2qhpkx43s2nbkxmeb2wpph3sgyc7n differ
diff --git a/torchinductor/aotautograd/atsevoi6zqdcnehuxassvjosi3j5vrk54uisibylfgspeewp6vyx/4sfzv7d6ch2yoi6nnr5ym3i6yibku3vfveyrr6sx6dqbmavxo32 b/torchinductor/aotautograd/atsevoi6zqdcnehuxassvjosi3j5vrk54uisibylfgspeewp6vyx/4sfzv7d6ch2yoi6nnr5ym3i6yibku3vfveyrr6sx6dqbmavxo32
new file mode 100644
index 0000000000000000000000000000000000000000..e6289bfe09c5a26fff81a4e57e466a9bd84c53fb
Binary files /dev/null and b/torchinductor/aotautograd/atsevoi6zqdcnehuxassvjosi3j5vrk54uisibylfgspeewp6vyx/4sfzv7d6ch2yoi6nnr5ym3i6yibku3vfveyrr6sx6dqbmavxo32 differ
diff --git a/torchinductor/aotautograd/ax7bbwqbruobasu7vagn2oj2owh5vgosxbjelta324rvf4tkesd4/ipnutob47ydixp2zetluyw4apg7fe5sfkkiianwaawh6yq3uang b/torchinductor/aotautograd/ax7bbwqbruobasu7vagn2oj2owh5vgosxbjelta324rvf4tkesd4/ipnutob47ydixp2zetluyw4apg7fe5sfkkiianwaawh6yq3uang
new file mode 100644
index 0000000000000000000000000000000000000000..0b86a5c999c016f475cf83332f02f65c0f0ea13b
Binary files /dev/null and b/torchinductor/aotautograd/ax7bbwqbruobasu7vagn2oj2owh5vgosxbjelta324rvf4tkesd4/ipnutob47ydixp2zetluyw4apg7fe5sfkkiianwaawh6yq3uang differ
diff --git a/torchinductor/aotautograd/ay26zyuzpll2prvy7zzoeydo7r47lrr6s6jcmzi2zmytjxzebmnz/nzx7lukg3r25p6sjlwtqmkf6gmgzuq7iwagwki2x4kvhw5ducr5 b/torchinductor/aotautograd/ay26zyuzpll2prvy7zzoeydo7r47lrr6s6jcmzi2zmytjxzebmnz/nzx7lukg3r25p6sjlwtqmkf6gmgzuq7iwagwki2x4kvhw5ducr5
new file mode 100644
index 0000000000000000000000000000000000000000..cba52096030d21d51f66a850a98a6e47f8d5a7b4
Binary files /dev/null and b/torchinductor/aotautograd/ay26zyuzpll2prvy7zzoeydo7r47lrr6s6jcmzi2zmytjxzebmnz/nzx7lukg3r25p6sjlwtqmkf6gmgzuq7iwagwki2x4kvhw5ducr5 differ
diff --git a/torchinductor/aotautograd/ay65riayezoo7bqggl72pzrzdi6lvy5mp23ajx4f453ylzpmve3s/p7clvcke3bsgsaumutstrxc7bkq4tq6yoia7nwigana3n3unini b/torchinductor/aotautograd/ay65riayezoo7bqggl72pzrzdi6lvy5mp23ajx4f453ylzpmve3s/p7clvcke3bsgsaumutstrxc7bkq4tq6yoia7nwigana3n3unini
new file mode 100644
index 0000000000000000000000000000000000000000..9f2a4422836d443257787e673b5b9181c112db72
Binary files /dev/null and b/torchinductor/aotautograd/ay65riayezoo7bqggl72pzrzdi6lvy5mp23ajx4f453ylzpmve3s/p7clvcke3bsgsaumutstrxc7bkq4tq6yoia7nwigana3n3unini differ
diff --git a/torchinductor/aotautograd/azyih32olvhzuay5zpfypzhk2cdlosvaqxdhcnjzlwfs6k3a2ne6/5sz2kjdze7ixdny7hz24p4uma7uup7chdcpiumqznifqn4mpmqb b/torchinductor/aotautograd/azyih32olvhzuay5zpfypzhk2cdlosvaqxdhcnjzlwfs6k3a2ne6/5sz2kjdze7ixdny7hz24p4uma7uup7chdcpiumqznifqn4mpmqb
new file mode 100644
index 0000000000000000000000000000000000000000..87f1a9ed39ece421a9c1c73639f23090ac3c8d71
Binary files /dev/null and b/torchinductor/aotautograd/azyih32olvhzuay5zpfypzhk2cdlosvaqxdhcnjzlwfs6k3a2ne6/5sz2kjdze7ixdny7hz24p4uma7uup7chdcpiumqznifqn4mpmqb differ
diff --git a/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py b/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py
new file mode 100644
index 0000000000000000000000000000000000000000..87fa8f5c1041dafcba46e5cf4f29b4c2186765e9
--- /dev/null
+++ b/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py
@@ -0,0 +1,73 @@
+
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.reduction(
+    size_hints={'x': 256, 'r0_': 4096},
+    reduction_hint=ReductionHint.INNER,
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr2': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_add_mul_native_layer_norm_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 4, 'num_store': 1, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'add_persistent_rblock': True, 'tiling_scores': {'x': 0, 'r0_': 6307840}}
+)
+@triton.jit
+def triton_red_fused_add_mul_native_layer_norm_1(in_ptr0, in_ptr1, in_ptr2, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
+    xnumel = 256
+    r0_numel = 4096
+    rnumel = r0_numel
+    RBLOCK: tl.constexpr = R0_BLOCK
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+    xmask = xindex < xnumel
+    r0_base = tl.arange(0, R0_BLOCK)[None, :]
+    rbase = r0_base
+    x0 = xindex
+    tmp3_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp3_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp3_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp1 = tmp0.to(tl.float32)
+        tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK])
+        tmp3_mean_next, tmp3_m2_next, tmp3_weight_next = triton_helpers.welford_reduce(
+            tmp2, tmp3_mean, tmp3_m2, tmp3_weight, roffset == 0
+        )
+        tmp3_mean = tl.where(r0_mask & xmask, tmp3_mean_next, tmp3_mean)
+        tmp3_m2 = tl.where(r0_mask & xmask, tmp3_m2_next, tmp3_m2)
+        tmp3_weight = tl.where(r0_mask & xmask, tmp3_weight_next, tmp3_weight)
+    tmp4, tmp5, tmp6 = triton_helpers.welford(tmp3_mean, tmp3_m2, tmp3_weight, 1)
+    tmp3 = tmp4[:, None]
+    tmp7 = tmp5[:, None]
+    tmp8 = tmp6[:, None]
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp9 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp12 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp23 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp10 = 1.0
+        tmp11 = tmp9 + tmp10
+        tmp13 = tmp12.to(tl.float32)
+        tmp14 = tmp13 - tmp3
+        tmp15 = 4096.0
+        tmp16 = (tmp7 / tmp15)
+        tmp17 = 1e-06
+        tmp18 = tmp16 + tmp17
+        tmp19 = libdevice.rsqrt(tmp18)
+        tmp20 = tmp14 * tmp19
+        tmp21 = tmp20.to(tl.float32)
+        tmp22 = tmp11 * tmp21
+        tmp24 = tmp22 + tmp23
+        tl.store(out_ptr2 + (r0_1 + 4096*x0), tmp24, r0_mask & xmask)
diff --git a/torchinductor/av/d186a24d3c8af5514b42dea48fc981efd3f5afb7bba6c30406e42c75862888b1.best_config b/torchinductor/av/d186a24d3c8af5514b42dea48fc981efd3f5afb7bba6c30406e42c75862888b1.best_config
new file mode 100644
index 0000000000000000000000000000000000000000..7c8326bc2575b9a99082ac834be7ee0544765495
--- /dev/null
+++ b/torchinductor/av/d186a24d3c8af5514b42dea48fc981efd3f5afb7bba6c30406e42c75862888b1.best_config
@@ -0,0 +1 @@
+{"XBLOCK": 1, "R0_BLOCK": 4096, "num_warps": 16, "num_stages": 1, "configs_hash": "ba27f374f6982634f1ab959ad1e63f726920cfc2c7c821f8e68ec55c3d4d94fc", "found_by_coordesc": false, "time_taken_ms": 33, "triton_cache_hash": "CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA"}
\ No newline at end of file
diff --git a/torchinductor/ay/cayicsdjyjxzpcmkvjbneubnqkuhs3y37qiwy5qlel3z2loa4qav.py b/torchinductor/ay/cayicsdjyjxzpcmkvjbneubnqkuhs3y37qiwy5qlel3z2loa4qav.py
new file mode 100644
index 0000000000000000000000000000000000000000..20443b971a2cf1b0b349bcc41a04f57441227ec2
--- /dev/null
+++ b/torchinductor/ay/cayicsdjyjxzpcmkvjbneubnqkuhs3y37qiwy5qlel3z2loa4qav.py
@@ -0,0 +1,69 @@
+# AOT ID: ['1_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+
+aten = torch.ops.aten
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+async_compile.wait(globals())
+del async_compile
+
+class Runner:
+    def __init__(self, partitions):
+        self.partitions = partitions
+
+    def recursively_apply_fns(self, fns):
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):
+        arg0_1, arg1_1 = args
+        args.clear()
+        assert_size_stride(arg0_1, (4096, 12288), (1, 4096))
+        assert_size_stride(arg1_1, (1, 1), (1, 1))
+        return (aten.view.dtype(reinterpret_tensor(arg0_1, (12288, 4096), (4096, 1), 0), torch.uint8), reinterpret_tensor(arg1_1, (1, ), (1, ), 0), )
+
+runner = Runner(partitions=[])
+call = runner.call
+recursively_apply_fns = runner.recursively_apply_fns
+
+
+def benchmark_compiled_module(times=10, repeat=10):
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = rand_strided((4096, 12288), (1, 4096), device='cuda:0', dtype=torch.float8_e4m3fn)
+    arg1_1 = rand_strided((1, 1), (1, 1), device='cuda:0', dtype=torch.float32)
+    fn = lambda: call([arg0_1, arg1_1])
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)
diff --git a/torchinductor/bv/7969eba2eb589b95d2894ee75ee67ba01cd2bee09cd64d315c70c0950888c19e.best_config b/torchinductor/bv/7969eba2eb589b95d2894ee75ee67ba01cd2bee09cd64d315c70c0950888c19e.best_config
new file mode 100644
index 0000000000000000000000000000000000000000..e1fbfaeab573ca9626bf597a65efd90f0ec324ab
--- /dev/null
+++ b/torchinductor/bv/7969eba2eb589b95d2894ee75ee67ba01cd2bee09cd64d315c70c0950888c19e.best_config
@@ -0,0 +1 @@
+{"XBLOCK": 2, "R0_BLOCK": 128, "num_warps": 2, "num_stages": 1, "configs_hash": "6ffa43f2ca8cb1499f3ff3fbf8c975f2c07eef9b57fcecda113029ab12cbef66", "found_by_coordesc": false, "time_taken_ms": 307, "triton_cache_hash": "AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A"}
\ No newline at end of file
diff --git a/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py b/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py
new file mode 100644
index 0000000000000000000000000000000000000000..08757a34f04fdfbafcdc149b68963d655d5da3c3
--- /dev/null
+++ b/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py
@@ -0,0 +1,162 @@
+
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.reduction(
+    size_hints={'x': 131072, 'r0_': 128},
+    reduction_hint=ReductionHint.DEFAULT,
+    filename=__file__,
+    triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_out_ptr1': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0', 'mutated_arg_names': ['in_out_ptr0', 'in_out_ptr1'], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 16, 'num_store': 2, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 0, 'r0_': 115606016}}
+)
+@triton.jit
+def triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(in_out_ptr0, in_out_ptr1, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
+    xnumel = 73728
+    r0_numel = 128
+    rnumel = r0_numel
+    RBLOCK: tl.constexpr = R0_BLOCK
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+    xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
+    r0_base = tl.arange(0, R0_BLOCK)[None, :]
+    rbase = r0_base
+    x0 = (xindex % 32)
+    x1 = xindex // 32
+    _tmp4 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
+    x5 = xindex
+    _tmp10 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_2 = r0_index
+        tmp0 = tl.load(in_ptr0 + (4096 + r0_2 + 128*x0 + 36864*x1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp6 = tl.load(in_ptr0 + (r0_2 + 128*x0 + 36864*x1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp1 = tmp0.to(tl.float32)
+        tmp2 = tmp1 * tmp1
+        tmp3 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK])
+        tmp5 = _tmp4 + tmp3
+        _tmp4 = tl.where(r0_mask, tmp5, _tmp4)
+        tmp7 = tmp6.to(tl.float32)
+        tmp8 = tmp7 * tmp7
+        tmp9 = tl.broadcast_to(tmp8, [XBLOCK, R0_BLOCK])
+        tmp11 = _tmp10 + tmp9
+        _tmp10 = tl.where(r0_mask, tmp11, _tmp10)
+    tmp4 = tl.sum(_tmp4, 1)[:, None]
+    tmp10 = tl.sum(_tmp10, 1)[:, None]
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_3 = (r0_index % 2)
+        r0_4 = r0_index // 2
+        r0_2 = r0_index
+        tmp50 = tl.load(in_ptr0 + (r0_2 + 128*x0 + 36864*x1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp58 = tl.load(in_ptr1 + (r0_2), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp63 = tl.load(in_ptr2 + (r0_2 + 128*x1), r0_mask, eviction_policy='evict_last', other=0.0)
+        tmp66 = tl.load(in_ptr3 + (r0_2 + 128*x1), r0_mask, eviction_policy='evict_last', other=0.0)
+        tmp96 = tl.load(in_ptr0 + (4096 + r0_2 + 128*x0 + 36864*x1), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp102 = tl.load(in_ptr4 + (r0_2), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp12 = r0_3
+        tmp13 = tl.full([1, 1], 0, tl.int64)
+        tmp14 = tmp12 >= tmp13
+        tmp15 = tl.full([1, 1], 1, tl.int64)
+        tmp16 = tmp12 < tmp15
+        tmp17 = tl.load(in_ptr0 + (1 + 2*r0_4 + 128*x0 + 36864*x1), r0_mask & tmp16, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp18 = tmp17.to(tl.float32)
+        tmp19 = 128.0
+        tmp20 = (tmp10 / tmp19)
+        tmp21 = 1e-06
+        tmp22 = tmp20 + tmp21
+        tmp23 = libdevice.rsqrt(tmp22)
+        tmp24 = tmp18 * tmp23
+        tmp25 = tl.load(in_ptr1 + (tl.broadcast_to(1 + 2*r0_4, [XBLOCK, R0_BLOCK])), r0_mask & tmp16, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp26 = tmp25.to(tl.float32)
+        tmp27 = tmp24 * tmp26
+        tmp28 = tmp27.to(tl.float32)
+        tmp29 = -tmp28
+        tmp30 = tl.full(tmp29.shape, 0.0, tmp29.dtype)
+        tmp31 = tl.where(tmp16, tmp29, tmp30)
+        tmp32 = tmp12 >= tmp15
+        tmp33 = tl.full([1, 1], 2, tl.int64)
+        tmp34 = tmp12 < tmp33
+        tmp35 = tl.load(in_ptr0 + (2*r0_4 + 128*x0 + 36864*x1), r0_mask & tmp32, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp36 = tmp35.to(tl.float32)
+        tmp37 = 128.0
+        tmp38 = (tmp10 / tmp37)
+        tmp39 = 1e-06
+        tmp40 = tmp38 + tmp39
+        tmp41 = libdevice.rsqrt(tmp40)
+        tmp42 = tmp36 * tmp41
+        tmp43 = tl.load(in_ptr1 + (tl.broadcast_to(2*r0_4, [XBLOCK, R0_BLOCK])), r0_mask & tmp32, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp44 = tmp43.to(tl.float32)
+        tmp45 = tmp42 * tmp44
+        tmp46 = tmp45.to(tl.float32)
+        tmp47 = tl.full(tmp46.shape, 0.0, tmp46.dtype)
+        tmp48 = tl.where(tmp32, tmp46, tmp47)
+        tmp49 = tl.where(tmp16, tmp31, tmp48)
+        tmp51 = tmp50.to(tl.float32)
+        tmp52 = 128.0
+        tmp53 = (tmp10 / tmp52)
+        tmp54 = 1e-06
+        tmp55 = tmp53 + tmp54
+        tmp56 = libdevice.rsqrt(tmp55)
+        tmp57 = tmp51 * tmp56
+        tmp59 = tmp58.to(tl.float32)
+        tmp60 = tmp57 * tmp59
+        tmp61 = tmp60.to(tl.float32)
+        tmp62 = tmp61.to(tl.float32)
+        tmp64 = tmp62 * tmp63
+        tmp65 = tmp49.to(tl.float32)
+        tmp67 = tmp65 * tmp66
+        tmp68 = tmp64 + tmp67
+        tmp69 = tmp68.to(tl.float32)
+        tmp70 = tl.load(in_ptr0 + (4097 + 2*r0_4 + 128*x0 + 36864*x1), r0_mask & tmp16, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp71 = tmp70.to(tl.float32)
+        tmp72 = (tmp4 / tmp19)
+        tmp73 = tmp72 + tmp21
+        tmp74 = libdevice.rsqrt(tmp73)
+        tmp75 = tmp71 * tmp74
+        tmp76 = tl.load(in_ptr4 + (tl.broadcast_to(1 + 2*r0_4, [XBLOCK, R0_BLOCK])), r0_mask & tmp16, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp77 = tmp76.to(tl.float32)
+        tmp78 = tmp75 * tmp77
+        tmp79 = tmp78.to(tl.float32)
+        tmp80 = -tmp79
+        tmp81 = tl.full(tmp80.shape, 0.0, tmp80.dtype)
+        tmp82 = tl.where(tmp16, tmp80, tmp81)
+        tmp83 = tl.load(in_ptr0 + (4096 + 2*r0_4 + 128*x0 + 36864*x1), r0_mask & tmp32, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp84 = tmp83.to(tl.float32)
+        tmp85 = (tmp4 / tmp37)
+        tmp86 = tmp85 + tmp39
+        tmp87 = libdevice.rsqrt(tmp86)
+        tmp88 = tmp84 * tmp87
+        tmp89 = tl.load(in_ptr4 + (tl.broadcast_to(2*r0_4, [XBLOCK, R0_BLOCK])), r0_mask & tmp32, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp90 = tmp89.to(tl.float32)
+        tmp91 = tmp88 * tmp90
+        tmp92 = tmp91.to(tl.float32)
+        tmp93 = tl.full(tmp92.shape, 0.0, tmp92.dtype)
+        tmp94 = tl.where(tmp32, tmp92, tmp93)
+        tmp95 = tl.where(tmp16, tmp82, tmp94)
+        tmp97 = tmp96.to(tl.float32)
+        tmp98 = (tmp4 / tmp52)
+        tmp99 = tmp98 + tmp54
+        tmp100 = libdevice.rsqrt(tmp99)
+        tmp101 = tmp97 * tmp100
+        tmp103 = tmp102.to(tl.float32)
+        tmp104 = tmp101 * tmp103
+        tmp105 = tmp104.to(tl.float32)
+        tmp106 = tmp105.to(tl.float32)
+        tmp107 = tmp106 * tmp63
+        tmp108 = tmp95.to(tl.float32)
+        tmp109 = tmp108 * tmp66
+        tmp110 = tmp107 + tmp109
+        tmp111 = tmp110.to(tl.float32)
+        tl.store(in_out_ptr0 + (r0_2 + 128*x5), tmp69, r0_mask)
+        tl.store(in_out_ptr1 + (r0_2 + 128*x5), tmp111, r0_mask)
diff --git a/torchinductor/cr/ccr2gijy4jp6vvdbewmzgaogxbf5as7ytxtou4zo2yelawomrjjg.py b/torchinductor/cr/ccr2gijy4jp6vvdbewmzgaogxbf5as7ytxtou4zo2yelawomrjjg.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4c5bc00599317e3aa2565f0ca8806433600015c
--- /dev/null
+++ b/torchinductor/cr/ccr2gijy4jp6vvdbewmzgaogxbf5as7ytxtou4zo2yelawomrjjg.py
@@ -0,0 +1,131 @@
+# AOT ID: ['21_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+import triton
+import triton.language as tl
+from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+
+aten = torch.ops.aten
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py
+# Topologically Sorted Source Nodes: [chunk, silu, x], Original ATen: [aten.split, aten.silu, aten.mul]
+# Source node to ATen node mapping:
+#   chunk => split
+#   silu => convert_element_type, convert_element_type_1, mul_6, sigmoid
+#   x => mul_10
+# Graph fragment:
+#   %arg1_1 : Tensor "bf16[1, s67, 24576][24576*s67, 24576, 1]cuda:0" = PlaceHolder[target=arg1_1]
+#   %split : [num_users=2] = call_function[target=torch.ops.aten.split.Tensor](args = (%arg1_1, 12288, -1), kwargs = {})
+#   %convert_element_type : Tensor "f32[1, s67, 12288][12288*s67, 12288, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%getitem, torch.float32), kwargs = {})
+#   %sigmoid : Tensor "f32[1, s67, 12288][12288*s67, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type,), kwargs = {})
+#   %mul_6 : Tensor "f32[1, s67, 12288][12288*s67, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type, %sigmoid), kwargs = {})
+#   %convert_element_type_1 : Tensor "bf16[1, s67, 12288][12288*s67, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_6, torch.bfloat16), kwargs = {})
+#   %mul_10 : Tensor "bf16[1, s67, 12288][12288*s67, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_1, %getitem_1), kwargs = {})
+#   return %mul_10
+triton_poi_fused_mul_silu_split_0 = async_compile.triton('triton_poi_fused_mul_silu_split_0', '''
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 4194304}, 
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_split_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 2, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_mul_silu_split_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)
+    x0 = (xindex % 12288)
+    x1 = xindex // 12288
+    x2 = xindex
+    tmp0 = tl.load(in_ptr0 + (x0 + 24576*x1), None).to(tl.float32)
+    tmp5 = tl.load(in_ptr0 + (12288 + x0 + 24576*x1), None).to(tl.float32)
+    tmp1 = tmp0.to(tl.float32)
+    tmp2 = tl.sigmoid(tmp1)
+    tmp3 = tmp1 * tmp2
+    tmp4 = tmp3.to(tl.float32)
+    tmp6 = tmp4 * tmp5
+    tl.store(out_ptr0 + (x2), tmp6, None)
+''', device_str='cuda')
+
+
+async_compile.wait(globals())
+del async_compile
+
+class Runner:
+    def __init__(self, partitions):
+        self.partitions = partitions
+
+    def recursively_apply_fns(self, fns):
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):
+        arg0_1, arg1_1 = args
+        args.clear()
+        s67 = arg0_1
+        assert_size_stride(arg1_1, (1, s67, 24576), (24576*s67, 24576, 1))
+        with torch.cuda._DeviceGuard(0):
+            torch.cuda.set_device(0)
+            buf0 = empty_strided_cuda((1, s67, 12288), (12288*s67, 12288, 1), torch.bfloat16)
+            # Topologically Sorted Source Nodes: [chunk, silu, x], Original ATen: [aten.split, aten.silu, aten.mul]
+            triton_poi_fused_mul_silu_split_0_xnumel = 12288*s67
+            stream0 = get_raw_stream(0)
+            triton_poi_fused_mul_silu_split_0.run(arg1_1, buf0, triton_poi_fused_mul_silu_split_0_xnumel, stream=stream0)
+            del arg1_1
+        return (buf0, )
+
+runner = Runner(partitions=[])
+call = runner.call
+recursively_apply_fns = runner.recursively_apply_fns
+
+
+def benchmark_compiled_module(times=10, repeat=10):
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = 256
+    arg1_1 = rand_strided((1, 256, 24576), (6291456, 24576, 1), device='cuda:0', dtype=torch.bfloat16)
+    fn = lambda: call([arg0_1, arg1_1])
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)
diff --git a/torchinductor/cz/bb6645c6be31f426023ec47eef09e354ad9fa8b2d59e6e45ab49b803eb34d44e.best_config b/torchinductor/cz/bb6645c6be31f426023ec47eef09e354ad9fa8b2d59e6e45ab49b803eb34d44e.best_config
new file mode 100644
index 0000000000000000000000000000000000000000..00f2a256b252b5c28ebf908e7e25905ecf7b69a4
--- /dev/null
+++ b/torchinductor/cz/bb6645c6be31f426023ec47eef09e354ad9fa8b2d59e6e45ab49b803eb34d44e.best_config
@@ -0,0 +1 @@
+{"XBLOCK": 512, "num_warps": 8, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 41, "triton_cache_hash": "SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA"}
\ No newline at end of file
diff --git a/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py b/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py
new file mode 100644
index 0000000000000000000000000000000000000000..037b7f150f116a1a303491003b70151d640b4a75
--- /dev/null
+++ b/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py
@@ -0,0 +1,25 @@
+
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 8388608}, 
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_clone_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 50331648}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_clone_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+    xnumel = 8388608
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)
+    x0 = xindex
+    tmp0 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
+    tl.store(out_ptr0 + (x0), tmp0, None)
diff --git a/torchinductor/ei/289dba45a6462a57311a4b0e777a8c6425cf2b7e76724abdb2f263dfa285ce74.best_config b/torchinductor/ei/289dba45a6462a57311a4b0e777a8c6425cf2b7e76724abdb2f263dfa285ce74.best_config
new file mode 100644
index 0000000000000000000000000000000000000000..9b2857e928afd2e15293e47e21a2ab312208306e
--- /dev/null
+++ b/torchinductor/ei/289dba45a6462a57311a4b0e777a8c6425cf2b7e76724abdb2f263dfa285ce74.best_config
@@ -0,0 +1 @@
+{"XBLOCK": 1024, "num_warps": 4, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 42, "triton_cache_hash": "7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ"}
\ No newline at end of file
diff --git a/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py b/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py
new file mode 100644
index 0000000000000000000000000000000000000000..ead91a76e47dbac191ed52e5f93858af9c6c6b90
--- /dev/null
+++ b/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py
@@ -0,0 +1,28 @@
+
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 16777216}, 
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 56623104}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):  # one load + one store per element: copies in_ptr0 to out_ptr0 with the index remapping below
+    xnumel = 9437184  # overwrites the xnumel argument with a constant (9437184 = 128 * 2304 * 32); not read again in this body
+    xoffset = tl.program_id(0) * XBLOCK  # first linear index handled by this program instance
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]  # XBLOCK consecutive linear output indices
+    xmask = tl.full([XBLOCK], True, tl.int1)  # all-True mask; unused, the load/store below pass None as the mask
+    x0 = (xindex % 128)  # innermost output coordinate, 0..127
+    x1 = ((xindex // 128) % 2304)  # middle output coordinate, 0..2303
+    x2 = xindex // 294912  # outermost output coordinate (294912 = 128 * 2304)
+    x3 = xindex  # linear output index
+    tmp0 = tl.load(in_ptr0 + (x0 + 128*x2 + 4096*x1), None).to(tl.float32)  # input element strides: x0 -> 1, x2 -> 128, x1 -> 4096; loaded value is converted to float32
+    tl.store(out_ptr0 + (x3), tmp0, None)  # written at the linear index; out_ptr0 is declared '*bf16' in triton_meta above
diff --git a/torchinductor/fxgraph/22/f22et4hxfdbezzlil53lu2pcyq6hgd3sgpjtfwxqwvo3nhcfcvx3/iqgyrwyxmlbu22glkzz24rsbjiieww3sllxux2ttk4c6gtoiuba b/torchinductor/fxgraph/22/f22et4hxfdbezzlil53lu2pcyq6hgd3sgpjtfwxqwvo3nhcfcvx3/iqgyrwyxmlbu22glkzz24rsbjiieww3sllxux2ttk4c6gtoiuba
new file mode 100644
index 0000000000000000000000000000000000000000..0e2205f612f4b99296b22b7c125d2ed420482c67
--- /dev/null
+++ b/torchinductor/fxgraph/22/f22et4hxfdbezzlil53lu2pcyq6hgd3sgpjtfwxqwvo3nhcfcvx3/iqgyrwyxmlbu22glkzz24rsbjiieww3sllxux2ttk4c6gtoiuba
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:440d88db1762c34d68cb5673ae46414a104b5b725aef4bea735705e34dc8a041
+size 938428
diff --git a/torchinductor/fxgraph/53/f533iu6ruyb2fsdtxzocuyehmvcspxovsu3acdnbfpe47o3slvj3/hnzkkjjtkqsys574iwq2z2eutuefa26qvxdntknyhr2fw5tnbux b/torchinductor/fxgraph/53/f533iu6ruyb2fsdtxzocuyehmvcspxovsu3acdnbfpe47o3slvj3/hnzkkjjtkqsys574iwq2z2eutuefa26qvxdntknyhr2fw5tnbux
new file mode 100644
index 0000000000000000000000000000000000000000..40a38fadf357d2b2a42bbe6d87004ac3c842811f
Binary files /dev/null and b/torchinductor/fxgraph/53/f533iu6ruyb2fsdtxzocuyehmvcspxovsu3acdnbfpe47o3slvj3/hnzkkjjtkqsys574iwq2z2eutuefa26qvxdntknyhr2fw5tnbux differ
diff --git a/torchinductor/fxgraph/6v/f6v2dym5xl4l4b2xlv35ic4ajld4mxcvhsvdsiwx2uug77q36cad/nhugjvtrt6cm53zizhnw673hj52m5m3kegaqvkjzkf4qh6d6rhm b/torchinductor/fxgraph/6v/f6v2dym5xl4l4b2xlv35ic4ajld4mxcvhsvdsiwx2uug77q36cad/nhugjvtrt6cm53zizhnw673hj52m5m3kegaqvkjzkf4qh6d6rhm
new file mode 100644
index 0000000000000000000000000000000000000000..d2a337ea87cd19e16d55948c01b6065cb40f47c6
--- /dev/null
+++ b/torchinductor/fxgraph/6v/f6v2dym5xl4l4b2xlv35ic4ajld4mxcvhsvdsiwx2uug77q36cad/nhugjvtrt6cm53zizhnw673hj52m5m3kegaqvkjzkf4qh6d6rhm
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:69e864d6719f84ceef28c9db6f7f674f74ceb36a21810aa939517903f87e89d9
+size 264608
diff --git a/torchinductor/fxgraph/ae/faeqzqjo2ky5lpgrcqqhwk6jpyjxv7ptgmodvyw35laruvkwlabz/k5csppa65iixuznasdpjggbp2ltfolxpa4c5sryfyz5w6cz56xl b/torchinductor/fxgraph/ae/faeqzqjo2ky5lpgrcqqhwk6jpyjxv7ptgmodvyw35laruvkwlabz/k5csppa65iixuznasdpjggbp2ltfolxpa4c5sryfyz5w6cz56xl
new file mode 100644
index 0000000000000000000000000000000000000000..df2581de9c449f550507a397a33b8e167e140dca
Binary files /dev/null and b/torchinductor/fxgraph/ae/faeqzqjo2ky5lpgrcqqhwk6jpyjxv7ptgmodvyw35laruvkwlabz/k5csppa65iixuznasdpjggbp2ltfolxpa4c5sryfyz5w6cz56xl differ
diff --git a/torchinductor/fxgraph/ah/fahbtdmoejcqs352pnbnedqns63nbnu6hdbrwzvf6chptnsannjh/fhpbiokcxxh7ksbfgiljcvh7erywotuv4ddvlfcb4fk2ef7dd5c b/torchinductor/fxgraph/ah/fahbtdmoejcqs352pnbnedqns63nbnu6hdbrwzvf6chptnsannjh/fhpbiokcxxh7ksbfgiljcvh7erywotuv4ddvlfcb4fk2ef7dd5c
new file mode 100644
index 0000000000000000000000000000000000000000..5868ebbdbea1294f85ecab57918568a617c0f63b
--- /dev/null
+++ b/torchinductor/fxgraph/ah/fahbtdmoejcqs352pnbnedqns63nbnu6hdbrwzvf6chptnsannjh/fhpbiokcxxh7ksbfgiljcvh7erywotuv4ddvlfcb4fk2ef7dd5c
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29de143942bdcff5482532169154ff2471674e95e0c7559441e155a217e31f44
+size 174116
diff --git a/torchinductor/fxgraph/be/fbeg6ilh2lejqppgwjhp3rakthwkxiwywtjmfnc4ndj7bpmq6jb2/ug37hiwkgulfctc6yjfbmae23i6gakdvpvwwtn63w7z73byeze3 b/torchinductor/fxgraph/be/fbeg6ilh2lejqppgwjhp3rakthwkxiwywtjmfnc4ndj7bpmq6jb2/ug37hiwkgulfctc6yjfbmae23i6gakdvpvwwtn63w7z73byeze3
new file mode 100644
index 0000000000000000000000000000000000000000..b1e2a881de2b55d68047104b541a9e74116619e8
Binary files /dev/null and b/torchinductor/fxgraph/be/fbeg6ilh2lejqppgwjhp3rakthwkxiwywtjmfnc4ndj7bpmq6jb2/ug37hiwkgulfctc6yjfbmae23i6gakdvpvwwtn63w7z73byeze3 differ
diff --git a/torchinductor/fxgraph/bn/fbnlruhvmagcngqd5is2xjbucjaq7uf3sgsbdahfi6ovtehbhzyo/62gizmmmqz43cclymnr7ftyo5qt7ux4og6bc2xazrwstkjpsy2e b/torchinductor/fxgraph/bn/fbnlruhvmagcngqd5is2xjbucjaq7uf3sgsbdahfi6ovtehbhzyo/62gizmmmqz43cclymnr7ftyo5qt7ux4og6bc2xazrwstkjpsy2e
new file mode 100644
index 0000000000000000000000000000000000000000..5e07ad494683e012c6f3b650f3211d839bae6bab
--- /dev/null
+++ b/torchinductor/fxgraph/bn/fbnlruhvmagcngqd5is2xjbucjaq7uf3sgsbdahfi6ovtehbhzyo/62gizmmmqz43cclymnr7ftyo5qt7ux4og6bc2xazrwstkjpsy2e
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f68c8cb18c8679b109786363f2cf0eec27fa5f8e37822d5c198da53525f2c689
+size 3410265
diff --git a/torchinductor/fxgraph/bp/fbppvpvvibf6dmfslxtlr7rupsubjfcpto27skj4uv6z3rv72yqu/2yq6yplsvvilpuklnah5vfnufjdanewtyezrjycco3vxvnedvyc b/torchinductor/fxgraph/bp/fbppvpvvibf6dmfslxtlr7rupsubjfcpto27skj4uv6z3rv72yqu/2yq6yplsvvilpuklnah5vfnufjdanewtyezrjycco3vxvnedvyc
new file mode 100644
index 0000000000000000000000000000000000000000..69838d1570a505e8ce653c7302cfb506f5e49550
Binary files /dev/null and b/torchinductor/fxgraph/bp/fbppvpvvibf6dmfslxtlr7rupsubjfcpto27skj4uv6z3rv72yqu/2yq6yplsvvilpuklnah5vfnufjdanewtyezrjycco3vxvnedvyc differ
diff --git a/torchinductor/fxgraph/bt/fbtdosmtnvserozccxcnybd6tgd6ofwg2uxsp2mledlcicmt4qzk/wmthsneohmxumg7xfzujsqbo2fome7du5pzabiw6wmu4uoutbxc b/torchinductor/fxgraph/bt/fbtdosmtnvserozccxcnybd6tgd6ofwg2uxsp2mledlcicmt4qzk/wmthsneohmxumg7xfzujsqbo2fome7du5pzabiw6wmu4uoutbxc
new file mode 100644
index 0000000000000000000000000000000000000000..632d6dfeab8f3fea047166214420d1f2cc21b144
Binary files /dev/null and b/torchinductor/fxgraph/bt/fbtdosmtnvserozccxcnybd6tgd6ofwg2uxsp2mledlcicmt4qzk/wmthsneohmxumg7xfzujsqbo2fome7du5pzabiw6wmu4uoutbxc differ
diff --git a/torchinductor/fxgraph/cz/fczfijqqjco22ccfvbme7svhnvhj4do2m5f62tmd57h26f3hol7q/ysvactlqfhnxnqe57jpw2dhsxtia64lerdmzkfbsydfsf6ixr3m b/torchinductor/fxgraph/cz/fczfijqqjco22ccfvbme7svhnvhj4do2m5f62tmd57h26f3hol7q/ysvactlqfhnxnqe57jpw2dhsxtia64lerdmzkfbsydfsf6ixr3m
new file mode 100644
index 0000000000000000000000000000000000000000..57efb12048afabcef85599b65a2b2c26fde76ada
Binary files /dev/null and b/torchinductor/fxgraph/cz/fczfijqqjco22ccfvbme7svhnvhj4do2m5f62tmd57h26f3hol7q/ysvactlqfhnxnqe57jpw2dhsxtia64lerdmzkfbsydfsf6ixr3m differ
diff --git a/torchinductor/fxgraph/dx/fdxnindsdqgodotc5a4cqrqpz34ijsi7rtays3feytoj4ko6orse/vantarodghbm737o5fovpz4x6sv5auvy47o366nk7ihggags3we b/torchinductor/fxgraph/dx/fdxnindsdqgodotc5a4cqrqpz34ijsi7rtays3feytoj4ko6orse/vantarodghbm737o5fovpz4x6sv5auvy47o366nk7ihggags3we
new file mode 100644
index 0000000000000000000000000000000000000000..1b81067c9ec7e3f0c81cdf82aecd4675c710626a
Binary files /dev/null and b/torchinductor/fxgraph/dx/fdxnindsdqgodotc5a4cqrqpz34ijsi7rtays3feytoj4ko6orse/vantarodghbm737o5fovpz4x6sv5auvy47o366nk7ihggags3we differ
diff --git a/torchinductor/fxgraph/e2/fe2tjoiexjbavh5sakfaxvga43vsvwn5ev5bzhfjg76jvmtjqtbn/ejg3u4qymaxsvvl2vdequli7pwsrdjf5zdgqjkgbrxdsvgfv3h4 b/torchinductor/fxgraph/e2/fe2tjoiexjbavh5sakfaxvga43vsvwn5ev5bzhfjg76jvmtjqtbn/ejg3u4qymaxsvvl2vdequli7pwsrdjf5zdgqjkgbrxdsvgfv3h4
new file mode 100644
index 0000000000000000000000000000000000000000..5cc300fa0cf99c368a0f9e44626e1b8da195cbcf
--- /dev/null
+++ b/torchinductor/fxgraph/e2/fe2tjoiexjbavh5sakfaxvga43vsvwn5ev5bzhfjg76jvmtjqtbn/ejg3u4qymaxsvvl2vdequli7pwsrdjf5zdgqjkgbrxdsvgfv3h4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:224dba7218602f2ad57aa8c90a2d1f7da511a4bdc8cd04a8c18dc72a98b5d9f8
+size 563115
diff --git a/torchinductor/fxgraph/fr/ffrx7clryowwzulnhruopihutvaxlycymqopsyoha6yecifyw2m2/g3wh462wylribiwz4th3gnlt5rtnrcb5bkad6w3yucxodv3q5ks b/torchinductor/fxgraph/fr/ffrx7clryowwzulnhruopihutvaxlycymqopsyoha6yecifyw2m2/g3wh462wylribiwz4th3gnlt5rtnrcb5bkad6w3yucxodv3q5ks
new file mode 100644
index 0000000000000000000000000000000000000000..b1e00fa1e977528213190d06d2d90a7e77b46d8b
--- /dev/null
+++ b/torchinductor/fxgraph/fr/ffrx7clryowwzulnhruopihutvaxlycymqopsyoha6yecifyw2m2/g3wh462wylribiwz4th3gnlt5rtnrcb5bkad6w3yucxodv3q5ks
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36ec7e7b56c2e280a2d9e4cfb33573ec66d8883d0a803f5b78a0aee1d770eaa4
+size 2958304
diff --git a/torchinductor/fxgraph/gl/fglggif4cconx3mjhni5xdqsapr365j6lcigwntolt36ggvy6xoa/lldkvrm3bfx4l4wyn6ric67cviyotwhuua4w7xtue3ki6zihdme b/torchinductor/fxgraph/gl/fglggif4cconx3mjhni5xdqsapr365j6lcigwntolt36ggvy6xoa/lldkvrm3bfx4l4wyn6ric67cviyotwhuua4w7xtue3ki6zihdme
new file mode 100644
index 0000000000000000000000000000000000000000..ebe07680a5bda8ebe71415af07da7cfe3e9e5cdf
Binary files /dev/null and b/torchinductor/fxgraph/gl/fglggif4cconx3mjhni5xdqsapr365j6lcigwntolt36ggvy6xoa/lldkvrm3bfx4l4wyn6ric67cviyotwhuua4w7xtue3ki6zihdme differ
diff --git a/torchinductor/fxgraph/gu/fguzn264ahmz2nwbeukdevusbfubyrjhr676ynmcbas4v5s55wdq/tkigttpzup32xmjthoydmqv2wmbdw54qhikri52mbatcy6pdgfn b/torchinductor/fxgraph/gu/fguzn264ahmz2nwbeukdevusbfubyrjhr676ynmcbas4v5s55wdq/tkigttpzup32xmjthoydmqv2wmbdw54qhikri52mbatcy6pdgfn
new file mode 100644
index 0000000000000000000000000000000000000000..070460ccc17ab89c37126471d606733b7b721b20
Binary files /dev/null and b/torchinductor/fxgraph/gu/fguzn264ahmz2nwbeukdevusbfubyrjhr676ynmcbas4v5s55wdq/tkigttpzup32xmjthoydmqv2wmbdw54qhikri52mbatcy6pdgfn differ
diff --git a/torchinductor/fxgraph/gw/fgwoasterkg46eom5rlkmnxomajflnxlkp6lj2kuc6echyghg5bu/rlivt46ououdjetwpsd2yu3fw57nwakjhkk5y5jpf23qnhfyqhf b/torchinductor/fxgraph/gw/fgwoasterkg46eom5rlkmnxomajflnxlkp6lj2kuc6echyghg5bu/rlivt46ououdjetwpsd2yu3fw57nwakjhkk5y5jpf23qnhfyqhf
new file mode 100644
index 0000000000000000000000000000000000000000..5b499e4cb925ea6a0ce22b810c01095d249a3c78
Binary files /dev/null and b/torchinductor/fxgraph/gw/fgwoasterkg46eom5rlkmnxomajflnxlkp6lj2kuc6echyghg5bu/rlivt46ououdjetwpsd2yu3fw57nwakjhkk5y5jpf23qnhfyqhf differ
diff --git a/torchinductor/fxgraph/gz/fgzy6konies5s7bd2elbil2jcj7zkgvoixpugnguncnrarcnahdq/kwskkw5g6u7ncnmnxj2mbgxghzm3bgayjlxguhamtpzc6earoeu b/torchinductor/fxgraph/gz/fgzy6konies5s7bd2elbil2jcj7zkgvoixpugnguncnrarcnahdq/kwskkw5g6u7ncnmnxj2mbgxghzm3bgayjlxguhamtpzc6earoeu
new file mode 100644
index 0000000000000000000000000000000000000000..c74836742af2cbb7a6352125234617132230ef83
Binary files /dev/null and b/torchinductor/fxgraph/gz/fgzy6konies5s7bd2elbil2jcj7zkgvoixpugnguncnrarcnahdq/kwskkw5g6u7ncnmnxj2mbgxghzm3bgayjlxguhamtpzc6earoeu differ
diff --git a/torchinductor/fxgraph/ji/fjiy5thw24pernaeujhzsyccltrmsf63h2zi7ew3s3fjuoxo4q3d/xrye5rdod7ebchmrpuw4t3cg5he7cgyiz3u3pztqzvylr4vlhl3 b/torchinductor/fxgraph/ji/fjiy5thw24pernaeujhzsyccltrmsf63h2zi7ew3s3fjuoxo4q3d/xrye5rdod7ebchmrpuw4t3cg5he7cgyiz3u3pztqzvylr4vlhl3
new file mode 100644
index 0000000000000000000000000000000000000000..5b8c0bced4d61ed7b530ee1d814775bc4b7ffc48
Binary files /dev/null and b/torchinductor/fxgraph/ji/fjiy5thw24pernaeujhzsyccltrmsf63h2zi7ew3s3fjuoxo4q3d/xrye5rdod7ebchmrpuw4t3cg5he7cgyiz3u3pztqzvylr4vlhl3 differ
diff --git a/torchinductor/fxgraph/k6/fk6cfyjfeiu7xe6ebkapsnixuplqczgfc5534mitqsfkssbzjyak/4xid4w6sg2yg7xaseouf2vwhp2fyff56a2t6z6ownb3yw3g25rk b/torchinductor/fxgraph/k6/fk6cfyjfeiu7xe6ebkapsnixuplqczgfc5534mitqsfkssbzjyak/4xid4w6sg2yg7xaseouf2vwhp2fyff56a2t6z6ownb3yw3g25rk
new file mode 100644
index 0000000000000000000000000000000000000000..466573cabf40c8fd52d28cceeb01ac364bc6d9ab
--- /dev/null
+++ b/torchinductor/fxgraph/k6/fk6cfyjfeiu7xe6ebkapsnixuplqczgfc5534mitqsfkssbzjyak/4xid4w6sg2yg7xaseouf2vwhp2fyff56a2t6z6ownb3yw3g25rk
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e5d03e5bd236b06fdc1223a85d56c77e8b8297be06a7ecf9d668778b6cdaec55
+size 333975
diff --git a/torchinductor/fxgraph/kj/fkjh2kykxecmnv6oe3zzwtjpek77nmrm35vgv2daxfgkim6xfk4u/gigbnwpmixz5epksvvrh4mtg3nxlpy724eojb3ajzvtepgvx7y4 b/torchinductor/fxgraph/kj/fkjh2kykxecmnv6oe3zzwtjpek77nmrm35vgv2daxfgkim6xfk4u/gigbnwpmixz5epksvvrh4mtg3nxlpy724eojb3ajzvtepgvx7y4
new file mode 100644
index 0000000000000000000000000000000000000000..8a4cbdd00bc66ae731c68bfe418b5637f4f60fd1
--- /dev/null
+++ b/torchinductor/fxgraph/kj/fkjh2kykxecmnv6oe3zzwtjpek77nmrm35vgv2daxfgkim6xfk4u/gigbnwpmixz5epksvvrh4mtg3nxlpy724eojb3ajzvtepgvx7y4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:320c16d9ec45f3d23d52ad627e3266db6eb7e3fae11c90ec09cd66479ab7fe38
+size 205255
diff --git a/torchinductor/fxgraph/n6/fn6x7m44e35jdmh6iqj3eqiyrz7tbhzd3rqartt67myyrnickjmp/um3sgirsxogup4murdiaoy7dxu4ogolqsa343kh56kq24zd53fb b/torchinductor/fxgraph/n6/fn6x7m44e35jdmh6iqj3eqiyrz7tbhzd3rqartt67myyrnickjmp/um3sgirsxogup4murdiaoy7dxu4ogolqsa343kh56kq24zd53fb
new file mode 100644
index 0000000000000000000000000000000000000000..132619ad9b2fd103025e96232885c65b686fa2d5
--- /dev/null
+++ b/torchinductor/fxgraph/n6/fn6x7m44e35jdmh6iqj3eqiyrz7tbhzd3rqartt67myyrnickjmp/um3sgirsxogup4murdiaoy7dxu4ogolqsa343kh56kq24zd53fb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a337232232bb8d47f19488d00763e3bd38e339709037cda8fdf2a1ae647dd943
+size 680412
diff --git a/torchinductor/fxgraph/te/fte5y7bccssideiluepvpscj6srf7orxnfgql6to32ni27zf2uv2/vmrpdvw3meiqsf22oras6imorrybogkgp6jjr3ddtreaiuutais b/torchinductor/fxgraph/te/fte5y7bccssideiluepvpscj6srf7orxnfgql6to32ni27zf2uv2/vmrpdvw3meiqsf22oras6imorrybogkgp6jjr3ddtreaiuutais
new file mode 100644
index 0000000000000000000000000000000000000000..17dbb2435826af317d46344bc3983dd025e7fbeb
--- /dev/null
+++ b/torchinductor/fxgraph/te/fte5y7bccssideiluepvpscj6srf7orxnfgql6to32ni27zf2uv2/vmrpdvw3meiqsf22oras6imorrybogkgp6jjr3ddtreaiuutais
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ab22f1d6db611109175a74412f218e8c701719467f9298ec639c480452930224
+size 205639
diff --git a/torchinductor/fxgraph/tz/ftzd5ordyehsowurkwjjpkso24gayhyplcd6wz7xdv53fad276l6/36luuy7klcmb7554z63umrezysn6xbat5wxfaadxh4clxhcn2j7 b/torchinductor/fxgraph/tz/ftzd5ordyehsowurkwjjpkso24gayhyplcd6wz7xdv53fad276l6/36luuy7klcmb7554z63umrezysn6xbat5wxfaadxh4clxhcn2j7
new file mode 100644
index 0000000000000000000000000000000000000000..322a4378fd8ab9504083cc49c4ddd315fcdd0523
--- /dev/null
+++ b/torchinductor/fxgraph/tz/ftzd5ordyehsowurkwjjpkso24gayhyplcd6wz7xdv53fad276l6/36luuy7klcmb7554z63umrezysn6xbat5wxfaadxh4clxhcn2j7
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:df974a63ea58981ff7bccfb7464499c49beb8413edae5000773f04bb9c4dd27f
+size 259431
diff --git a/torchinductor/fxgraph/u4/fu47dchf76mmiajgnawm3xgek4ysnnmqaavupgy4cddyxygid6iq/725pjrxppjygb6kbca6zklwmn7iunv65thw23b5s4im6zi27j3i b/torchinductor/fxgraph/u4/fu47dchf76mmiajgnawm3xgek4ysnnmqaavupgy4cddyxygid6iq/725pjrxppjygb6kbca6zklwmn7iunv65thw23b5s4im6zi27j3i
new file mode 100644
index 0000000000000000000000000000000000000000..da88a3f349ecadc5f4ff97bd5b99f19f443bd557
--- /dev/null
+++ b/torchinductor/fxgraph/u4/fu47dchf76mmiajgnawm3xgek4ysnnmqaavupgy4cddyxygid6iq/725pjrxppjygb6kbca6zklwmn7iunv65thw23b5s4im6zi27j3i
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:febaf4c6ef7a7060f941103d952ecc6fd146d7dd99edad87b2e219eca35f4ed0
+size 498357
diff --git a/torchinductor/fxgraph/uy/fuygegwmldon4qz3wvjs3cld4hnjz6yxh6aa2cmfsal4u3xxws43/l4w2mroymid3qdffvzt4wffavpm5it6rzi6lmbvxzzezfkbavuo b/torchinductor/fxgraph/uy/fuygegwmldon4qz3wvjs3cld4hnjz6yxh6aa2cmfsal4u3xxws43/l4w2mroymid3qdffvzt4wffavpm5it6rzi6lmbvxzzezfkbavuo
new file mode 100644
index 0000000000000000000000000000000000000000..372e7c807b708faadc27d59dd4e223b2ba9cf83d
--- /dev/null
+++ b/torchinductor/fxgraph/uy/fuygegwmldon4qz3wvjs3cld4hnjz6yxh6aa2cmfsal4u3xxws43/l4w2mroymid3qdffvzt4wffavpm5it6rzi6lmbvxzzezfkbavuo
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f2da645d86207b80ca5ae67cb14a0abd9d44fd1ca3cb606b7ce4992a820ad1d
+size 174161
diff --git a/torchinductor/fxgraph/w5/fw5vzdkweh3kv3fm3mnal4wu63gxhw2anwx2pzuved4acfz4fdzm/n5dfreyro3slkufuydw2d54nm7bfiskxjjasoyg2z5yept5c3rf b/torchinductor/fxgraph/w5/fw5vzdkweh3kv3fm3mnal4wu63gxhw2anwx2pzuved4acfz4fdzm/n5dfreyro3slkufuydw2d54nm7bfiskxjjasoyg2z5yept5c3rf
new file mode 100644
index 0000000000000000000000000000000000000000..f8d3a44d8909452ae95e74de5fba7ffe05822f3b
--- /dev/null
+++ b/torchinductor/fxgraph/w5/fw5vzdkweh3kv3fm3mnal4wu63gxhw2anwx2pzuved4acfz4fdzm/n5dfreyro3slkufuydw2d54nm7bfiskxjjasoyg2z5yept5c3rf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f4658931176e4b550b4c0eda1f78d67c25449574a412760dacf7047cfa2dc4b
+size 128456
diff --git a/torchinductor/g3/cg3nh42xzsnf5ms7nmsutyrbxuujlzqikaduzqov5wb5mzikcdj6.py b/torchinductor/g3/cg3nh42xzsnf5ms7nmsutyrbxuujlzqikaduzqov5wb5mzikcdj6.py
new file mode 100644
index 0000000000000000000000000000000000000000..22721c7f7dd98857316528b9462eee997164cd80
--- /dev/null
+++ b/torchinductor/g3/cg3nh42xzsnf5ms7nmsutyrbxuujlzqikaduzqov5wb5mzikcdj6.py
@@ -0,0 +1,67 @@
+# AOT ID: ['7_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+
+aten = torch.ops.aten  # short module-level aliases for op namespaces and helpers; only some are used in this file
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride  # used by Runner.call to check the input's size and stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor  # used by Runner.call to build its three return values
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+async_compile.wait(globals())  # no kernel is registered with async_compile in this file before this call
+del async_compile  # the name is not referenced again below
+
+class Runner:  # stores a list of partition callables and provides the compiled entry point `call`
+    def __init__(self, partitions):  # partitions is stored on the instance unchanged
+        self.partitions = partitions
+
+    def recursively_apply_fns(self, fns):  # replaces each stored partition c with fn(c), pairing fns and partitions by position
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):  # zip stops at the shorter of the two sequences
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):  # args must be a one-element list; returns a 3-tuple built from that single input
+        arg0_1, = args
+        args.clear()  # empties the caller's list in place, so the list no longer holds the input
+        assert_size_stride(arg0_1, (1, 256, 12288), (3145728, 12288, 1))
+        return (reinterpret_tensor(arg0_1, (1, 256, 4096), (3145728, 12288, 1), 0), reinterpret_tensor(arg0_1, (1, 256, 4096), (3145728, 12288, 1), 4096), reinterpret_tensor(arg0_1, (1, 256, 4096), (3145728, 12288, 1), 8192), )  # three (1, 256, 4096) results with last arguments 0, 4096, 8192 - presumably storage offsets splitting the 12288-wide last dim into thirds; confirm
+
+runner = Runner(partitions=[])  # module-level instance created with an empty partition list
+call = runner.call  # bound method exported under the module-level name `call`
+recursively_apply_fns = runner.recursively_apply_fns  # bound method exported at module level
+
+
+def benchmark_compiled_module(times=10, repeat=10):  # builds one bf16 input on cuda:0 and returns print_performance's result for `call`
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = rand_strided((1, 256, 12288), (3145728, 12288, 1), device='cuda:0', dtype=torch.bfloat16)  # same size/stride that Runner.call asserts
+    fn = lambda: call([arg0_1])  # a fresh list per invocation, because call() clears the list it receives
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":  # runs only when this generated file is executed directly
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)  # first argument is the string 'None', not the None object
diff --git a/torchinductor/gq/cgqvtct76tcr46zepwtpzklusornjsauaa2yqljz4nersnfxryln.py b/torchinductor/gq/cgqvtct76tcr46zepwtpzklusornjsauaa2yqljz4nersnfxryln.py
new file mode 100644
index 0000000000000000000000000000000000000000..7914008e7b95f2c23fbb8d37c5661b6953b0beb5
--- /dev/null
+++ b/torchinductor/gq/cgqvtct76tcr46zepwtpzklusornjsauaa2yqljz4nersnfxryln.py
@@ -0,0 +1,129 @@
+# AOT ID: ['29_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+import triton
+import triton.language as tl
+from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+
+aten = torch.ops.aten  # short module-level aliases for op namespaces and helpers; only some are used in this file
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride  # used by Runner.call to check each input's size and stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda  # used by Runner.call to allocate the output buffer
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()  # receives the Triton kernel source string registered further down
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py
+# Topologically Sorted Source Nodes: [mul, hidden_states], Original ATen: [aten.mul, aten.add]
+# Source node to ATen node mapping:
+#   hidden_states => add
+#   mul => mul
+# Graph fragment:
+#   %arg2_1 : Tensor "bf16[1, 2304, 4096][9437184, 4096, 1]cuda:0" = PlaceHolder[target=arg2_1]
+#   %arg0_1 : Tensor "bf16[1, 1, 4096][12288, 12288, 1]cuda:0" = PlaceHolder[target=arg0_1]
+#   %arg1_1 : Tensor "bf16[1, 2304, 4096][9437184, 4096, 1]cuda:0" = PlaceHolder[target=arg1_1]
+#   %mul : Tensor "bf16[1, 2304, 4096][9437184, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%arg0_1, %arg1_1), kwargs = {})
+#   %add : Tensor "bf16[1, 2304, 4096][9437184, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg2_1, %mul), kwargs = {})
+#   return %add
+triton_poi_fused_add_mul_0 = async_compile.triton('triton_poi_fused_add_mul_0', '''
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 16777216}, 
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_mul_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 3, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 75505664}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_add_mul_0(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+    xnumel = 9437184
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)
+    x2 = xindex
+    x0 = (xindex % 4096)
+    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
+    tmp1 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last').to(tl.float32)
+    tmp2 = tl.load(in_ptr2 + (x2), None).to(tl.float32)
+    tmp3 = tmp1 * tmp2
+    tmp4 = tmp0 + tmp3
+    tl.store(out_ptr0 + (x2), tmp4, None)
+''', device_str='cuda')  # the kernel source above is a string literal (out = in_ptr0 + in_ptr1[x % 4096] * in_ptr2); Runner.call launches the result via .run(...)
+
+
+async_compile.wait(globals())  # called after the kernel registration above and before Runner.call uses triton_poi_fused_add_mul_0.run
+del async_compile  # the name is not referenced again below
+
+class Runner:  # stores a list of partition callables and provides the compiled entry point `call`
+    def __init__(self, partitions):  # partitions is stored on the instance unchanged
+        self.partitions = partitions
+
+    def recursively_apply_fns(self, fns):  # replaces each stored partition c with fn(c), pairing fns and partitions by position
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):  # zip stops at the shorter of the two sequences
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):  # args must be a three-element list; returns a 1-tuple holding the newly allocated result
+        arg0_1, arg1_1, arg2_1 = args
+        args.clear()  # empties the caller's list in place, so the list no longer holds the inputs
+        assert_size_stride(arg0_1, (1, 1, 4096), (12288, 12288, 1))
+        assert_size_stride(arg1_1, (1, 2304, 4096), (9437184, 4096, 1))
+        assert_size_stride(arg2_1, (1, 2304, 4096), (9437184, 4096, 1))
+        with torch.cuda._DeviceGuard(0):  # everything below targets CUDA device index 0
+            torch.cuda.set_device(0)
+            buf0 = empty_strided_cuda((1, 2304, 4096), (9437184, 4096, 1), torch.bfloat16)  # output buffer, same size/stride as arg1_1 and arg2_1
+            # Topologically Sorted Source Nodes: [mul, hidden_states], Original ATen: [aten.mul, aten.add]
+            stream0 = get_raw_stream(0)
+            triton_poi_fused_add_mul_0.run(arg2_1, arg0_1, arg1_1, buf0, 9437184, stream=stream0)  # kernel args are (in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel), so buf0 = arg2_1 + arg0_1 * arg1_1
+            del arg0_1  # drop the local references to the inputs once the launch has been issued
+            del arg1_1
+            del arg2_1
+        return (buf0, )
+
+runner = Runner(partitions=[])  # module-level instance created with an empty partition list
+call = runner.call  # bound method exported under the module-level name `call`
+recursively_apply_fns = runner.recursively_apply_fns  # bound method exported at module level
+
+
+def benchmark_compiled_module(times=10, repeat=10):  # builds three bf16 inputs on cuda:0 and returns print_performance's result for `call`
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = rand_strided((1, 1, 4096), (12288, 12288, 1), device='cuda:0', dtype=torch.bfloat16)  # sizes/strides match the assert_size_stride checks in Runner.call
+    arg1_1 = rand_strided((1, 2304, 4096), (9437184, 4096, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg2_1 = rand_strided((1, 2304, 4096), (9437184, 4096, 1), device='cuda:0', dtype=torch.bfloat16)
+    fn = lambda: call([arg0_1, arg1_1, arg2_1])  # a fresh list per invocation, because call() clears the list it receives
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":  # runs only when this generated file is executed directly
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)  # first argument is the string 'None', not the None object
diff --git a/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py b/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py
new file mode 100644
index 0000000000000000000000000000000000000000..f21f0db40715806b347caf193f7b01984a76025b
--- /dev/null
+++ b/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py
@@ -0,0 +1,28 @@
+
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 16777216}, 
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_clone_permute_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 56623104}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_clone_permute_2(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):  # one load + one store per element: copies in_ptr0 to out_ptr0 with the index remapping below
+    xnumel = 9437184  # overwrites the xnumel argument with a constant (9437184 = 128 * 32 * 2304); not read again in this body
+    xoffset = tl.program_id(0) * XBLOCK  # first linear index handled by this program instance
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]  # XBLOCK consecutive linear output indices
+    xmask = tl.full([XBLOCK], True, tl.int1)  # all-True mask; unused, the load/store below pass None as the mask
+    x0 = (xindex % 128)  # innermost output coordinate, 0..127
+    x1 = ((xindex // 128) % 32)  # middle output coordinate, 0..31
+    x2 = xindex // 4096  # outermost output coordinate (4096 = 128 * 32)
+    x3 = xindex  # linear output index
+    tmp0 = tl.load(in_ptr0 + (x0 + 128*x2 + 294912*x1), None).to(tl.float32)  # input element strides: x0 -> 1, x2 -> 128, x1 -> 294912; loaded value is converted to float32
+    tl.store(out_ptr0 + (x3), tmp0, None)  # written at the linear index; out_ptr0 is declared '*bf16' in triton_meta above
diff --git a/torchinductor/j4/f4615e42c3266e11c9ea4b5f133496e24bb48471321801b15c4cd2969aeb5f72.best_config b/torchinductor/j4/f4615e42c3266e11c9ea4b5f133496e24bb48471321801b15c4cd2969aeb5f72.best_config
new file mode 100644
index 0000000000000000000000000000000000000000..dba0954936470862c616d97fdae31ec321ffffe2
--- /dev/null
+++ b/torchinductor/j4/f4615e42c3266e11c9ea4b5f133496e24bb48471321801b15c4cd2969aeb5f72.best_config
@@ -0,0 +1 @@
+{"XBLOCK": 1024, "num_warps": 4, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 43, "triton_cache_hash": "LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ"}
\ No newline at end of file
diff --git a/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py b/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py
new file mode 100644
index 0000000000000000000000000000000000000000..992a95b38fa088fd235ef759bf166e0b1f20cfe2
--- /dev/null
+++ b/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py
@@ -0,0 +1,64 @@
+
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 16777216}, 
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*bf16', 'out_ptr1': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 8, 'num_store': 2, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 115605504}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
+    xnumel = 9437184
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)
+    x3 = xindex
+    x0 = (xindex % 128)
+    x2 = xindex // 4096
+    x4 = xindex // 128
+    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
+    tmp2 = tl.load(in_ptr1 + (x0 + 128*x2), None, eviction_policy='evict_last')
+    tmp19 = tl.load(in_ptr2 + (x0 + 128*x2), None, eviction_policy='evict_last')
+    tmp23 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
+    tmp1 = tmp0.to(tl.float32)
+    tmp3 = tmp1 * tmp2
+    tmp4 = (x3 % 2)
+    tmp5 = tl.full([1], 0, tl.int64)
+    tmp6 = tmp4 >= tmp5
+    tmp7 = tl.full([1], 1, tl.int64)
+    tmp8 = tmp4 < tmp7
+    tmp9 = tl.load(in_ptr0 + (1 + 2*(x0 // 2) + 128*x4), tmp8, eviction_policy='evict_last', other=0.0).to(tl.float32)
+    tmp10 = -tmp9
+    tmp11 = tl.full(tmp10.shape, 0.0, tmp10.dtype)
+    tmp12 = tl.where(tmp8, tmp10, tmp11)
+    tmp13 = tmp4 >= tmp7
+    tmp14 = tl.full([1], 2, tl.int64)
+    tmp15 = tmp4 < tmp14
+    tmp16 = tl.load(in_ptr0 + (2*(x0 // 2) + 128*x4), tmp13, eviction_policy='evict_last', other=0.0).to(tl.float32)
+    tmp17 = tl.where(tmp8, tmp12, tmp16)
+    tmp18 = tmp17.to(tl.float32)
+    tmp20 = tmp18 * tmp19
+    tmp21 = tmp3 + tmp20
+    tmp22 = tmp21.to(tl.float32)
+    tmp24 = tmp23.to(tl.float32)
+    tmp25 = tmp24 * tmp2
+    tmp26 = tl.load(in_ptr3 + (1 + 2*(x0 // 2) + 128*x4), tmp8, eviction_policy='evict_last', other=0.0).to(tl.float32)
+    tmp27 = -tmp26
+    tmp28 = tl.full(tmp27.shape, 0.0, tmp27.dtype)
+    tmp29 = tl.where(tmp8, tmp27, tmp28)
+    tmp30 = tl.load(in_ptr3 + (2*(x0 // 2) + 128*x4), tmp13, eviction_policy='evict_last', other=0.0).to(tl.float32)
+    tmp31 = tl.where(tmp8, tmp29, tmp30)
+    tmp32 = tmp31.to(tl.float32)
+    tmp33 = tmp32 * tmp19
+    tmp34 = tmp25 + tmp33
+    tmp35 = tmp34.to(tl.float32)
+    tl.store(out_ptr0 + (x3), tmp22, None)
+    tl.store(out_ptr1 + (x3), tmp35, None)
diff --git a/torchinductor/j6/fc31ff8ea0e88de49e017c5dfd904cd4aaecfaa99b40a6e7893ff786df367c1f.best_config b/torchinductor/j6/fc31ff8ea0e88de49e017c5dfd904cd4aaecfaa99b40a6e7893ff786df367c1f.best_config
new file mode 100644
index 0000000000000000000000000000000000000000..cd6cae08e89399ea871f8bc0c275150fb3afa4d6
--- /dev/null
+++ b/torchinductor/j6/fc31ff8ea0e88de49e017c5dfd904cd4aaecfaa99b40a6e7893ff786df367c1f.best_config
@@ -0,0 +1 @@
+{"XBLOCK": 1024, "num_warps": 4, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 51, "triton_cache_hash": "PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ"}
\ No newline at end of file
diff --git a/torchinductor/jb/cjbidoaebunoc2ogxudwkej7mv2qtrm646yq6aqkxnupjaa2gjpm.py b/torchinductor/jb/cjbidoaebunoc2ogxudwkej7mv2qtrm646yq6aqkxnupjaa2gjpm.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc7ec964475e2e8913a5eba64af989d6dca4fd1b
--- /dev/null
+++ b/torchinductor/jb/cjbidoaebunoc2ogxudwkej7mv2qtrm646yq6aqkxnupjaa2gjpm.py
@@ -0,0 +1,69 @@
+# AOT ID: ['24_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+
+aten = torch.ops.aten
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+async_compile.wait(globals())
+del async_compile
+
+class Runner:
+    def __init__(self, partitions):
+        self.partitions = partitions
+
+    def recursively_apply_fns(self, fns):
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):
+        arg0_1, arg1_1 = args
+        args.clear()
+        assert_size_stride(arg0_1, (4096, 36864), (1, 4096))
+        assert_size_stride(arg1_1, (1, 1), (1, 1))
+        return (aten.view.dtype(reinterpret_tensor(arg0_1, (36864, 4096), (4096, 1), 0), torch.uint8), reinterpret_tensor(arg1_1, (1, ), (1, ), 0), )
+
+runner = Runner(partitions=[])
+call = runner.call
+recursively_apply_fns = runner.recursively_apply_fns
+
+
+def benchmark_compiled_module(times=10, repeat=10):
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = rand_strided((4096, 36864), (1, 4096), device='cuda:0', dtype=torch.float8_e4m3fn)
+    arg1_1 = rand_strided((1, 1), (1, 1), device='cuda:0', dtype=torch.float32)
+    fn = lambda: call([arg0_1, arg1_1])
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)
diff --git a/torchinductor/l6/cl6bgn7zrhznjdcy5hbh2jf4gtl5wugmqgueeot2jxpfdygccy32.py b/torchinductor/l6/cl6bgn7zrhznjdcy5hbh2jf4gtl5wugmqgueeot2jxpfdygccy32.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b955216b6ee1a9fe8eef5addaef9450d0f023c4
--- /dev/null
+++ b/torchinductor/l6/cl6bgn7zrhznjdcy5hbh2jf4gtl5wugmqgueeot2jxpfdygccy32.py
@@ -0,0 +1,69 @@
+# AOT ID: ['18_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+
+aten = torch.ops.aten
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+async_compile.wait(globals())
+del async_compile
+
+class Runner:
+    def __init__(self, partitions):
+        self.partitions = partitions
+
+    def recursively_apply_fns(self, fns):
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):
+        arg0_1, arg1_1 = args
+        args.clear()
+        s3 = arg0_1
+        s52 = arg1_1
+        return (s3*s52, s3, )
+
+runner = Runner(partitions=[])
+call = runner.call
+recursively_apply_fns = runner.recursively_apply_fns
+
+
+def benchmark_compiled_module(times=10, repeat=10):
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = 12288
+    arg1_1 = 2048
+    fn = lambda: call([arg0_1, arg1_1])
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)
diff --git a/torchinductor/lm/clmvm4rjx26qkbhmxge35no7qn6s4jcf5wpuzu4xdj2bl5fufzn5.py b/torchinductor/lm/clmvm4rjx26qkbhmxge35no7qn6s4jcf5wpuzu4xdj2bl5fufzn5.py
new file mode 100644
index 0000000000000000000000000000000000000000..21c5c36f25ad59c064e9c28d2bbe36bac7032a9b
--- /dev/null
+++ b/torchinductor/lm/clmvm4rjx26qkbhmxge35no7qn6s4jcf5wpuzu4xdj2bl5fufzn5.py
@@ -0,0 +1,66 @@
+# AOT ID: ['3_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+
+aten = torch.ops.aten
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+async_compile.wait(globals())
+del async_compile
+
+class Runner:
+    def __init__(self, partitions):
+        self.partitions = partitions
+
+    def recursively_apply_fns(self, fns):
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):
+        with torch.cuda._DeviceGuard(0):
+            torch.cuda.set_device(0)
+            buf0 = empty_strided_cuda((4194304, ), (1, ), torch.uint8)
+        return (buf0, )
+
+runner = Runner(partitions=[])
+call = runner.call
+recursively_apply_fns = runner.recursively_apply_fns
+
+
+def benchmark_compiled_module(times=10, repeat=10):
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    fn = lambda: call([])
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)
diff --git a/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py b/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py
new file mode 100644
index 0000000000000000000000000000000000000000..d92d5d0cc033c61120e42219ad20e6f3844e53d1
--- /dev/null
+++ b/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py
@@ -0,0 +1,37 @@
+
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 16777216}, 
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_cat_view_4', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 2, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 75497472}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_cat_view_4(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+    xnumel = 9437184
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)
+    x1 = xindex // 4096
+    x0 = (xindex % 4096)
+    x2 = xindex
+    tmp0 = x1
+    tmp1 = tl.full([1], 0, tl.int64)
+    tmp2 = tmp0 >= tmp1
+    tmp3 = tl.full([1], 256, tl.int64)
+    tmp4 = tmp0 < tmp3
+    tmp5 = tl.load(in_ptr0 + (x0 + 12288*(x1)), tmp4, other=0.0).to(tl.float32)
+    tmp6 = tmp0 >= tmp3
+    tmp7 = tl.full([1], 2304, tl.int64)
+    tmp8 = tmp0 < tmp7
+    tmp9 = tl.load(in_ptr1 + (x0 + 12288*((-256) + x1)), tmp6, other=0.0).to(tl.float32)
+    tmp10 = tl.where(tmp4, tmp5, tmp9)
+    tl.store(out_ptr0 + (x2), tmp10, None)
diff --git a/torchinductor/lp/e492dcb4532db2d7228670fc17c874efe5ebefa061173fe11c857897719b4f8a.best_config b/torchinductor/lp/e492dcb4532db2d7228670fc17c874efe5ebefa061173fe11c857897719b4f8a.best_config
new file mode 100644
index 0000000000000000000000000000000000000000..a0851c3c0a82173d9ff3ae08fb6c10a2d0294257
--- /dev/null
+++ b/torchinductor/lp/e492dcb4532db2d7228670fc17c874efe5ebefa061173fe11c857897719b4f8a.best_config
@@ -0,0 +1 @@
+{"XBLOCK": 512, "num_warps": 8, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 42, "triton_cache_hash": "P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ"}
\ No newline at end of file
diff --git a/torchinductor/m7/cm7pu7u7om2jn3kzchwei42hx5yogcrhxykjyf2psczxqpq6pqgn.py b/torchinductor/m7/cm7pu7u7om2jn3kzchwei42hx5yogcrhxykjyf2psczxqpq6pqgn.py
new file mode 100644
index 0000000000000000000000000000000000000000..81545237becae7bc96a78611831d49204aa9669c
--- /dev/null
+++ b/torchinductor/m7/cm7pu7u7om2jn3kzchwei42hx5yogcrhxykjyf2psczxqpq6pqgn.py
@@ -0,0 +1,114 @@
+# AOT ID: ['13_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+import triton
+import triton.language as tl
+from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+
+aten = torch.ops.aten
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py
+# Topologically Sorted Source Nodes: [hidden_states], Original ATen: [aten.clone]
+# Source node to ATen node mapping:
+#   hidden_states => clone
+# Graph fragment:
+#   %arg0_1 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0" = PlaceHolder[target=arg0_1]
+#   %clone : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%arg0_1,), kwargs = {})
+#   return %clone
+triton_poi_fused_clone_0 = async_compile.triton('triton_poi_fused_clone_0', '''
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 8388608}, 
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_clone_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 50331648}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_clone_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+    xnumel = 8388608
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)
+    x0 = xindex
+    tmp0 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
+    tl.store(out_ptr0 + (x0), tmp0, None)
+''', device_str='cuda')
+
+
+async_compile.wait(globals())
+del async_compile
+
+class Runner:
+    def __init__(self, partitions):
+        self.partitions = partitions
+
+    def recursively_apply_fns(self, fns):
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):
+        arg0_1, = args
+        args.clear()
+        assert_size_stride(arg0_1, (1, 2048, 4096), (8388608, 4096, 1))
+        with torch.cuda._DeviceGuard(0):
+            torch.cuda.set_device(0)
+            buf0 = empty_strided_cuda((1, 2048, 4096), (8388608, 4096, 1), torch.bfloat16)
+            # Topologically Sorted Source Nodes: [hidden_states], Original ATen: [aten.clone]
+            stream0 = get_raw_stream(0)
+            triton_poi_fused_clone_0.run(arg0_1, buf0, 8388608, stream=stream0)
+            del arg0_1
+        return (buf0, )
+
+runner = Runner(partitions=[])
+call = runner.call
+recursively_apply_fns = runner.recursively_apply_fns
+
+
+def benchmark_compiled_module(times=10, repeat=10):
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = rand_strided((1, 2048, 4096), (8388608, 4096, 1), device='cuda:0', dtype=torch.bfloat16)
+    fn = lambda: call([arg0_1])
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)
diff --git a/torchinductor/mm/cmm7cgnyfzmeqiuyvf2c3y53lnswvir5imajipe4ngulpefdv4cb.py b/torchinductor/mm/cmm7cgnyfzmeqiuyvf2c3y53lnswvir5imajipe4ngulpefdv4cb.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b0c94ad2ab2866a454a91c2a0f800016c4f4ed1
--- /dev/null
+++ b/torchinductor/mm/cmm7cgnyfzmeqiuyvf2c3y53lnswvir5imajipe4ngulpefdv4cb.py
@@ -0,0 +1,69 @@
+# AOT ID: ['12_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+
+aten = torch.ops.aten
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+async_compile.wait(globals())
+del async_compile
+
+class Runner:
+    def __init__(self, partitions):
+        self.partitions = partitions
+
+    def recursively_apply_fns(self, fns):
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):
+        arg0_1, arg1_1 = args
+        args.clear()
+        assert_size_stride(arg0_1, (4096, 4096), (1, 4096))
+        assert_size_stride(arg1_1, (1, 1), (1, 1))
+        return (aten.view.dtype(reinterpret_tensor(arg0_1, (4096, 4096), (4096, 1), 0), torch.uint8), reinterpret_tensor(arg1_1, (1, ), (1, ), 0), )
+
+runner = Runner(partitions=[])
+call = runner.call
+recursively_apply_fns = runner.recursively_apply_fns
+
+
+def benchmark_compiled_module(times=10, repeat=10):
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = rand_strided((4096, 4096), (1, 4096), device='cuda:0', dtype=torch.float8_e4m3fn)
+    arg1_1 = rand_strided((1, 1), (1, 1), device='cuda:0', dtype=torch.float32)
+    fn = lambda: call([arg0_1, arg1_1])
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)
diff --git a/torchinductor/nm/cnmrfsgnv5zogcysnfemjfhv46bzzfsonihlzpnx6hdtwdd5t3pl.py b/torchinductor/nm/cnmrfsgnv5zogcysnfemjfhv46bzzfsonihlzpnx6hdtwdd5t3pl.py
new file mode 100644
index 0000000000000000000000000000000000000000..4beab6ba6cd9510e88c36f2510bdfd5565ea494f
--- /dev/null
+++ b/torchinductor/nm/cnmrfsgnv5zogcysnfemjfhv46bzzfsonihlzpnx6hdtwdd5t3pl.py
@@ -0,0 +1,184 @@
+# AOT ID: ['23_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+import triton
+import triton.language as tl
+from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+
+aten = torch.ops.aten
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py
+# Topologically Sorted Source Nodes: [norm_hidden_states, add, mul, norm_hidden_states_1], Original ATen: [aten.native_layer_norm, aten.add, aten.mul]
+# Source node to ATen node mapping:
+#   add => add_1
+#   mul => mul_1
+#   norm_hidden_states => add, convert_element_type, convert_element_type_1, mul, rsqrt, sub, var_mean
+#   norm_hidden_states_1 => add_2
+# Graph fragment:
+#   %arg0_1 : Tensor "bf16[1, 2304, 4096][9437184, 4096, 1]cuda:0" = PlaceHolder[target=arg0_1]
+#   %arg1_1 : Tensor "bf16[1, 1, 4096][12288, 12288, 1]cuda:0" = PlaceHolder[target=arg1_1]
+#   %getitem_1 : Tensor "f32[1, 2304, 1][2304, 1, 2304]cuda:0" = PlaceHolder[target=getitem_1]
+#   %buf1 : Tensor "f32[1, 2304, 1][2304, 1, 2304]cuda:0" = PlaceHolder[target=buf1]
+#   %arg2_1 : Tensor "bf16[1, 1, 4096][12288, 12288, 1]cuda:0" = PlaceHolder[target=arg2_1]
+#   %convert_element_type : Tensor "f32[1, 2304, 4096][9437184, 4096, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%arg0_1, torch.float32), kwargs = {})
+#   %var_mean : [num_users=2] = call_function[target=torch.ops.aten.var_mean.correction](args = (%convert_element_type, [2]), kwargs = {correction: 0, keepdim: True})
+#   %add_1 : Tensor "bf16[1, 1, 4096][4096, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg1_1, 1), kwargs = {})
+#   %sub : Tensor "f32[1, 2304, 4096][9437184, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%convert_element_type, %getitem_1), kwargs = {})
+#   %add : Tensor "f32[1, 2304, 1][2304, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%getitem, 1e-06), kwargs = {})
+#   %rsqrt : Tensor "f32[1, 2304, 1][2304, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add,), kwargs = {})
+#   %mul : Tensor "f32[1, 2304, 4096][9437184, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub, %rsqrt), kwargs = {})
+#   %convert_element_type_1 : Tensor "bf16[1, 2304, 4096][9437184, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul, torch.bfloat16), kwargs = {})
+#   %mul_1 : Tensor "bf16[1, 2304, 4096][9437184, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add_1, %convert_element_type_1), kwargs = {})
+#   %add_2 : Tensor "bf16[1, 2304, 4096][9437184, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_1, %arg2_1), kwargs = {})
+#   return %getitem_1,%buf1,%add_2
+triton_red_fused_add_mul_native_layer_norm_0 = async_compile.triton('triton_red_fused_add_mul_native_layer_norm_0', '''
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.reduction(
+    size_hints={'x': 4096, 'r0_': 4096},
+    reduction_hint=ReductionHint.INNER,
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr2': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_add_mul_native_layer_norm_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 4, 'num_store': 1, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'add_persistent_rblock': True, 'tiling_scores': {'x': 0, 'r0_': 56639488}}
+)
+@triton.jit
+def triton_red_fused_add_mul_native_layer_norm_0(in_ptr0, in_ptr1, in_ptr2, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
+    xnumel = 2304
+    r0_numel = 4096
+    rnumel = r0_numel
+    RBLOCK: tl.constexpr = R0_BLOCK
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+    xmask = xindex < xnumel
+    r0_base = tl.arange(0, R0_BLOCK)[None, :]
+    rbase = r0_base
+    x0 = xindex
+    tmp3_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp3_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp3_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp1 = tmp0.to(tl.float32)
+        tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK])
+        tmp3_mean_next, tmp3_m2_next, tmp3_weight_next = triton_helpers.welford_reduce(
+            tmp2, tmp3_mean, tmp3_m2, tmp3_weight, roffset == 0
+        )
+        tmp3_mean = tl.where(r0_mask & xmask, tmp3_mean_next, tmp3_mean)
+        tmp3_m2 = tl.where(r0_mask & xmask, tmp3_m2_next, tmp3_m2)
+        tmp3_weight = tl.where(r0_mask & xmask, tmp3_weight_next, tmp3_weight)
+    tmp4, tmp5, tmp6 = triton_helpers.welford(tmp3_mean, tmp3_m2, tmp3_weight, 1)
+    tmp3 = tmp4[:, None]
+    tmp7 = tmp5[:, None]
+    tmp8 = tmp6[:, None]
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp9 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp12 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp23 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp10 = 1.0
+        tmp11 = tmp9 + tmp10
+        tmp13 = tmp12.to(tl.float32)
+        tmp14 = tmp13 - tmp3
+        tmp15 = 4096.0
+        tmp16 = (tmp7 / tmp15)
+        tmp17 = 1e-06
+        tmp18 = tmp16 + tmp17
+        tmp19 = libdevice.rsqrt(tmp18)
+        tmp20 = tmp14 * tmp19
+        tmp21 = tmp20.to(tl.float32)
+        tmp22 = tmp11 * tmp21
+        tmp24 = tmp22 + tmp23
+        tl.store(out_ptr2 + (r0_1 + 4096*x0), tmp24, r0_mask & xmask)
+''', device_str='cuda')
+
+
+async_compile.wait(globals())
+del async_compile
+
+class Runner:
+    def __init__(self, partitions):
+        self.partitions = partitions
+
+    def recursively_apply_fns(self, fns):
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):
+        arg0_1, arg1_1, arg2_1 = args
+        args.clear()
+        assert_size_stride(arg0_1, (1, 2304, 4096), (9437184, 4096, 1))
+        assert_size_stride(arg1_1, (1, 1, 4096), (12288, 12288, 1))
+        assert_size_stride(arg2_1, (1, 1, 4096), (12288, 12288, 1))
+        with torch.cuda._DeviceGuard(0):
+            torch.cuda.set_device(0)
+            buf3 = empty_strided_cuda((1, 2304, 4096), (9437184, 4096, 1), torch.bfloat16)
+            # Topologically Sorted Source Nodes: [norm_hidden_states, add, mul, norm_hidden_states_1], Original ATen: [aten.native_layer_norm, aten.add, aten.mul]
+            stream0 = get_raw_stream(0)
+            triton_red_fused_add_mul_native_layer_norm_0.run(arg0_1, arg1_1, arg2_1, buf3, 2304, 4096, stream=stream0)
+            del arg0_1
+            del arg1_1
+            del arg2_1
+        return (buf3, )
+
+runner = Runner(partitions=[])
+call = runner.call
+recursively_apply_fns = runner.recursively_apply_fns
+
+
+def benchmark_compiled_module(times=10, repeat=10):
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = rand_strided((1, 2304, 4096), (9437184, 4096, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg1_1 = rand_strided((1, 1, 4096), (12288, 12288, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg2_1 = rand_strided((1, 1, 4096), (12288, 12288, 1), device='cuda:0', dtype=torch.bfloat16)
+    fn = lambda: call([arg0_1, arg1_1, arg2_1])
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)
diff --git a/torchinductor/od/cod27kpzzzmh5ox7u6e6gvblzejrgxtrpbq2umbsyzgpvwzamxjx.py b/torchinductor/od/cod27kpzzzmh5ox7u6e6gvblzejrgxtrpbq2umbsyzgpvwzamxjx.py
new file mode 100644
index 0000000000000000000000000000000000000000..99ab7d031726aa9c6576aabb9f9e0fbfc69fd3f3
--- /dev/null
+++ b/torchinductor/od/cod27kpzzzmh5ox7u6e6gvblzejrgxtrpbq2umbsyzgpvwzamxjx.py
@@ -0,0 +1,250 @@
+# AOT ID: ['26_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+import triton
+import triton.language as tl
+from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+
+aten = torch.ops.aten
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py
+# Topologically Sorted Source Nodes: [permute, q, permute_1, k, permute_2, v, output], Original ATen: [aten.permute, aten.clone, aten._scaled_dot_product_cudnn_attention]
+# Source node to ATen node mapping:
+#   k => clone_1
+#   output => _scaled_dot_product_cudnn_attention
+#   permute => permute
+#   permute_1 => permute_1
+#   permute_2 => permute_2
+#   q => clone
+#   v => clone_2
+# Graph fragment:
+#   %arg0_1 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0" = PlaceHolder[target=arg0_1]
+#   %permute : Tensor "bf16[1, 32, 2304, 128][9437184, 128, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%arg0_1, [0, 2, 1, 3]), kwargs = {})
+#   %clone : Tensor "bf16[1, 32, 2304, 128][9437184, 294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%permute,), kwargs = {memory_format: torch.contiguous_format})
+#   %permute_1 : Tensor "bf16[1, 32, 2304, 128][9437184, 128, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%arg1_1, [0, 2, 1, 3]), kwargs = {})
+#   %clone_1 : Tensor "bf16[1, 32, 2304, 128][9437184, 294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%permute_1,), kwargs = {memory_format: torch.contiguous_format})
+#   %permute_2 : Tensor "bf16[1, 32, 2304, 128][2304*s73, 128, s73, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%arg3_1, [0, 2, 1, 3]), kwargs = {})
+#   %clone_2 : Tensor "bf16[1, 32, 2304, 128][9437184, 294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%permute_2,), kwargs = {memory_format: torch.contiguous_format})
+#   %_scaled_dot_product_cudnn_attention : [num_users=1] = call_function[target=torch.ops.aten._scaled_dot_product_cudnn_attention.default](args = (%clone, %clone_1, %clone_2, None, False), kwargs = {})
+#   return %buf0
+triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0 = async_compile.triton('triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0', '''
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 16777216}, 
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 56623104}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+    xnumel = 9437184
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)
+    x0 = (xindex % 128)
+    x1 = ((xindex // 128) % 2304)
+    x2 = xindex // 294912
+    x3 = xindex
+    tmp0 = tl.load(in_ptr0 + (x0 + 128*x2 + 4096*x1), None).to(tl.float32)
+    tl.store(out_ptr0 + (x3), tmp0, None)
+''', device_str='cuda')
+
+
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py
+# Topologically Sorted Source Nodes: [permute, q, permute_1, k, permute_2, v, output], Original ATen: [aten.permute, aten.clone, aten._scaled_dot_product_cudnn_attention]
+# Source node to ATen node mapping:
+#   k => clone_1
+#   output => _scaled_dot_product_cudnn_attention
+#   permute => permute
+#   permute_1 => permute_1
+#   permute_2 => permute_2
+#   q => clone
+#   v => clone_2
+# Graph fragment:
+#   %arg3_1 : Tensor "bf16[1, 2304, 32, 128][2304*s73, s73, 128, 1]cuda:0" = PlaceHolder[target=arg3_1]
+#   %permute : Tensor "bf16[1, 32, 2304, 128][9437184, 128, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%arg0_1, [0, 2, 1, 3]), kwargs = {})
+#   %clone : Tensor "bf16[1, 32, 2304, 128][9437184, 294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%permute,), kwargs = {memory_format: torch.contiguous_format})
+#   %permute_1 : Tensor "bf16[1, 32, 2304, 128][9437184, 128, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%arg1_1, [0, 2, 1, 3]), kwargs = {})
+#   %clone_1 : Tensor "bf16[1, 32, 2304, 128][9437184, 294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%permute_1,), kwargs = {memory_format: torch.contiguous_format})
+#   %permute_2 : Tensor "bf16[1, 32, 2304, 128][2304*s73, 128, s73, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%arg3_1, [0, 2, 1, 3]), kwargs = {})
+#   %clone_2 : Tensor "bf16[1, 32, 2304, 128][9437184, 294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%permute_2,), kwargs = {memory_format: torch.contiguous_format})
+#   %_scaled_dot_product_cudnn_attention : [num_users=1] = call_function[target=torch.ops.aten._scaled_dot_product_cudnn_attention.default](args = (%clone, %clone_1, %clone_2, None, False), kwargs = {})
+#   return %buf2
+triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1 = async_compile.triton('triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1', '''
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 16777216}, 
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'ks0': 'i64', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 37748736}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1(in_ptr0, out_ptr0, ks0, xnumel, XBLOCK : tl.constexpr):
+    xnumel = 9437184
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)
+    x0 = (xindex % 128)
+    x1 = ((xindex // 128) % 2304)
+    x2 = xindex // 294912
+    x3 = xindex
+    tmp0 = tl.load(in_ptr0 + (x0 + 128*x2 + ks0*x1), None).to(tl.float32)
+    tl.store(out_ptr0 + (x3), tmp0, None)
+''', device_str='cuda')
+
+
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py
+# Topologically Sorted Source Nodes: [permute_3, out], Original ATen: [aten.permute, aten.clone]
+# Source node to ATen node mapping:
+#   out => clone_3
+#   permute_3 => permute_3
+# Graph fragment:
+#   %getitem : Tensor "bf16[1, 32, 2304, 128][9437184, 294912, 128, 1]cuda:0" = PlaceHolder[target=getitem]
+#   %permute_3 : Tensor "bf16[1, 2304, 32, 128][9437184, 128, 294912, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%getitem, [0, 2, 1, 3]), kwargs = {})
+#   %clone_3 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%permute_3,), kwargs = {memory_format: torch.contiguous_format})
+#   return %clone_3
+triton_poi_fused_clone_permute_2 = async_compile.triton('triton_poi_fused_clone_permute_2', '''
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 16777216}, 
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_clone_permute_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 56623104}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_clone_permute_2(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+    xnumel = 9437184
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)
+    x0 = (xindex % 128)
+    x1 = ((xindex // 128) % 32)
+    x2 = xindex // 4096
+    x3 = xindex
+    tmp0 = tl.load(in_ptr0 + (x0 + 128*x2 + 294912*x1), None).to(tl.float32)
+    tl.store(out_ptr0 + (x3), tmp0, None)
+''', device_str='cuda')
+
+
+async_compile.wait(globals())
+del async_compile
+
+class Runner:
+    def __init__(self, partitions):
+        self.partitions = partitions
+
+    def recursively_apply_fns(self, fns):
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):
+        arg0_1, arg1_1, arg2_1, arg3_1 = args
+        args.clear()
+        s73 = arg2_1
+        assert_size_stride(arg0_1, (1, 2304, 32, 128), (9437184, 4096, 128, 1))
+        assert_size_stride(arg1_1, (1, 2304, 32, 128), (9437184, 4096, 128, 1))
+        assert_size_stride(arg3_1, (1, 2304, 32, 128), (2304*s73, s73, 128, 1))
+        with torch.cuda._DeviceGuard(0):
+            torch.cuda.set_device(0)
+            buf0 = empty_strided_cuda((1, 32, 2304, 128), (9437184, 294912, 128, 1), torch.bfloat16)
+            # Topologically Sorted Source Nodes: [permute, q, permute_1, k, permute_2, v, output], Original ATen: [aten.permute, aten.clone, aten._scaled_dot_product_cudnn_attention]
+            stream0 = get_raw_stream(0)
+            triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.run(arg0_1, buf0, 9437184, stream=stream0)
+            del arg0_1
+            buf1 = empty_strided_cuda((1, 32, 2304, 128), (9437184, 294912, 128, 1), torch.bfloat16)
+            # Topologically Sorted Source Nodes: [permute, q, permute_1, k, permute_2, v, output], Original ATen: [aten.permute, aten.clone, aten._scaled_dot_product_cudnn_attention]
+            stream0 = get_raw_stream(0)
+            triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.run(arg1_1, buf1, 9437184, stream=stream0)
+            del arg1_1
+            buf2 = empty_strided_cuda((1, 32, 2304, 128), (9437184, 294912, 128, 1), torch.bfloat16)
+            # Topologically Sorted Source Nodes: [permute, q, permute_1, k, permute_2, v, output], Original ATen: [aten.permute, aten.clone, aten._scaled_dot_product_cudnn_attention]
+            stream0 = get_raw_stream(0)
+            triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.run(arg3_1, buf2, s73, 9437184, stream=stream0)
+            del arg3_1
+            # Topologically Sorted Source Nodes: [permute, q, permute_1, k, permute_2, v, output], Original ATen: [aten.permute, aten.clone, aten._scaled_dot_product_cudnn_attention]
+            buf3 = torch.ops.aten._scaled_dot_product_cudnn_attention.default(buf0, buf1, buf2, None, False)
+            del buf0
+            del buf1
+            buf4 = buf3[0]
+            assert_size_stride(buf4, (1, 32, 2304, 128), (9437184, 294912, 128, 1), 'torch.ops.aten._scaled_dot_product_cudnn_attention.default')
+            assert_alignment(buf4, 16, 'torch.ops.aten._scaled_dot_product_cudnn_attention.default')
+            del buf3
+            buf8 = reinterpret_tensor(buf2, (1, 2304, 32, 128), (9437184, 4096, 128, 1), 0); del buf2  # reuse
+            # Topologically Sorted Source Nodes: [permute_3, out], Original ATen: [aten.permute, aten.clone]
+            stream0 = get_raw_stream(0)
+            triton_poi_fused_clone_permute_2.run(buf4, buf8, 9437184, stream=stream0)
+            del buf4
+        return (buf8, )
+
+runner = Runner(partitions=[])
+call = runner.call
+recursively_apply_fns = runner.recursively_apply_fns
+
+
+def benchmark_compiled_module(times=10, repeat=10):
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = rand_strided((1, 2304, 32, 128), (9437184, 4096, 128, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg1_1 = rand_strided((1, 2304, 32, 128), (9437184, 4096, 128, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg2_1 = 36864
+    arg3_1 = rand_strided((1, 2304, 32, 128), (84934656, 36864, 128, 1), device='cuda:0', dtype=torch.bfloat16)
+    fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1])
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)
diff --git a/torchinductor/pg/8fb4988f5104615f769d2dd3f2407e6d419f6d4936f5f3f5ac38851533607a15.best_config b/torchinductor/pg/8fb4988f5104615f769d2dd3f2407e6d419f6d4936f5f3f5ac38851533607a15.best_config
new file mode 100644
index 0000000000000000000000000000000000000000..76c99916a5947bdea0009b5df465a376f7040f79
--- /dev/null
+++ b/torchinductor/pg/8fb4988f5104615f769d2dd3f2407e6d419f6d4936f5f3f5ac38851533607a15.best_config
@@ -0,0 +1 @@
+{"XBLOCK": 1, "R0_BLOCK": 4096, "num_warps": 16, "num_stages": 1, "configs_hash": "ba27f374f6982634f1ab959ad1e63f726920cfc2c7c821f8e68ec55c3d4d94fc", "found_by_coordesc": false, "time_taken_ms": 41, "triton_cache_hash": "U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A"}
\ No newline at end of file
diff --git a/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py b/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c4d36f9ce26693ed156cf6ffe8986dad375879
--- /dev/null
+++ b/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py
@@ -0,0 +1,73 @@
+
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.reduction(
+    size_hints={'x': 4096, 'r0_': 4096},
+    reduction_hint=ReductionHint.INNER,
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr2': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_add_mul_native_layer_norm_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 4, 'num_store': 1, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'add_persistent_rblock': True, 'tiling_scores': {'x': 0, 'r0_': 56639488}}
+)
+@triton.jit
+def triton_red_fused_add_mul_native_layer_norm_0(in_ptr0, in_ptr1, in_ptr2, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
+    xnumel = 2304
+    r0_numel = 4096
+    rnumel = r0_numel
+    RBLOCK: tl.constexpr = R0_BLOCK
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+    xmask = xindex < xnumel
+    r0_base = tl.arange(0, R0_BLOCK)[None, :]
+    rbase = r0_base
+    x0 = xindex
+    tmp3_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp3_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp3_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp1 = tmp0.to(tl.float32)
+        tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK])
+        tmp3_mean_next, tmp3_m2_next, tmp3_weight_next = triton_helpers.welford_reduce(
+            tmp2, tmp3_mean, tmp3_m2, tmp3_weight, roffset == 0
+        )
+        tmp3_mean = tl.where(r0_mask & xmask, tmp3_mean_next, tmp3_mean)
+        tmp3_m2 = tl.where(r0_mask & xmask, tmp3_m2_next, tmp3_m2)
+        tmp3_weight = tl.where(r0_mask & xmask, tmp3_weight_next, tmp3_weight)
+    tmp4, tmp5, tmp6 = triton_helpers.welford(tmp3_mean, tmp3_m2, tmp3_weight, 1)
+    tmp3 = tmp4[:, None]
+    tmp7 = tmp5[:, None]
+    tmp8 = tmp6[:, None]
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp9 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp12 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp23 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp10 = 1.0
+        tmp11 = tmp9 + tmp10
+        tmp13 = tmp12.to(tl.float32)
+        tmp14 = tmp13 - tmp3
+        tmp15 = 4096.0
+        tmp16 = (tmp7 / tmp15)
+        tmp17 = 1e-06
+        tmp18 = tmp16 + tmp17
+        tmp19 = libdevice.rsqrt(tmp18)
+        tmp20 = tmp14 * tmp19
+        tmp21 = tmp20.to(tl.float32)
+        tmp22 = tmp11 * tmp21
+        tmp24 = tmp22 + tmp23
+        tl.store(out_ptr2 + (r0_1 + 4096*x0), tmp24, r0_mask & xmask)
diff --git a/torchinductor/pr/cprazirlfgitxz6pco2dilxlbw7te34agf7yp3dftuyge55edpsr.py b/torchinductor/pr/cprazirlfgitxz6pco2dilxlbw7te34agf7yp3dftuyge55edpsr.py
new file mode 100644
index 0000000000000000000000000000000000000000..572568ff7523f5d2306706e8104a0276a0a90263
--- /dev/null
+++ b/torchinductor/pr/cprazirlfgitxz6pco2dilxlbw7te34agf7yp3dftuyge55edpsr.py
@@ -0,0 +1,203 @@
+# AOT ID: ['14_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+import triton
+import triton.language as tl
+from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+
+aten = torch.ops.aten
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py
+# Topologically Sorted Source Nodes: [attn_output, hidden_states, norm_hidden_states, add_1, mul_1, norm_hidden_states_1], Original ATen: [aten.mul, aten.add, aten.native_layer_norm]
+# Source node to ATen node mapping:
+#   add_1 => add_2
+#   attn_output => mul
+#   hidden_states => add
+#   mul_1 => mul_2
+#   norm_hidden_states => add_1, convert_element_type, convert_element_type_1, mul_1, rsqrt, sub, var_mean
+#   norm_hidden_states_1 => add_3
+# Graph fragment:
+#   %arg2_1 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0" = PlaceHolder[target=arg2_1]
+#   %arg0_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg0_1]
+#   %arg1_1 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0" = PlaceHolder[target=arg1_1]
+#   %add : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0" = PlaceHolder[target=add]
+#   %getitem_1 : Tensor "f32[1, 2048, 1][2048, 1, 2048]cuda:0" = PlaceHolder[target=getitem_1]
+#   %buf2 : Tensor "f32[1, 2048, 1][2048, 1, 2048]cuda:0" = PlaceHolder[target=buf2]
+#   %arg3_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg3_1]
+#   %arg4_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg4_1]
+#   %mul : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%arg0_1, %arg1_1), kwargs = {})
+#   %add : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg2_1, %mul), kwargs = {})
+#   %convert_element_type : Tensor "f32[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%add, torch.float32), kwargs = {})
+#   %var_mean : [num_users=2] = call_function[target=torch.ops.aten.var_mean.correction](args = (%convert_element_type, [2]), kwargs = {correction: 0, keepdim: True})
+#   %sub : Tensor "f32[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%convert_element_type, %getitem_1), kwargs = {})
+#   %add_1 : Tensor "f32[1, 2048, 1][2048, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%getitem, 1e-06), kwargs = {})
+#   %rsqrt : Tensor "f32[1, 2048, 1][2048, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add_1,), kwargs = {})
+#   %mul_1 : Tensor "f32[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub, %rsqrt), kwargs = {})
+#   %convert_element_type_1 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_1, torch.bfloat16), kwargs = {})
+#   %add_2 : Tensor "bf16[1, 1, 4096][4096, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg3_1, 1), kwargs = {})
+#   %mul_2 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_1, %add_2), kwargs = {})
+#   %add_3 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_2, %arg4_1), kwargs = {})
+#   return %add,%getitem_1,%buf2,%add_3
+triton_red_fused_add_mul_native_layer_norm_0 = async_compile.triton('triton_red_fused_add_mul_native_layer_norm_0', '''
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.reduction(
+    size_hints={'x': 2048, 'r0_': 4096},
+    reduction_hint=ReductionHint.INNER,
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*bf16', 'out_ptr0': '*bf16', 'out_ptr3': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_add_mul_native_layer_norm_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 6, 'num_store': 2, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'add_persistent_rblock': True, 'tiling_scores': {'x': 0, 'r0_': 100687872}}
+)
+@triton.jit
+def triton_red_fused_add_mul_native_layer_norm_0(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr3, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
+    xnumel = 2048
+    r0_numel = 4096
+    rnumel = r0_numel
+    RBLOCK: tl.constexpr = R0_BLOCK
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+    xmask = xindex < xnumel
+    r0_base = tl.arange(0, R0_BLOCK)[None, :]
+    rbase = r0_base
+    x0 = xindex
+    tmp7_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp7_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp7_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp1 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp2 = tl.load(in_ptr2 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp3 = tmp1 * tmp2
+        tmp4 = tmp0 + tmp3
+        tmp5 = tmp4.to(tl.float32)
+        tmp6 = tl.broadcast_to(tmp5, [XBLOCK, R0_BLOCK])
+        tmp7_mean_next, tmp7_m2_next, tmp7_weight_next = triton_helpers.welford_reduce(
+            tmp6, tmp7_mean, tmp7_m2, tmp7_weight, roffset == 0
+        )
+        tmp7_mean = tl.where(r0_mask & xmask, tmp7_mean_next, tmp7_mean)
+        tmp7_m2 = tl.where(r0_mask & xmask, tmp7_m2_next, tmp7_m2)
+        tmp7_weight = tl.where(r0_mask & xmask, tmp7_weight_next, tmp7_weight)
+        tl.store(out_ptr0 + (r0_1 + 4096*x0), tmp4, r0_mask & xmask)
+    tmp8, tmp9, tmp10 = triton_helpers.welford(tmp7_mean, tmp7_m2, tmp7_weight, 1)
+    tmp7 = tmp8[:, None]
+    tmp11 = tmp9[:, None]
+    tmp12 = tmp10[:, None]
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp13 = tl.load(out_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp23 = tl.load(in_ptr3 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp27 = tl.load(in_ptr4 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp14 = tmp13.to(tl.float32)
+        tmp15 = tmp14 - tmp7
+        tmp16 = 4096.0
+        tmp17 = (tmp11 / tmp16)
+        tmp18 = 1e-06
+        tmp19 = tmp17 + tmp18
+        tmp20 = libdevice.rsqrt(tmp19)
+        tmp21 = tmp15 * tmp20
+        tmp22 = tmp21.to(tl.float32)
+        tmp24 = 1.0
+        tmp25 = tmp23 + tmp24
+        tmp26 = tmp22 * tmp25
+        tmp28 = tmp26 + tmp27
+        tl.store(out_ptr3 + (r0_1 + 4096*x0), tmp28, r0_mask & xmask)
+''', device_str='cuda')
+
+
+async_compile.wait(globals())
+del async_compile
+
+class Runner:
+    def __init__(self, partitions):
+        self.partitions = partitions
+
+    def recursively_apply_fns(self, fns):
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):
+        arg0_1, arg1_1, arg2_1, arg3_1, arg4_1 = args
+        args.clear()
+        assert_size_stride(arg0_1, (1, 1, 4096), (24576, 24576, 1))
+        assert_size_stride(arg1_1, (1, 2048, 4096), (8388608, 4096, 1))
+        assert_size_stride(arg2_1, (1, 2048, 4096), (8388608, 4096, 1))
+        assert_size_stride(arg3_1, (1, 1, 4096), (24576, 24576, 1))
+        assert_size_stride(arg4_1, (1, 1, 4096), (24576, 24576, 1))
+        with torch.cuda._DeviceGuard(0):
+            torch.cuda.set_device(0)
+            buf0 = empty_strided_cuda((1, 2048, 4096), (8388608, 4096, 1), torch.bfloat16)
+            buf4 = empty_strided_cuda((1, 2048, 4096), (8388608, 4096, 1), torch.bfloat16)
+            # Topologically Sorted Source Nodes: [attn_output, hidden_states, norm_hidden_states, add_1, mul_1, norm_hidden_states_1], Original ATen: [aten.mul, aten.add, aten.native_layer_norm]
+            stream0 = get_raw_stream(0)
+            triton_red_fused_add_mul_native_layer_norm_0.run(arg2_1, arg0_1, arg1_1, arg3_1, arg4_1, buf0, buf4, 2048, 4096, stream=stream0)
+            del arg0_1
+            del arg1_1
+            del arg2_1
+            del arg3_1
+            del arg4_1
+        return (buf4, buf0, )
+
+runner = Runner(partitions=[])
+call = runner.call
+recursively_apply_fns = runner.recursively_apply_fns
+
+
+def benchmark_compiled_module(times=10, repeat=10):
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg1_1 = rand_strided((1, 2048, 4096), (8388608, 4096, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg2_1 = rand_strided((1, 2048, 4096), (8388608, 4096, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg3_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg4_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16)
+    fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1])
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)
diff --git a/torchinductor/py/cpymuollhpudssjtecoz4ymxklqd6cy3cy7ayt7z4vptnib66447.py b/torchinductor/py/cpymuollhpudssjtecoz4ymxklqd6cy3cy7ayt7z4vptnib66447.py
new file mode 100644
index 0000000000000000000000000000000000000000..1341cf6f6209c5e0341d4ff07f5041d338661d0d
--- /dev/null
+++ b/torchinductor/py/cpymuollhpudssjtecoz4ymxklqd6cy3cy7ayt7z4vptnib66447.py
@@ -0,0 +1,560 @@
+# AOT ID: ['8_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+import triton
+import triton.language as tl
+from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+
+aten = torch.ops.aten
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py
+# Topologically Sorted Source Nodes: [encoder_query, encoder_query_1], Original ATen: [aten.view, aten._fused_rms_norm]
+# Source node to ATen node mapping:
+#   encoder_query => view_3
+#   encoder_query_1 => convert_element_type_4, mean_2, pow_3
+# Graph fragment:
+#   %arg5_1 : Tensor "bf16[1, 256, 4096][3145728, 12288, 1]cuda:0" = PlaceHolder[target=arg5_1]
+#   %view_3 : Tensor "bf16[1, 256, 32, 128][3145728, 12288, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%arg5_1, [1, 256, 32, 128]), kwargs = {})
+#   %convert_element_type_4 : Tensor "f32[1, 256, 32, 128][1048576, 4096, 128, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_3, torch.float32), kwargs = {})
+#   %pow_3 : Tensor "f32[1, 256, 32, 128][1048576, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.pow.Tensor_Scalar](args = (%convert_element_type_4, 2), kwargs = {})
+#   %mean_2 : Tensor "f32[1, 256, 32, 1][8192, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mean.dim](args = (%pow_3, [3], True), kwargs = {})
+#   return %buf0
+triton_red_fused__fused_rms_norm_view_0 = async_compile.triton('triton_red_fused__fused_rms_norm_view_0', '''
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.reduction(
+    size_hints={'x': 8192, 'r0_': 128},
+    reduction_hint=ReductionHint.DEFAULT,
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused__fused_rms_norm_view_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 1, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 65536, 'r0_': 2097152}}
+)
+@triton.jit
+def triton_red_fused__fused_rms_norm_view_0(in_ptr0, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
+    xnumel = 8192
+    r0_numel = 128
+    rnumel = r0_numel
+    RBLOCK: tl.constexpr = R0_BLOCK
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+    xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
+    r0_base = tl.arange(0, R0_BLOCK)[None, :]
+    rbase = r0_base
+    x0 = (xindex % 32)
+    x1 = xindex // 32
+    _tmp4 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
+    x3 = xindex
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_2 = r0_index
+        tmp0 = tl.load(in_ptr0 + (r0_2 + 128*x0 + 12288*x1), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp1 = tmp0.to(tl.float32)
+        tmp2 = tmp1 * tmp1
+        tmp3 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK])
+        tmp5 = _tmp4 + tmp3
+        _tmp4 = tl.where(r0_mask, tmp5, _tmp4)
+    tmp4 = tl.sum(_tmp4, 1)[:, None]
+    tl.store(out_ptr0 + (x3), tmp4, None)
+''', device_str='cuda')
+
+
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py
+# Topologically Sorted Source Nodes: [query, query_1], Original ATen: [aten.view, aten._fused_rms_norm]
+# Source node to ATen node mapping:
+#   query => view
+#   query_1 => convert_element_type, mean, pow_1
+# Graph fragment:
+#   %arg0_1 : Tensor "bf16[1, 2048, 4096][25165824, 12288, 1]cuda:0" = PlaceHolder[target=arg0_1]
+#   %view : Tensor "bf16[1, 2048, 32, 128][25165824, 12288, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%arg0_1, [1, 2048, 32, 128]), kwargs = {})
+#   %convert_element_type : Tensor "f32[1, 2048, 32, 128][8388608, 4096, 128, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view, torch.float32), kwargs = {})
+#   %pow_1 : Tensor "f32[1, 2048, 32, 128][8388608, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.pow.Tensor_Scalar](args = (%convert_element_type, 2), kwargs = {})
+#   %mean : Tensor "f32[1, 2048, 32, 1][65536, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mean.dim](args = (%pow_1, [3], True), kwargs = {})
+#   return %buf1
+triton_red_fused__fused_rms_norm_view_1 = async_compile.triton('triton_red_fused__fused_rms_norm_view_1', '''
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.reduction(
+    size_hints={'x': 65536, 'r0_': 128},
+    reduction_hint=ReductionHint.DEFAULT,
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused__fused_rms_norm_view_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 1, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 524288, 'r0_': 16777216}}
+)
+@triton.jit
+def triton_red_fused__fused_rms_norm_view_1(in_ptr0, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
+    xnumel = 65536
+    r0_numel = 128
+    rnumel = r0_numel
+    RBLOCK: tl.constexpr = R0_BLOCK
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+    xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
+    r0_base = tl.arange(0, R0_BLOCK)[None, :]
+    rbase = r0_base
+    x0 = (xindex % 32)
+    x1 = xindex // 32
+    _tmp4 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
+    x3 = xindex
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_2 = r0_index
+        tmp0 = tl.load(in_ptr0 + (r0_2 + 128*x0 + 12288*x1), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp1 = tmp0.to(tl.float32)
+        tmp2 = tmp1 * tmp1
+        tmp3 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK])
+        tmp5 = _tmp4 + tmp3
+        _tmp4 = tl.where(r0_mask, tmp5, _tmp4)
+    tmp4 = tl.sum(_tmp4, 1)[:, None]
+    tl.store(out_ptr0 + (x3), tmp4, None)
+''', device_str='cuda')
+
+
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py
+# Topologically Sorted Source Nodes: [encoder_query, encoder_query_1, query, query_1, query_2], Original ATen: [aten.view, aten._fused_rms_norm, aten.cat]
+# Source node to ATen node mapping:
+#   encoder_query => view_3
+#   encoder_query_1 => add_2, convert_element_type_4, convert_element_type_5, mean_2, mul_4, mul_5, pow_3, rsqrt_2
+#   query => view
+#   query_1 => add, convert_element_type, convert_element_type_1, mean, mul, mul_1, pow_1, rsqrt
+#   query_2 => cat
+# Graph fragment:
+#   %arg5_1 : Tensor "bf16[1, 256, 4096][3145728, 12288, 1]cuda:0" = PlaceHolder[target=arg5_1]
+#   %buf0 : Tensor "f32[1, 256, 32, 1][8192, 32, 1, 8192]cuda:0" = PlaceHolder[target=buf0]
+#   %arg8_1 : Tensor "bf16[128][1]cuda:0" = PlaceHolder[target=arg8_1]
+#   %arg0_1 : Tensor "bf16[1, 2048, 4096][25165824, 12288, 1]cuda:0" = PlaceHolder[target=arg0_1]
+#   %buf1 : Tensor "f32[1, 2048, 32, 1][65536, 32, 1, 65536]cuda:0" = PlaceHolder[target=buf1]
+#   %arg3_1 : Tensor "bf16[128][1]cuda:0" = PlaceHolder[target=arg3_1]
+#   %view_3 : Tensor "bf16[1, 256, 32, 128][3145728, 12288, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%arg5_1, [1, 256, 32, 128]), kwargs = {})
+#   %convert_element_type_4 : Tensor "f32[1, 256, 32, 128][1048576, 4096, 128, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_3, torch.float32), kwargs = {})
+#   %pow_3 : Tensor "f32[1, 256, 32, 128][1048576, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.pow.Tensor_Scalar](args = (%convert_element_type_4, 2), kwargs = {})
+#   %mean_2 : Tensor "f32[1, 256, 32, 1][8192, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mean.dim](args = (%pow_3, [3], True), kwargs = {})
+#   %add_2 : Tensor "f32[1, 256, 32, 1][8192, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Scalar](args = (%mean_2, 1e-06), kwargs = {})
+#   %rsqrt_2 : Tensor "f32[1, 256, 32, 1][8192, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add_2,), kwargs = {})
+#   %mul_4 : Tensor "f32[1, 256, 32, 128][1048576, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_4, %rsqrt_2), kwargs = {})
+#   %mul_5 : Tensor "f32[1, 256, 32, 128][1048576, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_4, %arg8_1), kwargs = {})
+#   %convert_element_type_5 : Tensor "bf16[1, 256, 32, 128][1048576, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_5, torch.bfloat16), kwargs = {})
+#   %view : Tensor "bf16[1, 2048, 32, 128][25165824, 12288, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%arg0_1, [1, 2048, 32, 128]), kwargs = {})
+#   %convert_element_type : Tensor "f32[1, 2048, 32, 128][8388608, 4096, 128, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view, torch.float32), kwargs = {})
+#   %pow_1 : Tensor "f32[1, 2048, 32, 128][8388608, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.pow.Tensor_Scalar](args = (%convert_element_type, 2), kwargs = {})
+#   %mean : Tensor "f32[1, 2048, 32, 1][65536, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mean.dim](args = (%pow_1, [3], True), kwargs = {})
+#   %add : Tensor "f32[1, 2048, 32, 1][65536, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Scalar](args = (%mean, 1e-06), kwargs = {})
+#   %rsqrt : Tensor "f32[1, 2048, 32, 1][65536, 32, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add,), kwargs = {})
+#   %mul : Tensor "f32[1, 2048, 32, 128][8388608, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type, %rsqrt), kwargs = {})
+#   %mul_1 : Tensor "f32[1, 2048, 32, 128][8388608, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul, %arg3_1), kwargs = {})
+#   %convert_element_type_1 : Tensor "bf16[1, 2048, 32, 128][8388608, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_1, torch.bfloat16), kwargs = {})
+#   %cat : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.aten.cat.default](args = ([%convert_element_type_5, %convert_element_type_1], 1), kwargs = {})
+#   return %cat
+triton_poi_fused__fused_rms_norm_cat_view_2 = async_compile.triton('triton_poi_fused__fused_rms_norm_cat_view_2', '''
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'y': 131072, 'x': 128}, tile_hint=TileHint.DEFAULT,
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*fp32', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*fp32', 'in_ptr5': '*bf16', 'out_ptr0': '*bf16', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid2DWithYZOverflow', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__fused_rms_norm_cat_view_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 6, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'y': 589824, 'x': 75497984}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused__fused_rms_norm_cat_view_2(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
+    ynumel = 73728
+    xnumel = 128
+    yoffset = (tl.program_id(1) + tl.program_id(2) * tl.num_programs(1)) * YBLOCK
+    yindex = yoffset + tl.arange(0, YBLOCK)[:, None]
+    ymask = yindex < ynumel
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[None, :]
+    xmask = xindex < xnumel
+    y1 = yindex // 32
+    x2 = xindex
+    y0 = (yindex % 32)
+    y3 = yindex
+    tmp0 = y1
+    tmp1 = tl.full([1, 1], 0, tl.int64)
+    tmp2 = tmp0 >= tmp1
+    tmp3 = tl.full([1, 1], 256, tl.int64)
+    tmp4 = tmp0 < tmp3
+    tmp5 = tl.load(in_ptr0 + (x2 + 128*y0 + 12288*(y1)), tmp4 & xmask & ymask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+    tmp6 = tmp5.to(tl.float32)
+    tmp7 = tl.load(in_ptr1 + (tl.broadcast_to(y0 + 32*(y1), [YBLOCK, XBLOCK])), tmp4 & xmask & ymask, eviction_policy='evict_last', other=0.0)
+    tmp8 = 128.0
+    tmp9 = (tmp7 / tmp8)
+    tmp10 = 1e-06
+    tmp11 = tmp9 + tmp10
+    tmp12 = libdevice.rsqrt(tmp11)
+    tmp13 = tmp6 * tmp12
+    tmp14 = tl.load(in_ptr2 + (tl.broadcast_to(x2, [YBLOCK, XBLOCK])), tmp4 & xmask & ymask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+    tmp15 = tmp14.to(tl.float32)
+    tmp16 = tmp13 * tmp15
+    tmp17 = tmp16.to(tl.float32)
+    tmp18 = tl.full(tmp17.shape, 0.0, tmp17.dtype)
+    tmp19 = tl.where(tmp4, tmp17, tmp18)
+    tmp20 = tmp0 >= tmp3
+    tmp21 = tl.full([1, 1], 2304, tl.int64)
+    tmp22 = tmp0 < tmp21
+    tmp23 = tl.load(in_ptr3 + (x2 + 128*y0 + 12288*((-256) + y1)), tmp20 & xmask & ymask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+    tmp24 = tmp23.to(tl.float32)
+    tmp25 = tl.load(in_ptr4 + (tl.broadcast_to(y0 + 32*((-256) + y1), [YBLOCK, XBLOCK])), tmp20 & xmask & ymask, eviction_policy='evict_last', other=0.0)
+    tmp26 = 128.0
+    tmp27 = (tmp25 / tmp26)
+    tmp28 = 1e-06
+    tmp29 = tmp27 + tmp28
+    tmp30 = libdevice.rsqrt(tmp29)
+    tmp31 = tmp24 * tmp30
+    tmp32 = tl.load(in_ptr5 + (tl.broadcast_to(x2, [YBLOCK, XBLOCK])), tmp20 & xmask & ymask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+    tmp33 = tmp32.to(tl.float32)
+    tmp34 = tmp31 * tmp33
+    tmp35 = tmp34.to(tl.float32)
+    tmp36 = tl.full(tmp35.shape, 0.0, tmp35.dtype)
+    tmp37 = tl.where(tmp20, tmp35, tmp36)
+    tmp38 = tl.where(tmp4, tmp19, tmp37)
+    tl.store(out_ptr0 + (x2 + 128*y3), tmp38, xmask & ymask)
+''', device_str='cuda')
+
+
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py
+# Topologically Sorted Source Nodes: [reshape, unbind, reshape_1, unbind_1, float_1, cos, mul, neg, stack, x_rotated, float_2, sin, mul_1, add, out, float_3, cos_2, mul_2, neg_1, stack_1, x_rotated_1, float_4, sin_2, mul_3, add_1, out_1], Original ATen: [aten.view, aten.unbind, aten._to_copy, aten.unsqueeze, aten.mul, aten.neg, aten.stack, aten.add]
+# Source node to ATen node mapping:
+#   add => add_4
+#   add_1 => add_5
+#   cos => unsqueeze, unsqueeze_1
+#   cos_2 => unsqueeze_6, unsqueeze_7
+#   float_1 => convert_element_type_8
+#   float_2 => convert_element_type_9
+#   float_3 => convert_element_type_11
+#   float_4 => convert_element_type_12
+#   mul => mul_8
+#   mul_1 => mul_9
+#   mul_2 => mul_10
+#   mul_3 => mul_11
+#   neg => neg
+#   neg_1 => neg_1
+#   out => convert_element_type_10
+#   out_1 => convert_element_type_13
+#   reshape => view_6
+#   reshape_1 => view_8
+#   sin => unsqueeze_2, unsqueeze_3
+#   sin_2 => unsqueeze_8, unsqueeze_9
+#   stack => cat_3, unsqueeze_4, unsqueeze_5
+#   stack_1 => cat_4, unsqueeze_10, unsqueeze_11
+#   unbind => unbind
+#   unbind_1 => unbind_1
+#   x_rotated => view_7
+#   x_rotated_1 => view_9
+# Graph fragment:
+#   %cat : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0" = PlaceHolder[target=cat]
+#   %arg10_1 : Tensor "f32[2304, 128][128, 1]cuda:0" = PlaceHolder[target=arg10_1]
+#   %arg11_1 : Tensor "f32[2304, 128][128, 1]cuda:0" = PlaceHolder[target=arg11_1]
+#   %cat_1 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0" = PlaceHolder[target=cat_1]
+#   %view_6 : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat, [1, 2304, 32, -1, 2]), kwargs = {})
+#   %unbind : [num_users=2] = call_function[target=torch.ops.aten.unbind.int](args = (%view_6, -1), kwargs = {})
+#   %view_8 : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat_1, [1, 2304, 32, -1, 2]), kwargs = {})
+#   %unbind_1 : [num_users=2] = call_function[target=torch.ops.aten.unbind.int](args = (%view_8, -1), kwargs = {})
+#   %convert_element_type_8 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%cat, torch.float32), kwargs = {})
+#   %unsqueeze : Tensor "f32[1, 2304, 128][294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%arg10_1, 0), kwargs = {})
+#   %unsqueeze_1 : Tensor "f32[1, 2304, 1, 128][294912, 128, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze, 2), kwargs = {})
+#   %mul_8 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_8, %unsqueeze_1), kwargs = {})
+#   %neg : Tensor "bf16[1, 2304, 32, 64][4718592, 2048, 64, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.neg.default](args = (%getitem_1,), kwargs = {})
+#   %unsqueeze_4 : Tensor "bf16[1, 2304, 32, 64, 1][4718592, 2048, 64, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%neg, 4), kwargs = {})
+#   %unsqueeze_5 : Tensor "bf16[1, 2304, 32, 64, 1][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%getitem, 4), kwargs = {})
+#   %cat_3 : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%unsqueeze_4, %unsqueeze_5], -1), kwargs = {})
+#   %view_7 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat_3, [1, 2304, 32, 128]), kwargs = {})
+#   %convert_element_type_9 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_7, torch.float32), kwargs = {})
+#   %unsqueeze_2 : Tensor "f32[1, 2304, 128][294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%arg11_1, 0), kwargs = {})
+#   %unsqueeze_3 : Tensor "f32[1, 2304, 1, 128][294912, 128, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze_2, 2), kwargs = {})
+#   %mul_9 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_9, %unsqueeze_3), kwargs = {})
+#   %add_4 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_8, %mul_9), kwargs = {})
+#   %convert_element_type_10 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%add_4, torch.bfloat16), kwargs = {})
+#   %convert_element_type_11 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%cat_1, torch.float32), kwargs = {})
+#   %unsqueeze_6 : Tensor "f32[1, 2304, 128][294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%arg10_1, 0), kwargs = {})
+#   %unsqueeze_7 : Tensor "f32[1, 2304, 1, 128][294912, 128, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze_6, 2), kwargs = {})
+#   %mul_10 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_11, %unsqueeze_7), kwargs = {})
+#   %neg_1 : Tensor "bf16[1, 2304, 32, 64][4718592, 2048, 64, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.neg.default](args = (%getitem_3,), kwargs = {})
+#   %unsqueeze_10 : Tensor "bf16[1, 2304, 32, 64, 1][4718592, 2048, 64, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%neg_1, 4), kwargs = {})
+#   %unsqueeze_11 : Tensor "bf16[1, 2304, 32, 64, 1][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%getitem_2, 4), kwargs = {})
+#   %cat_4 : Tensor "bf16[1, 2304, 32, 64, 2][9437184, 4096, 128, 2, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%unsqueeze_10, %unsqueeze_11], -1), kwargs = {})
+#   %view_9 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat_4, [1, 2304, 32, 128]), kwargs = {})
+#   %convert_element_type_12 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_9, torch.float32), kwargs = {})
+#   %unsqueeze_8 : Tensor "f32[1, 2304, 128][294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%arg11_1, 0), kwargs = {})
+#   %unsqueeze_9 : Tensor "f32[1, 2304, 1, 128][294912, 128, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze_8, 2), kwargs = {})
+#   %mul_11 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_12, %unsqueeze_9), kwargs = {})
+#   %add_5 : Tensor "f32[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_10, %mul_11), kwargs = {})
+#   %convert_element_type_13 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%add_5, torch.bfloat16), kwargs = {})
+#   return %convert_element_type_10,%convert_element_type_13
+triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3 = async_compile.triton('triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3', '''
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 16777216}, 
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'out_ptr0': '*bf16', 'out_ptr1': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 8, 'num_store': 2, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 115605504}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, xnumel, XBLOCK : tl.constexpr):
+    xnumel = 9437184
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)
+    x3 = xindex
+    x0 = (xindex % 128)
+    x2 = xindex // 4096
+    x4 = xindex // 128
+    tmp0 = tl.load(in_ptr0 + (x3), None).to(tl.float32)
+    tmp2 = tl.load(in_ptr1 + (x0 + 128*x2), None, eviction_policy='evict_last')
+    tmp19 = tl.load(in_ptr2 + (x0 + 128*x2), None, eviction_policy='evict_last')
+    tmp23 = tl.load(in_ptr3 + (x3), None).to(tl.float32)
+    tmp1 = tmp0.to(tl.float32)
+    tmp3 = tmp1 * tmp2
+    tmp4 = (x3 % 2)
+    tmp5 = tl.full([1], 0, tl.int64)
+    tmp6 = tmp4 >= tmp5
+    tmp7 = tl.full([1], 1, tl.int64)
+    tmp8 = tmp4 < tmp7
+    tmp9 = tl.load(in_ptr0 + (1 + 2*(x0 // 2) + 128*x4), tmp8, eviction_policy='evict_last', other=0.0).to(tl.float32)
+    tmp10 = -tmp9
+    tmp11 = tl.full(tmp10.shape, 0.0, tmp10.dtype)
+    tmp12 = tl.where(tmp8, tmp10, tmp11)
+    tmp13 = tmp4 >= tmp7
+    tmp14 = tl.full([1], 2, tl.int64)
+    tmp15 = tmp4 < tmp14
+    tmp16 = tl.load(in_ptr0 + (2*(x0 // 2) + 128*x4), tmp13, eviction_policy='evict_last', other=0.0).to(tl.float32)
+    tmp17 = tl.where(tmp8, tmp12, tmp16)
+    tmp18 = tmp17.to(tl.float32)
+    tmp20 = tmp18 * tmp19
+    tmp21 = tmp3 + tmp20
+    tmp22 = tmp21.to(tl.float32)
+    tmp24 = tmp23.to(tl.float32)
+    tmp25 = tmp24 * tmp2
+    tmp26 = tl.load(in_ptr3 + (1 + 2*(x0 // 2) + 128*x4), tmp8, eviction_policy='evict_last', other=0.0).to(tl.float32)
+    tmp27 = -tmp26
+    tmp28 = tl.full(tmp27.shape, 0.0, tmp27.dtype)
+    tmp29 = tl.where(tmp8, tmp27, tmp28)
+    tmp30 = tl.load(in_ptr3 + (2*(x0 // 2) + 128*x4), tmp13, eviction_policy='evict_last', other=0.0).to(tl.float32)
+    tmp31 = tl.where(tmp8, tmp29, tmp30)
+    tmp32 = tmp31.to(tl.float32)
+    tmp33 = tmp32 * tmp19
+    tmp34 = tmp25 + tmp33
+    tmp35 = tmp34.to(tl.float32)
+    tl.store(out_ptr0 + (x3), tmp22, None)
+    tl.store(out_ptr1 + (x3), tmp35, None)
+''', device_str='cuda')
+
+
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py
+# Topologically Sorted Source Nodes: [encoder_value, value, value_1], Original ATen: [aten.view, aten.cat]
+# Source node to ATen node mapping:
+#   encoder_value => view_5
+#   value => view_2
+#   value_1 => cat_2
+# Graph fragment:
+#   %arg7_1 : Tensor "bf16[1, 256, 4096][3145728, 12288, 1]cuda:0" = PlaceHolder[target=arg7_1]
+#   %arg2_1 : Tensor "bf16[1, 2048, 4096][25165824, 12288, 1]cuda:0" = PlaceHolder[target=arg2_1]
+#   %view_5 : Tensor "bf16[1, 256, 32, 128][3145728, 12288, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%arg7_1, [1, 256, 32, 128]), kwargs = {})
+#   %view_2 : Tensor "bf16[1, 2048, 32, 128][25165824, 12288, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%arg2_1, [1, 2048, 32, 128]), kwargs = {})
+#   %cat_2 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%view_5, %view_2], 1), kwargs = {})
+#   return %cat_2
+triton_poi_fused_cat_view_4 = async_compile.triton('triton_poi_fused_cat_view_4', '''
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 16777216}, 
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_cat_view_4', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 2, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 75497472}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_cat_view_4(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+    xnumel = 9437184
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)
+    x1 = xindex // 4096
+    x0 = (xindex % 4096)
+    x2 = xindex
+    tmp0 = x1
+    tmp1 = tl.full([1], 0, tl.int64)
+    tmp2 = tmp0 >= tmp1
+    tmp3 = tl.full([1], 256, tl.int64)
+    tmp4 = tmp0 < tmp3
+    tmp5 = tl.load(in_ptr0 + (x0 + 12288*(x1)), tmp4, other=0.0).to(tl.float32)
+    tmp6 = tmp0 >= tmp3
+    tmp7 = tl.full([1], 2304, tl.int64)
+    tmp8 = tmp0 < tmp7
+    tmp9 = tl.load(in_ptr1 + (x0 + 12288*((-256) + x1)), tmp6, other=0.0).to(tl.float32)
+    tmp10 = tl.where(tmp4, tmp5, tmp9)
+    tl.store(out_ptr0 + (x2), tmp10, None)
+''', device_str='cuda')
+
+
+async_compile.wait(globals())
+del async_compile
+
+class Runner:
+    def __init__(self, partitions):
+        self.partitions = partitions
+
+    def recursively_apply_fns(self, fns):
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):
+        arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1 = args
+        args.clear()
+        assert_size_stride(arg0_1, (1, 2048, 4096), (25165824, 12288, 1))
+        assert_size_stride(arg1_1, (1, 2048, 4096), (25165824, 12288, 1))
+        assert_size_stride(arg2_1, (1, 2048, 4096), (25165824, 12288, 1))
+        assert_size_stride(arg3_1, (128, ), (1, ))
+        assert_size_stride(arg4_1, (128, ), (1, ))
+        assert_size_stride(arg5_1, (1, 256, 4096), (3145728, 12288, 1))
+        assert_size_stride(arg6_1, (1, 256, 4096), (3145728, 12288, 1))
+        assert_size_stride(arg7_1, (1, 256, 4096), (3145728, 12288, 1))
+        assert_size_stride(arg8_1, (128, ), (1, ))
+        assert_size_stride(arg9_1, (128, ), (1, ))
+        assert_size_stride(arg10_1, (2304, 128), (128, 1))
+        assert_size_stride(arg11_1, (2304, 128), (128, 1))
+        with torch.cuda._DeviceGuard(0):
+            torch.cuda.set_device(0)
+            buf0 = empty_strided_cuda((1, 256, 32, 1), (8192, 32, 1, 8192), torch.float32)
+            # Topologically Sorted Source Nodes: [encoder_query, encoder_query_1], Original ATen: [aten.view, aten._fused_rms_norm]
+            stream0 = get_raw_stream(0)
+            triton_red_fused__fused_rms_norm_view_0.run(arg5_1, buf0, 8192, 128, stream=stream0)
+            buf1 = empty_strided_cuda((1, 2048, 32, 1), (65536, 32, 1, 65536), torch.float32)
+            # Topologically Sorted Source Nodes: [query, query_1], Original ATen: [aten.view, aten._fused_rms_norm]
+            stream0 = get_raw_stream(0)
+            triton_red_fused__fused_rms_norm_view_1.run(arg0_1, buf1, 65536, 128, stream=stream0)
+            buf2 = empty_strided_cuda((1, 2304, 32, 128), (9437184, 4096, 128, 1), torch.bfloat16)
+            # Topologically Sorted Source Nodes: [encoder_query, encoder_query_1, query, query_1, query_2], Original ATen: [aten.view, aten._fused_rms_norm, aten.cat]
+            stream0 = get_raw_stream(0)
+            triton_poi_fused__fused_rms_norm_cat_view_2.run(arg5_1, buf0, arg8_1, arg0_1, buf1, arg3_1, buf2, 73728, 128, stream=stream0)
+            del arg0_1
+            del arg3_1
+            del arg5_1
+            del arg8_1
+            buf3 = buf0; del buf0  # reuse
+            # Topologically Sorted Source Nodes: [encoder_key, encoder_key_1], Original ATen: [aten.view, aten._fused_rms_norm]
+            stream0 = get_raw_stream(0)
+            triton_red_fused__fused_rms_norm_view_0.run(arg6_1, buf3, 8192, 128, stream=stream0)
+            buf4 = buf1; del buf1  # reuse
+            # Topologically Sorted Source Nodes: [key, key_1], Original ATen: [aten.view, aten._fused_rms_norm]
+            stream0 = get_raw_stream(0)
+            triton_red_fused__fused_rms_norm_view_1.run(arg1_1, buf4, 65536, 128, stream=stream0)
+            buf5 = empty_strided_cuda((1, 2304, 32, 128), (9437184, 4096, 128, 1), torch.bfloat16)
+            # Topologically Sorted Source Nodes: [encoder_key, encoder_key_1, key, key_1, key_2], Original ATen: [aten.view, aten._fused_rms_norm, aten.cat]
+            stream0 = get_raw_stream(0)
+            triton_poi_fused__fused_rms_norm_cat_view_2.run(arg6_1, buf3, arg9_1, arg1_1, buf4, arg4_1, buf5, 73728, 128, stream=stream0)
+            del arg1_1
+            del arg4_1
+            del arg6_1
+            del arg9_1
+            del buf3
+            del buf4
+            buf6 = empty_strided_cuda((1, 2304, 32, 128), (9437184, 4096, 128, 1), torch.bfloat16)
+            buf7 = empty_strided_cuda((1, 2304, 32, 128), (9437184, 4096, 128, 1), torch.bfloat16)
+            # Topologically Sorted Source Nodes: [reshape, unbind, reshape_1, unbind_1, float_1, cos, mul, neg, stack, x_rotated, float_2, sin, mul_1, add, out, float_3, cos_2, mul_2, neg_1, stack_1, x_rotated_1, float_4, sin_2, mul_3, add_1, out_1], Original ATen: [aten.view, aten.unbind, aten._to_copy, aten.unsqueeze, aten.mul, aten.neg, aten.stack, aten.add]
+            stream0 = get_raw_stream(0)
+            triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.run(buf2, arg10_1, arg11_1, buf5, buf6, buf7, 9437184, stream=stream0)
+            del arg10_1
+            del arg11_1
+            del buf2
+            buf8 = buf5; del buf5  # reuse
+            # Topologically Sorted Source Nodes: [encoder_value, value, value_1], Original ATen: [aten.view, aten.cat]
+            stream0 = get_raw_stream(0)
+            triton_poi_fused_cat_view_4.run(arg7_1, arg2_1, buf8, 9437184, stream=stream0)
+            del arg2_1
+            del arg7_1
+        return (buf6, buf7, buf8, )
+
+runner = Runner(partitions=[])
+call = runner.call
+recursively_apply_fns = runner.recursively_apply_fns
+
+
+def benchmark_compiled_module(times=10, repeat=10):
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = rand_strided((1, 2048, 4096), (25165824, 12288, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg1_1 = rand_strided((1, 2048, 4096), (25165824, 12288, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg2_1 = rand_strided((1, 2048, 4096), (25165824, 12288, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg3_1 = rand_strided((128, ), (1, ), device='cuda:0', dtype=torch.bfloat16)
+    arg4_1 = rand_strided((128, ), (1, ), device='cuda:0', dtype=torch.bfloat16)
+    arg5_1 = rand_strided((1, 256, 4096), (3145728, 12288, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg6_1 = rand_strided((1, 256, 4096), (3145728, 12288, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg7_1 = rand_strided((1, 256, 4096), (3145728, 12288, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg8_1 = rand_strided((128, ), (1, ), device='cuda:0', dtype=torch.bfloat16)
+    arg9_1 = rand_strided((128, ), (1, ), device='cuda:0', dtype=torch.bfloat16)
+    arg10_1 = rand_strided((2304, 128), (128, 1), device='cuda:0', dtype=torch.float32)
+    arg11_1 = rand_strided((2304, 128), (128, 1), device='cuda:0', dtype=torch.float32)
+    fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1])
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)
diff --git a/torchinductor/qi/43ec9cea2ff86c63c13b7d3cc01865c162876223bb0311f7a189de47919626e8.best_config b/torchinductor/qi/43ec9cea2ff86c63c13b7d3cc01865c162876223bb0311f7a189de47919626e8.best_config
new file mode 100644
index 0000000000000000000000000000000000000000..8ea64e593bba4bd1abf52810c7324083f8b86ade
--- /dev/null
+++ b/torchinductor/qi/43ec9cea2ff86c63c13b7d3cc01865c162876223bb0311f7a189de47919626e8.best_config
@@ -0,0 +1 @@
+{"XBLOCK": 64, "R0_BLOCK": 64, "num_warps": 16, "num_stages": 1, "configs_hash": "6ffa43f2ca8cb1499f3ff3fbf8c975f2c07eef9b57fcecda113029ab12cbef66", "found_by_coordesc": false, "time_taken_ms": 116, "triton_cache_hash": "7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA"}
\ No newline at end of file
diff --git a/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py b/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py
new file mode 100644
index 0000000000000000000000000000000000000000..fabc882f8d8b27e7850e810b8645af4c260c6187
--- /dev/null
+++ b/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py
@@ -0,0 +1,45 @@
+
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.reduction(
+    size_hints={'x': 65536, 'r0_': 128},
+    reduction_hint=ReductionHint.DEFAULT,
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused__fused_rms_norm_view_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 1, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 524288, 'r0_': 16777216}}
+)
+@triton.jit
+def triton_red_fused__fused_rms_norm_view_1(in_ptr0, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):  # writes, for every x index, the sum of squares of 128 input elements
+    xnumel = 65536  # overwrites the runtime argument with a fixed count
+    r0_numel = 128  # overwrites the runtime argument with a fixed reduction length
+    rnumel = r0_numel
+    RBLOCK: tl.constexpr = R0_BLOCK
+    xoffset = tl.program_id(0) * XBLOCK  # each program handles XBLOCK consecutive x indices
+    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]  # column vector, shape [XBLOCK, 1]
+    xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)  # all-True; not referenced again in this kernel
+    r0_base = tl.arange(0, R0_BLOCK)[None, :]  # row vector, shape [1, R0_BLOCK]
+    rbase = r0_base
+    x0 = (xindex % 32)  # position inside a group of 32 consecutive x indices
+    x1 = xindex // 32  # which group of 32
+    _tmp4 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)  # float32 accumulator tile, starts at zero
+    x3 = xindex
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):  # walk the reduction axis in R0_BLOCK-sized steps
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel  # guards the last step when R0_BLOCK does not divide r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_2 = r0_index
+        tmp0 = tl.load(in_ptr0 + (r0_2 + 128*x0 + 12288*x1), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)  # 128 contiguous elements per x; x0 steps by 128, x1 steps by 12288
+        tmp1 = tmp0.to(tl.float32)
+        tmp2 = tmp1 * tmp1  # square
+        tmp3 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK])
+        tmp5 = _tmp4 + tmp3
+        _tmp4 = tl.where(r0_mask, tmp5, _tmp4)  # lanes outside r0_numel keep their previous value
+    tmp4 = tl.sum(_tmp4, 1)[:, None]  # reduce along axis 1, leaving one value per x index
+    tl.store(out_ptr0 + (x3), tmp4, None)  # stored with no mask on x
diff --git a/torchinductor/qo/cqo2iwjnshnag7tcsvqf4zqyfi65lsg3hg6ckh7lrd3p4rff3m4j.py b/torchinductor/qo/cqo2iwjnshnag7tcsvqf4zqyfi65lsg3hg6ckh7lrd3p4rff3m4j.py
new file mode 100644
index 0000000000000000000000000000000000000000..adf40a9f9b663a683b7992b0140c2d72b433da97
--- /dev/null
+++ b/torchinductor/qo/cqo2iwjnshnag7tcsvqf4zqyfi65lsg3hg6ckh7lrd3p4rff3m4j.py
@@ -0,0 +1,129 @@
+# AOT ID: ['16_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+import triton
+import triton.language as tl
+from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+
+aten = torch.ops.aten
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py
+# Topologically Sorted Source Nodes: [chunk, silu, x], Original ATen: [aten.split, aten.silu, aten.mul]
+# Source node to ATen node mapping:
+#   chunk => split
+#   silu => convert_element_type, convert_element_type_1, mul, sigmoid
+#   x => mul_1
+# Graph fragment:
+#   %arg0_1 : Tensor "bf16[1, 2048, 24576][50331648, 24576, 1]cuda:0" = PlaceHolder[target=arg0_1]
+#   %split : [num_users=2] = call_function[target=torch.ops.aten.split.Tensor](args = (%arg0_1, 12288, -1), kwargs = {})
+#   %convert_element_type : Tensor "f32[1, 2048, 12288][25165824, 12288, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%getitem, torch.float32), kwargs = {})
+#   %sigmoid : Tensor "f32[1, 2048, 12288][25165824, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type,), kwargs = {})
+#   %mul : Tensor "f32[1, 2048, 12288][25165824, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type, %sigmoid), kwargs = {})
+#   %convert_element_type_1 : Tensor "bf16[1, 2048, 12288][25165824, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul, torch.bfloat16), kwargs = {})
+#   %mul_1 : Tensor "bf16[1, 2048, 12288][25165824, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_1, %getitem_1), kwargs = {})
+#   return %mul_1
+triton_poi_fused_mul_silu_split_0 = async_compile.triton('triton_poi_fused_mul_silu_split_0', '''
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 33554432}, 
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_split_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 2, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 201326592}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_mul_silu_split_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+    xnumel = 25165824
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)
+    x0 = (xindex % 12288)
+    x1 = xindex // 12288
+    x2 = xindex
+    tmp0 = tl.load(in_ptr0 + (x0 + 24576*x1), None).to(tl.float32)
+    tmp5 = tl.load(in_ptr0 + (12288 + x0 + 24576*x1), None).to(tl.float32)
+    tmp1 = tmp0.to(tl.float32)
+    tmp2 = tl.sigmoid(tmp1)
+    tmp3 = tmp1 * tmp2
+    tmp4 = tmp3.to(tl.float32)
+    tmp6 = tmp4 * tmp5
+    tl.store(out_ptr0 + (x2), tmp6, None)
+''', device_str='cuda')
+
+
+async_compile.wait(globals())
+del async_compile
+
+class Runner:  # wrapper object: keeps a partitions list and provides the graph entry point call()
+    def __init__(self, partitions):
+        self.partitions = partitions
+
+    def recursively_apply_fns(self, fns):  # replace each partition with fn(partition), pairing by position
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):  # zip stops at the shorter of the two sequences
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):  # args: list holding exactly one input; returns a 1-tuple with the output buffer
+        arg0_1, = args
+        args.clear()  # empties the caller's list in place
+        assert_size_stride(arg0_1, (1, 2048, 24576), (50331648, 24576, 1))
+        with torch.cuda._DeviceGuard(0):
+            torch.cuda.set_device(0)
+            buf0 = empty_strided_cuda((1, 2048, 12288), (25165824, 12288, 1), torch.bfloat16)  # last dimension is half of the input's 24576
+            # Topologically Sorted Source Nodes: [chunk, silu, x], Original ATen: [aten.split, aten.silu, aten.mul]
+            stream0 = get_raw_stream(0)
+            triton_poi_fused_mul_silu_split_0.run(arg0_1, buf0, 25165824, stream=stream0)  # 25165824 = 2048 * 12288, the element count of buf0
+            del arg0_1  # drop the local reference to the input
+        return (buf0, )
+
+runner = Runner(partitions=[])
+call = runner.call
+recursively_apply_fns = runner.recursively_apply_fns
+
+
+def benchmark_compiled_module(times=10, repeat=10):  # times call() on one random input; times/repeat are forwarded to print_performance
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = rand_strided((1, 2048, 24576), (50331648, 24576, 1), device='cuda:0', dtype=torch.bfloat16)  # same size and stride that call() asserts
+    fn = lambda: call([arg0_1])  # a new list per invocation, because call() clears the list it receives
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":  # only runs when this generated module is executed as a script
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)  # first argument is the literal string 'None', not the None object
diff --git a/torchinductor/r4/6028a425a75f0721e0fdad1d9db266ea37b002463e12788ce8a68ad2891bcf49.best_config b/torchinductor/r4/6028a425a75f0721e0fdad1d9db266ea37b002463e12788ce8a68ad2891bcf49.best_config
new file mode 100644
index 0000000000000000000000000000000000000000..34767cd30c8472ccb4470e1cbfe06f14f75fa580
--- /dev/null
+++ b/torchinductor/r4/6028a425a75f0721e0fdad1d9db266ea37b002463e12788ce8a68ad2891bcf49.best_config
@@ -0,0 +1 @@
+{"XBLOCK": 512, "num_warps": 8, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 43, "triton_cache_hash": "X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ"}
\ No newline at end of file
diff --git a/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py b/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ea2f5a5befa39b1ed1cc85d2a4d3ff6a5ef5dfc
--- /dev/null
+++ b/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py
@@ -0,0 +1,28 @@
+
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 16777216}, 
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_clone_permute_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 56623104}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_clone_permute_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):  # copies in_ptr0 to out_ptr0 with two index axes exchanged
+    xnumel = 9437184  # overwrites the runtime argument; 9437184 = 2304 * 32 * 128
+    xoffset = tl.program_id(0) * XBLOCK  # each program handles XBLOCK consecutive output elements
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)  # all-True; not referenced again in this kernel
+    x0 = (xindex % 128)  # fastest-varying output index, range 0..127
+    x1 = ((xindex // 128) % 32)  # middle output index, range 0..31
+    x2 = xindex // 4096  # slowest output index (4096 = 32 * 128)
+    x3 = xindex
+    tmp0 = tl.load(in_ptr0 + (x0 + 128*x2 + 294912*x1), None).to(tl.float32)  # on the input side x2 steps by 128 and x1 by 294912 (= 2304 * 128)
+    tl.store(out_ptr0 + (x3), tmp0, None)  # contiguous write, no mask
diff --git a/torchinductor/s7/cs7ssipzow4rpjc35xc3dleet56e5bkdlxrbyh5qbhp6vzfhfqpg.py b/torchinductor/s7/cs7ssipzow4rpjc35xc3dleet56e5bkdlxrbyh5qbhp6vzfhfqpg.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b290100d32b9fe6dda9b7350dbaefcf4b163001
--- /dev/null
+++ b/torchinductor/s7/cs7ssipzow4rpjc35xc3dleet56e5bkdlxrbyh5qbhp6vzfhfqpg.py
@@ -0,0 +1,69 @@
+# AOT ID: ['19_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+
+aten = torch.ops.aten
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+async_compile.wait(globals())
+del async_compile
+
+class Runner:  # wrapper object: keeps a partitions list and provides the graph entry point call()
+    def __init__(self, partitions):
+        self.partitions = partitions
+
+    def recursively_apply_fns(self, fns):  # replace each partition with fn(partition), pairing by position
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):  # zip stops at the shorter of the two sequences
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):  # args: list holding exactly two inputs; returns a 2-tuple
+        arg0_1, arg1_1 = args
+        args.clear()  # empties the caller's list in place
+        assert_size_stride(arg0_1, (12288, 4096), (1, 12288))
+        assert_size_stride(arg1_1, (1, 1), (1, 1))
+        return (aten.view.dtype(reinterpret_tensor(arg0_1, (4096, 12288), (12288, 1), 0), torch.uint8), reinterpret_tensor(arg1_1, (1, ), (1, ), 0), )  # allocates nothing and runs no Triton kernel: arg0_1 is re-described as (4096, 12288) with unit inner stride and viewed as uint8; arg1_1 is re-described as shape (1,)
+
+runner = Runner(partitions=[])
+call = runner.call
+recursively_apply_fns = runner.recursively_apply_fns
+
+
+def benchmark_compiled_module(times=10, repeat=10):  # times call() on random inputs; times/repeat are forwarded to print_performance
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = rand_strided((12288, 4096), (1, 12288), device='cuda:0', dtype=torch.float8_e4m3fn)  # same size and stride that call() asserts; unit stride is on the first dimension
+    arg1_1 = rand_strided((1, 1), (1, 1), device='cuda:0', dtype=torch.float32)
+    fn = lambda: call([arg0_1, arg1_1])  # a new list per invocation, because call() clears the list it receives
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":  # only runs when this generated module is executed as a script
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)  # first argument is the literal string 'None', not the None object
diff --git a/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py b/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py
new file mode 100644
index 0000000000000000000000000000000000000000..701e65bf90b13cc0feabb549526bd21e1c720221
--- /dev/null
+++ b/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py
@@ -0,0 +1,32 @@
+
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 4194304}, 
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_split_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 2, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_mul_silu_split_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):  # out = (a * sigmoid(a)) * b, where a and b are the two 12288-wide halves of each 24576-wide input row
+    xoffset = tl.program_id(0) * XBLOCK  # each program handles XBLOCK consecutive output elements
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)  # all-True and unused; xnumel is never read either. NOTE(review): presumably the launch covers exactly xnumel elements - confirm
+    x0 = (xindex % 12288)  # column within the output row
+    x1 = xindex // 12288  # output row
+    x2 = xindex
+    tmp0 = tl.load(in_ptr0 + (x0 + 24576*x1), None).to(tl.float32)  # first half of the input row
+    tmp5 = tl.load(in_ptr0 + (12288 + x0 + 24576*x1), None).to(tl.float32)  # second half of the same input row
+    tmp1 = tmp0.to(tl.float32)
+    tmp2 = tl.sigmoid(tmp1)
+    tmp3 = tmp1 * tmp2  # a * sigmoid(a)
+    tmp4 = tmp3.to(tl.float32)
+    tmp6 = tmp4 * tmp5  # multiply by the second half
+    tl.store(out_ptr0 + (x2), tmp6, None)  # contiguous write, no mask
diff --git a/torchinductor/sy/fab59b692a7cc2d74a6b7044a7f84ec2ff8e1f7b3f0b76089865ebc72c0cdb5e.best_config b/torchinductor/sy/fab59b692a7cc2d74a6b7044a7f84ec2ff8e1f7b3f0b76089865ebc72c0cdb5e.best_config
new file mode 100644
index 0000000000000000000000000000000000000000..95a3dc12e5ccc24885a84327e17db66fc3f7e791
--- /dev/null
+++ b/torchinductor/sy/fab59b692a7cc2d74a6b7044a7f84ec2ff8e1f7b3f0b76089865ebc72c0cdb5e.best_config
@@ -0,0 +1 @@
+{"XBLOCK": 512, "num_warps": 8, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 38, "triton_cache_hash": "THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA"}
\ No newline at end of file
diff --git a/torchinductor/tu/ctulszk4mmiqip66rpuyd3jhbqxphwudytcgr5dwumqowwaq7mtb.py b/torchinductor/tu/ctulszk4mmiqip66rpuyd3jhbqxphwudytcgr5dwumqowwaq7mtb.py
new file mode 100644
index 0000000000000000000000000000000000000000..626bb27ff8f752f0961d5ea2053d46128fff9c31
--- /dev/null
+++ b/torchinductor/tu/ctulszk4mmiqip66rpuyd3jhbqxphwudytcgr5dwumqowwaq7mtb.py
@@ -0,0 +1,69 @@
+# AOT ID: ['28_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+
+aten = torch.ops.aten
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+async_compile.wait(globals())
+del async_compile
+
+class Runner:  # wrapper object: keeps a partitions list and provides the graph entry point call()
+    def __init__(self, partitions):
+        self.partitions = partitions
+
+    def recursively_apply_fns(self, fns):  # replace each partition with fn(partition), pairing by position
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):  # zip stops at the shorter of the two sequences
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):  # args: list holding exactly two inputs; returns a 2-tuple
+        arg0_1, arg1_1 = args
+        args.clear()  # empties the caller's list in place
+        assert_size_stride(arg0_1, (16384, 4096), (1, 16384))
+        assert_size_stride(arg1_1, (1, 1), (1, 1))
+        return (aten.view.dtype(reinterpret_tensor(arg0_1, (4096, 16384), (16384, 1), 0), torch.uint8), reinterpret_tensor(arg1_1, (1, ), (1, ), 0), )  # allocates nothing and runs no Triton kernel: arg0_1 is re-described as (4096, 16384) with unit inner stride and viewed as uint8; arg1_1 is re-described as shape (1,)
+
+runner = Runner(partitions=[])
+call = runner.call
+recursively_apply_fns = runner.recursively_apply_fns
+
+
+def benchmark_compiled_module(times=10, repeat=10):  # times call() on random inputs; times/repeat are forwarded to print_performance
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = rand_strided((16384, 4096), (1, 16384), device='cuda:0', dtype=torch.float8_e4m3fn)  # same size and stride that call() asserts; unit stride is on the first dimension
+    arg1_1 = rand_strided((1, 1), (1, 1), device='cuda:0', dtype=torch.float32)
+    fn = lambda: call([arg0_1, arg1_1])  # a new list per invocation, because call() clears the list it receives
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":  # only runs when this generated module is executed as a script
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)  # first argument is the literal string 'None', not the None object
diff --git a/torchinductor/uv/cuviukt3lq3axnbe5hpjzwmyxahv35wielkqnmdck4fg7m2bd6kz.py b/torchinductor/uv/cuviukt3lq3axnbe5hpjzwmyxahv35wielkqnmdck4fg7m2bd6kz.py
new file mode 100644
index 0000000000000000000000000000000000000000..77b9afb61558a36ce68d395d1c84d0f67789e1a6
--- /dev/null
+++ b/torchinductor/uv/cuviukt3lq3axnbe5hpjzwmyxahv35wielkqnmdck4fg7m2bd6kz.py
@@ -0,0 +1,69 @@
+# AOT ID: ['15_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+
+aten = torch.ops.aten
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+async_compile.wait(globals())
+del async_compile
+
+class Runner:  # wrapper object: keeps a partitions list and provides the graph entry point call()
+    def __init__(self, partitions):
+        self.partitions = partitions
+
+    def recursively_apply_fns(self, fns):  # replace each partition with fn(partition), pairing by position
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):  # zip stops at the shorter of the two sequences
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):  # args: list holding exactly two inputs; returns a 2-tuple
+        arg0_1, arg1_1 = args
+        args.clear()  # empties the caller's list in place
+        assert_size_stride(arg0_1, (4096, 24576), (1, 4096))
+        assert_size_stride(arg1_1, (1, 1), (1, 1))
+        return (aten.view.dtype(reinterpret_tensor(arg0_1, (24576, 4096), (4096, 1), 0), torch.uint8), reinterpret_tensor(arg1_1, (1, ), (1, ), 0), )  # allocates nothing and runs no Triton kernel: arg0_1 is re-described as (24576, 4096) with unit inner stride and viewed as uint8; arg1_1 is re-described as shape (1,)
+
+runner = Runner(partitions=[])
+call = runner.call
+recursively_apply_fns = runner.recursively_apply_fns
+
+
+def benchmark_compiled_module(times=10, repeat=10):  # times call() on random inputs; times/repeat are forwarded to print_performance
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = rand_strided((4096, 24576), (1, 4096), device='cuda:0', dtype=torch.float8_e4m3fn)  # same size and stride that call() asserts; unit stride is on the first dimension
+    arg1_1 = rand_strided((1, 1), (1, 1), device='cuda:0', dtype=torch.float32)
+    fn = lambda: call([arg0_1, arg1_1])  # a new list per invocation, because call() clears the list it receives
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":  # only runs when this generated module is executed as a script
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)  # first argument is the literal string 'None', not the None object
diff --git a/torchinductor/vv/cvvhb6mmoaw3lk4z2krosoew2gyomjg7q2lg2omxzpz5kkx6rf2i.py b/torchinductor/vv/cvvhb6mmoaw3lk4z2krosoew2gyomjg7q2lg2omxzpz5kkx6rf2i.py
new file mode 100644
index 0000000000000000000000000000000000000000..be5ddf0fc35056e2564d7fc26a9388cf18f2873f
--- /dev/null
+++ b/torchinductor/vv/cvvhb6mmoaw3lk4z2krosoew2gyomjg7q2lg2omxzpz5kkx6rf2i.py
@@ -0,0 +1,129 @@
+# AOT ID: ['22_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+import triton
+import triton.language as tl
+from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+
+aten = torch.ops.aten
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py
+# Topologically Sorted Source Nodes: [mul, encoder_hidden_states], Original ATen: [aten.mul, aten.add]
+# Source node to ATen node mapping:
+#   encoder_hidden_states => add
+#   mul => mul
+# Graph fragment:
+#   %arg2_1 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0" = PlaceHolder[target=arg2_1]
+#   %arg0_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg0_1]
+#   %arg1_1 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0" = PlaceHolder[target=arg1_1]
+#   %mul : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%arg0_1, %arg1_1), kwargs = {})
+#   %add : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg2_1, %mul), kwargs = {})
+#   return %add
+triton_poi_fused_add_mul_0 = async_compile.triton('triton_poi_fused_add_mul_0', '''
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 1048576}, 
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_mul_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 3, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 8396800}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_add_mul_0(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+    xnumel = 1048576
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)
+    x2 = xindex
+    x0 = (xindex % 4096)
+    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
+    tmp1 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last').to(tl.float32)
+    tmp2 = tl.load(in_ptr2 + (x2), None).to(tl.float32)
+    tmp3 = tmp1 * tmp2
+    tmp4 = tmp0 + tmp3
+    tl.store(out_ptr0 + (x2), tmp4, None)
+''', device_str='cuda')
+
+
+async_compile.wait(globals())
+del async_compile
+
+class Runner:  # wrapper object: keeps a partitions list and provides the graph entry point call()
+    def __init__(self, partitions):
+        self.partitions = partitions
+
+    def recursively_apply_fns(self, fns):  # replace each partition with fn(partition), pairing by position
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):  # zip stops at the shorter of the two sequences
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):  # args: list holding exactly three inputs; returns a 1-tuple with the output buffer
+        arg0_1, arg1_1, arg2_1 = args
+        args.clear()  # empties the caller's list in place
+        assert_size_stride(arg0_1, (1, 1, 4096), (24576, 24576, 1))
+        assert_size_stride(arg1_1, (1, 256, 4096), (1048576, 4096, 1))
+        assert_size_stride(arg2_1, (1, 256, 4096), (1048576, 4096, 1))
+        with torch.cuda._DeviceGuard(0):
+            torch.cuda.set_device(0)
+            buf0 = empty_strided_cuda((1, 256, 4096), (1048576, 4096, 1), torch.bfloat16)  # same size and stride as arg1_1 and arg2_1
+            # Topologically Sorted Source Nodes: [mul, encoder_hidden_states], Original ATen: [aten.mul, aten.add]
+            stream0 = get_raw_stream(0)
+            triton_poi_fused_add_mul_0.run(arg2_1, arg0_1, arg1_1, buf0, 1048576, stream=stream0)  # note the argument order: arg2_1 first, then arg0_1, then arg1_1; 1048576 = 256 * 4096
+            del arg0_1
+            del arg1_1
+            del arg2_1
+        return (buf0, )
+
+runner = Runner(partitions=[])
+call = runner.call
+recursively_apply_fns = runner.recursively_apply_fns
+
+
+def benchmark_compiled_module(times=10, repeat=10):  # times call() on random inputs; times/repeat are forwarded to print_performance
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16)  # same size and stride that call() asserts
+    arg1_1 = rand_strided((1, 256, 4096), (1048576, 4096, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg2_1 = rand_strided((1, 256, 4096), (1048576, 4096, 1), device='cuda:0', dtype=torch.bfloat16)
+    fn = lambda: call([arg0_1, arg1_1, arg2_1])  # a new list per invocation, because call() clears the list it receives
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":  # only runs when this generated module is executed as a script
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)  # first argument is the literal string 'None', not the None object
diff --git a/torchinductor/vw/cvw7uqdyk4vwvsddaezbct3gzsp7n42ycsripavamngotuxavr4d.py b/torchinductor/vw/cvw7uqdyk4vwvsddaezbct3gzsp7n42ycsripavamngotuxavr4d.py
new file mode 100644
index 0000000000000000000000000000000000000000..1619b1c5ffa127cf3d0a084667d70cf0782f5e64
--- /dev/null
+++ b/torchinductor/vw/cvw7uqdyk4vwvsddaezbct3gzsp7n42ycsripavamngotuxavr4d.py
@@ -0,0 +1,197 @@
+# AOT ID: ['9_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+import triton
+import triton.language as tl
+from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+
+aten = torch.ops.aten
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py
+# Topologically Sorted Source Nodes: [permute, q, permute_1, k, permute_2, v, output], Original ATen: [aten.permute, aten.clone, aten._scaled_dot_product_cudnn_attention]
+# Source node to ATen node mapping:
+#   k => clone_1
+#   output => _scaled_dot_product_cudnn_attention
+#   permute => permute
+#   permute_1 => permute_1
+#   permute_2 => permute_2
+#   q => clone
+#   v => clone_2
+# Graph fragment:
+#   %arg0_1 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0" = PlaceHolder[target=arg0_1]
+#   %permute : Tensor "bf16[1, 32, 2304, 128][9437184, 128, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%arg0_1, [0, 2, 1, 3]), kwargs = {})
+#   %clone : Tensor "bf16[1, 32, 2304, 128][9437184, 294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%permute,), kwargs = {memory_format: torch.contiguous_format})
+#   %permute_1 : Tensor "bf16[1, 32, 2304, 128][9437184, 128, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%arg1_1, [0, 2, 1, 3]), kwargs = {})
+#   %clone_1 : Tensor "bf16[1, 32, 2304, 128][9437184, 294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%permute_1,), kwargs = {memory_format: torch.contiguous_format})
+#   %permute_2 : Tensor "bf16[1, 32, 2304, 128][9437184, 128, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%arg2_1, [0, 2, 1, 3]), kwargs = {})
+#   %clone_2 : Tensor "bf16[1, 32, 2304, 128][9437184, 294912, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%permute_2,), kwargs = {memory_format: torch.contiguous_format})
+#   %_scaled_dot_product_cudnn_attention : [num_users=1] = call_function[target=torch.ops.aten._scaled_dot_product_cudnn_attention.default](args = (%clone, %clone_1, %clone_2, None, False), kwargs = {})
+#   return %buf0
+triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0 = async_compile.triton('triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0', '''
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 16777216}, 
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 56623104}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+    xnumel = 9437184
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)
+    x0 = (xindex % 128)
+    x1 = ((xindex // 128) % 2304)
+    x2 = xindex // 294912
+    x3 = xindex
+    tmp0 = tl.load(in_ptr0 + (x0 + 128*x2 + 4096*x1), None).to(tl.float32)
+    tl.store(out_ptr0 + (x3), tmp0, None)
+''', device_str='cuda')
+
+
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py
+# Topologically Sorted Source Nodes: [permute_3, out], Original ATen: [aten.permute, aten.clone]
+# Source node to ATen node mapping:
+#   out => clone_3
+#   permute_3 => permute_3
+# Graph fragment:
+#   %getitem : Tensor "bf16[1, 32, 2304, 128][9437184, 294912, 128, 1]cuda:0" = PlaceHolder[target=getitem]
+#   %permute_3 : Tensor "bf16[1, 2304, 32, 128][9437184, 128, 294912, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%getitem, [0, 2, 1, 3]), kwargs = {})
+#   %clone_3 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%permute_3,), kwargs = {memory_format: torch.contiguous_format})
+#   return %clone_3
+triton_poi_fused_clone_permute_1 = async_compile.triton('triton_poi_fused_clone_permute_1', '''
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 16777216}, 
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_clone_permute_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 56623104}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_clone_permute_1(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+    xnumel = 9437184
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)
+    x0 = (xindex % 128)
+    x1 = ((xindex // 128) % 32)
+    x2 = xindex // 4096
+    x3 = xindex
+    tmp0 = tl.load(in_ptr0 + (x0 + 128*x2 + 294912*x1), None).to(tl.float32)
+    tl.store(out_ptr0 + (x3), tmp0, None)
+''', device_str='cuda')
+
+
+async_compile.wait(globals())
+del async_compile
+
+class Runner:
+    def __init__(self, partitions):
+        self.partitions = partitions
+
+    def recursively_apply_fns(self, fns):
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):
+        arg0_1, arg1_1, arg2_1 = args
+        args.clear()
+        assert_size_stride(arg0_1, (1, 2304, 32, 128), (9437184, 4096, 128, 1))
+        assert_size_stride(arg1_1, (1, 2304, 32, 128), (9437184, 4096, 128, 1))
+        assert_size_stride(arg2_1, (1, 2304, 32, 128), (9437184, 4096, 128, 1))
+        with torch.cuda._DeviceGuard(0):
+            torch.cuda.set_device(0)
+            buf0 = empty_strided_cuda((1, 32, 2304, 128), (9437184, 294912, 128, 1), torch.bfloat16)
+            # Topologically Sorted Source Nodes: [permute, q, permute_1, k, permute_2, v, output], Original ATen: [aten.permute, aten.clone, aten._scaled_dot_product_cudnn_attention]
+            stream0 = get_raw_stream(0)
+            triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.run(arg0_1, buf0, 9437184, stream=stream0)
+            del arg0_1
+            buf1 = empty_strided_cuda((1, 32, 2304, 128), (9437184, 294912, 128, 1), torch.bfloat16)
+            # Topologically Sorted Source Nodes: [permute, q, permute_1, k, permute_2, v, output], Original ATen: [aten.permute, aten.clone, aten._scaled_dot_product_cudnn_attention]
+            stream0 = get_raw_stream(0)
+            triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.run(arg1_1, buf1, 9437184, stream=stream0)
+            del arg1_1
+            buf2 = empty_strided_cuda((1, 32, 2304, 128), (9437184, 294912, 128, 1), torch.bfloat16)
+            # Topologically Sorted Source Nodes: [permute, q, permute_1, k, permute_2, v, output], Original ATen: [aten.permute, aten.clone, aten._scaled_dot_product_cudnn_attention]
+            stream0 = get_raw_stream(0)
+            triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.run(arg2_1, buf2, 9437184, stream=stream0)
+            del arg2_1
+            # Topologically Sorted Source Nodes: [permute, q, permute_1, k, permute_2, v, output], Original ATen: [aten.permute, aten.clone, aten._scaled_dot_product_cudnn_attention]
+            buf3 = torch.ops.aten._scaled_dot_product_cudnn_attention.default(buf0, buf1, buf2, None, False)
+            del buf0
+            del buf1
+            buf4 = buf3[0]
+            assert_size_stride(buf4, (1, 32, 2304, 128), (9437184, 294912, 128, 1), 'torch.ops.aten._scaled_dot_product_cudnn_attention.default')
+            assert_alignment(buf4, 16, 'torch.ops.aten._scaled_dot_product_cudnn_attention.default')
+            del buf3
+            buf8 = reinterpret_tensor(buf2, (1, 2304, 32, 128), (9437184, 4096, 128, 1), 0); del buf2  # reuse
+            # Topologically Sorted Source Nodes: [permute_3, out], Original ATen: [aten.permute, aten.clone]
+            stream0 = get_raw_stream(0)
+            triton_poi_fused_clone_permute_1.run(buf4, buf8, 9437184, stream=stream0)
+            del buf4
+        return (buf8, )
+
+runner = Runner(partitions=[])
+call = runner.call
+recursively_apply_fns = runner.recursively_apply_fns
+
+
+def benchmark_compiled_module(times=10, repeat=10):
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = rand_strided((1, 2304, 32, 128), (9437184, 4096, 128, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg1_1 = rand_strided((1, 2304, 32, 128), (9437184, 4096, 128, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg2_1 = rand_strided((1, 2304, 32, 128), (9437184, 4096, 128, 1), device='cuda:0', dtype=torch.bfloat16)
+    fn = lambda: call([arg0_1, arg1_1, arg2_1])
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)
diff --git a/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py b/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py
new file mode 100644
index 0000000000000000000000000000000000000000..d31889c745af06fcb68dc4eb9b5dfbef10d3c301
--- /dev/null
+++ b/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py
@@ -0,0 +1,78 @@
+
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.reduction(
+    size_hints={'x': 2048, 'r0_': 4096},
+    reduction_hint=ReductionHint.INNER,
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*bf16', 'out_ptr0': '*bf16', 'out_ptr3': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_add_mul_native_layer_norm_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 6, 'num_store': 2, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'add_persistent_rblock': True, 'tiling_scores': {'x': 0, 'r0_': 100687872}}
+)
+@triton.jit
+def triton_red_fused_add_mul_native_layer_norm_0(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr3, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
+    xnumel = 2048
+    r0_numel = 4096
+    rnumel = r0_numel
+    RBLOCK: tl.constexpr = R0_BLOCK
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+    xmask = xindex < xnumel
+    r0_base = tl.arange(0, R0_BLOCK)[None, :]
+    rbase = r0_base
+    x0 = xindex
+    tmp7_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp7_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp7_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp1 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp2 = tl.load(in_ptr2 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp3 = tmp1 * tmp2
+        tmp4 = tmp0 + tmp3
+        tmp5 = tmp4.to(tl.float32)
+        tmp6 = tl.broadcast_to(tmp5, [XBLOCK, R0_BLOCK])
+        tmp7_mean_next, tmp7_m2_next, tmp7_weight_next = triton_helpers.welford_reduce(
+            tmp6, tmp7_mean, tmp7_m2, tmp7_weight, roffset == 0
+        )
+        tmp7_mean = tl.where(r0_mask & xmask, tmp7_mean_next, tmp7_mean)
+        tmp7_m2 = tl.where(r0_mask & xmask, tmp7_m2_next, tmp7_m2)
+        tmp7_weight = tl.where(r0_mask & xmask, tmp7_weight_next, tmp7_weight)
+        tl.store(out_ptr0 + (r0_1 + 4096*x0), tmp4, r0_mask & xmask)
+    tmp8, tmp9, tmp10 = triton_helpers.welford(tmp7_mean, tmp7_m2, tmp7_weight, 1)
+    tmp7 = tmp8[:, None]
+    tmp11 = tmp9[:, None]
+    tmp12 = tmp10[:, None]
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp13 = tl.load(out_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp23 = tl.load(in_ptr3 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp27 = tl.load(in_ptr4 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp14 = tmp13.to(tl.float32)
+        tmp15 = tmp14 - tmp7
+        tmp16 = 4096.0
+        tmp17 = (tmp11 / tmp16)
+        tmp18 = 1e-06
+        tmp19 = tmp17 + tmp18
+        tmp20 = libdevice.rsqrt(tmp19)
+        tmp21 = tmp15 * tmp20
+        tmp22 = tmp21.to(tl.float32)
+        tmp24 = 1.0
+        tmp25 = tmp23 + tmp24
+        tmp26 = tmp22 * tmp25
+        tmp28 = tmp26 + tmp27
+        tl.store(out_ptr3 + (r0_1 + 4096*x0), tmp28, r0_mask & xmask)
diff --git a/torchinductor/w3/f47f62f438f942996b5cf11eb19c6e256d1ed91f8e9d6804e5f718dcdfe8080f.best_config b/torchinductor/w3/f47f62f438f942996b5cf11eb19c6e256d1ed91f8e9d6804e5f718dcdfe8080f.best_config
new file mode 100644
index 0000000000000000000000000000000000000000..d23168327bf310e3204332f9c95ed66190063ddc
--- /dev/null
+++ b/torchinductor/w3/f47f62f438f942996b5cf11eb19c6e256d1ed91f8e9d6804e5f718dcdfe8080f.best_config
@@ -0,0 +1 @@
+{"XBLOCK": 1, "R0_BLOCK": 2048, "num_warps": 16, "num_stages": 1, "configs_hash": "ba27f374f6982634f1ab959ad1e63f726920cfc2c7c821f8e68ec55c3d4d94fc", "found_by_coordesc": false, "time_taken_ms": 49, "triton_cache_hash": "DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA"}
\ No newline at end of file
diff --git a/torchinductor/wc/cwcymlhbzenwwbbudgealzjyh42evsv2zpmylwljovxkosebwbpk.py b/torchinductor/wc/cwcymlhbzenwwbbudgealzjyh42evsv2zpmylwljovxkosebwbpk.py
new file mode 100644
index 0000000000000000000000000000000000000000..073acdaa9018085bb28c673c0e387da3a5442912
--- /dev/null
+++ b/torchinductor/wc/cwcymlhbzenwwbbudgealzjyh42evsv2zpmylwljovxkosebwbpk.py
@@ -0,0 +1,67 @@
+# AOT ID: ['6_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+
+aten = torch.ops.aten
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+async_compile.wait(globals())
+del async_compile
+
+class Runner:
+    def __init__(self, partitions):
+        self.partitions = partitions
+
+    def recursively_apply_fns(self, fns):
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):
+        arg0_1, = args
+        args.clear()
+        s52 = arg0_1
+        return (4096*s52, )
+
+runner = Runner(partitions=[])
+call = runner.call
+recursively_apply_fns = runner.recursively_apply_fns
+
+
+def benchmark_compiled_module(times=10, repeat=10):
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = 256
+    fn = lambda: call([arg0_1])
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)
diff --git a/torchinductor/wv/46eff4b043eb1561ef5f4e5311f476ba8ecf12b7c030a45f10b615fff48ee754.best_config b/torchinductor/wv/46eff4b043eb1561ef5f4e5311f476ba8ecf12b7c030a45f10b615fff48ee754.best_config
new file mode 100644
index 0000000000000000000000000000000000000000..0b77f7cd0905e2efaa3d56bcc591ab9eb952ae8c
--- /dev/null
+++ b/torchinductor/wv/46eff4b043eb1561ef5f4e5311f476ba8ecf12b7c030a45f10b615fff48ee754.best_config
@@ -0,0 +1 @@
+{"XBLOCK": 4, "R0_BLOCK": 128, "num_warps": 4, "num_stages": 1, "configs_hash": "f874ed6abc48e5e95ac45a4a098cc27fc009d3b1219d27438179d79ebfae2c22", "found_by_coordesc": false, "time_taken_ms": 103, "triton_cache_hash": "3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ"}
\ No newline at end of file
diff --git a/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py b/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b4c48a33613d7561a4418911bd0b9db96d61f81
--- /dev/null
+++ b/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py
@@ -0,0 +1,45 @@
+
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.reduction(
+    size_hints={'x': 8192, 'r0_': 128},
+    reduction_hint=ReductionHint.DEFAULT,
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused__fused_rms_norm_view_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 1, 'num_store': 1, 'num_reduction': 1, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 65536, 'r0_': 2097152}}
+)
+@triton.jit
+def triton_red_fused__fused_rms_norm_view_0(in_ptr0, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
+    xnumel = 8192
+    r0_numel = 128
+    rnumel = r0_numel
+    RBLOCK: tl.constexpr = R0_BLOCK
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+    xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
+    r0_base = tl.arange(0, R0_BLOCK)[None, :]
+    rbase = r0_base
+    x0 = (xindex % 32)
+    x1 = xindex // 32
+    _tmp4 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
+    x3 = xindex
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_2 = r0_index
+        tmp0 = tl.load(in_ptr0 + (r0_2 + 128*x0 + 12288*x1), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp1 = tmp0.to(tl.float32)
+        tmp2 = tmp1 * tmp1
+        tmp3 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK])
+        tmp5 = _tmp4 + tmp3
+        _tmp4 = tl.where(r0_mask, tmp5, _tmp4)
+    tmp4 = tl.sum(_tmp4, 1)[:, None]
+    tl.store(out_ptr0 + (x3), tmp4, None)
diff --git a/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py b/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc19a9bfc2303aabb090f42a09638fd8bb0cd17b
--- /dev/null
+++ b/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py
@@ -0,0 +1,73 @@
+
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.reduction(
+    size_hints={'x': 2048, 'r0_': 4096},
+    reduction_hint=ReductionHint.INNER,
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr2': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_add_mul_native_layer_norm_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 4, 'num_store': 1, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'add_persistent_rblock': True, 'tiling_scores': {'x': 0, 'r0_': 50348032}}
+)
+@triton.jit
+def triton_red_fused_add_mul_native_layer_norm_0(in_ptr0, in_ptr1, in_ptr2, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
+    xnumel = 2048
+    r0_numel = 4096
+    rnumel = r0_numel
+    RBLOCK: tl.constexpr = R0_BLOCK
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+    xmask = xindex < xnumel
+    r0_base = tl.arange(0, R0_BLOCK)[None, :]
+    rbase = r0_base
+    x0 = xindex
+    tmp3_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp3_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp3_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp1 = tmp0.to(tl.float32)
+        tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK])
+        tmp3_mean_next, tmp3_m2_next, tmp3_weight_next = triton_helpers.welford_reduce(
+            tmp2, tmp3_mean, tmp3_m2, tmp3_weight, roffset == 0
+        )
+        tmp3_mean = tl.where(r0_mask & xmask, tmp3_mean_next, tmp3_mean)
+        tmp3_m2 = tl.where(r0_mask & xmask, tmp3_m2_next, tmp3_m2)
+        tmp3_weight = tl.where(r0_mask & xmask, tmp3_weight_next, tmp3_weight)
+    tmp4, tmp5, tmp6 = triton_helpers.welford(tmp3_mean, tmp3_m2, tmp3_weight, 1)
+    tmp3 = tmp4[:, None]
+    tmp7 = tmp5[:, None]
+    tmp8 = tmp6[:, None]
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp9 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp12 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp23 = tl.load(in_ptr2 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp10 = 1.0
+        tmp11 = tmp9 + tmp10
+        tmp13 = tmp12.to(tl.float32)
+        tmp14 = tmp13 - tmp3
+        tmp15 = 4096.0
+        tmp16 = (tmp7 / tmp15)
+        tmp17 = 1e-06
+        tmp18 = tmp16 + tmp17
+        tmp19 = libdevice.rsqrt(tmp18)
+        tmp20 = tmp14 * tmp19
+        tmp21 = tmp20.to(tl.float32)
+        tmp22 = tmp11 * tmp21
+        tmp24 = tmp22 + tmp23
+        tl.store(out_ptr2 + (r0_1 + 4096*x0), tmp24, r0_mask & xmask)
diff --git a/torchinductor/ww/dd2f7cfc81aa7ae5bbbff692e51eaea946345cf52243d58440af807766535cf3.best_config b/torchinductor/ww/dd2f7cfc81aa7ae5bbbff692e51eaea946345cf52243d58440af807766535cf3.best_config
new file mode 100644
index 0000000000000000000000000000000000000000..dc8de79fb45152deb37fa0f01377313bf011c741
--- /dev/null
+++ b/torchinductor/ww/dd2f7cfc81aa7ae5bbbff692e51eaea946345cf52243d58440af807766535cf3.best_config
@@ -0,0 +1 @@
+{"XBLOCK": 1, "R0_BLOCK": 4096, "num_warps": 16, "num_stages": 1, "configs_hash": "ba27f374f6982634f1ab959ad1e63f726920cfc2c7c821f8e68ec55c3d4d94fc", "found_by_coordesc": false, "time_taken_ms": 39, "triton_cache_hash": "MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ"}
\ No newline at end of file
diff --git a/torchinductor/xj/7c8e81e42663cba3c73a14ca8935673dbe9d521cd0a91444125a99d262815f3f.best_config b/torchinductor/xj/7c8e81e42663cba3c73a14ca8935673dbe9d521cd0a91444125a99d262815f3f.best_config
new file mode 100644
index 0000000000000000000000000000000000000000..3da299395a805c9e6c60b214074c2f77852ff8ab
--- /dev/null
+++ b/torchinductor/xj/7c8e81e42663cba3c73a14ca8935673dbe9d521cd0a91444125a99d262815f3f.best_config
@@ -0,0 +1 @@
+{"XBLOCK": 1024, "num_warps": 4, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 46, "triton_cache_hash": "WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA"}
\ No newline at end of file
diff --git a/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py b/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py
new file mode 100644
index 0000000000000000000000000000000000000000..7db8216b2e1229c46cc2844a9596acaa4bcd5f93
--- /dev/null
+++ b/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py
@@ -0,0 +1,30 @@
+
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 16777216}, 
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_mul_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 3, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 75505664}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_add_mul_0(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+    xnumel = 9437184
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)
+    x2 = xindex
+    x0 = (xindex % 4096)
+    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
+    tmp1 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last').to(tl.float32)
+    tmp2 = tl.load(in_ptr2 + (x2), None).to(tl.float32)
+    tmp3 = tmp1 * tmp2
+    tmp4 = tmp0 + tmp3
+    tl.store(out_ptr0 + (x2), tmp4, None)
diff --git a/torchinductor/xo/cxo4pcd7p734icbgoa3zx6bvctofr7g2634t2fqhg22bhtg6eddc.py b/torchinductor/xo/cxo4pcd7p734icbgoa3zx6bvctofr7g2634t2fqhg22bhtg6eddc.py
new file mode 100644
index 0000000000000000000000000000000000000000..be536e88a02ee28a06ab69ddbe1e458336720730
--- /dev/null
+++ b/torchinductor/xo/cxo4pcd7p734icbgoa3zx6bvctofr7g2634t2fqhg22bhtg6eddc.py
@@ -0,0 +1,261 @@
+# AOT ID: ['20_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+import triton
+import triton.language as tl
+from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+
+aten = torch.ops.aten
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py
+# Topologically Sorted Source Nodes: [context_attn_output, encoder_hidden_states, norm_encoder_hidden_states, add_2, mul_2, norm_encoder_hidden_states_1], Original ATen: [aten.mul, aten.add, aten.native_layer_norm]
+# Source node to ATen node mapping:
+#   add_2 => add_3
+#   context_attn_output => mul_1
+#   encoder_hidden_states => add_1
+#   mul_2 => mul_3
+#   norm_encoder_hidden_states => add_2, convert_element_type, convert_element_type_1, mul_2, rsqrt, sub, var_mean
+#   norm_encoder_hidden_states_1 => add_4
+# Graph fragment:
+#   %arg5_1 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0" = PlaceHolder[target=arg5_1]
+#   %arg3_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg3_1]
+#   %arg4_1 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0" = PlaceHolder[target=arg4_1]
+#   %add_1 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0" = PlaceHolder[target=add_1]
+#   %getitem_1 : Tensor "f32[1, 256, 1][256, 1, 256]cuda:0" = PlaceHolder[target=getitem_1]
+#   %buf2 : Tensor "f32[1, 256, 1][256, 1, 256]cuda:0" = PlaceHolder[target=buf2]
+#   %arg6_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg6_1]
+#   %arg7_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg7_1]
+#   %mul_1 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%arg3_1, %arg4_1), kwargs = {})
+#   %add_1 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg5_1, %mul_1), kwargs = {})
+#   %convert_element_type : Tensor "f32[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%add_1, torch.float32), kwargs = {})
+#   %var_mean : [num_users=2] = call_function[target=torch.ops.aten.var_mean.correction](args = (%convert_element_type, [2]), kwargs = {correction: 0, keepdim: True})
+#   %sub : Tensor "f32[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%convert_element_type, %getitem_1), kwargs = {})
+#   %add_2 : Tensor "f32[1, 256, 1][256, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%getitem, 1e-06), kwargs = {})
+#   %rsqrt : Tensor "f32[1, 256, 1][256, 1, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add_2,), kwargs = {})
+#   %mul_2 : Tensor "f32[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub, %rsqrt), kwargs = {})
+#   %convert_element_type_1 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_2, torch.bfloat16), kwargs = {})
+#   %add_3 : Tensor "bf16[1, 1, 4096][4096, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg6_1, 1), kwargs = {})
+#   %mul_3 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_1, %add_3), kwargs = {})
+#   %add_4 : Tensor "bf16[1, 256, 4096][1048576, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_3, %arg7_1), kwargs = {})
+#   return %add_1,%getitem_1,%buf2,%add_4
+triton_red_fused_add_mul_native_layer_norm_0 = async_compile.triton('triton_red_fused_add_mul_native_layer_norm_0', '''
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.reduction(
+    size_hints={'x': 256, 'r0_': 4096},
+    reduction_hint=ReductionHint.INNER,
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*bf16', 'in_ptr4': '*bf16', 'out_ptr0': '*bf16', 'out_ptr3': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_add_mul_native_layer_norm_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 6, 'num_store': 2, 'num_reduction': 2, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'add_persistent_rblock': True, 'tiling_scores': {'x': 0, 'r0_': 12607488}}
+)
+@triton.jit
+def triton_red_fused_add_mul_native_layer_norm_0(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr3, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
+    xnumel = 256
+    r0_numel = 4096
+    rnumel = r0_numel
+    RBLOCK: tl.constexpr = R0_BLOCK
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+    xmask = xindex < xnumel
+    r0_base = tl.arange(0, R0_BLOCK)[None, :]
+    rbase = r0_base
+    x0 = xindex
+    tmp7_mean = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp7_m2 = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    tmp7_weight = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp0 = tl.load(in_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp1 = tl.load(in_ptr1 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp2 = tl.load(in_ptr2 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp3 = tmp1 * tmp2
+        tmp4 = tmp0 + tmp3
+        tmp5 = tmp4.to(tl.float32)
+        tmp6 = tl.broadcast_to(tmp5, [XBLOCK, R0_BLOCK])
+        tmp7_mean_next, tmp7_m2_next, tmp7_weight_next = triton_helpers.welford_reduce(
+            tmp6, tmp7_mean, tmp7_m2, tmp7_weight, roffset == 0
+        )
+        tmp7_mean = tl.where(r0_mask & xmask, tmp7_mean_next, tmp7_mean)
+        tmp7_m2 = tl.where(r0_mask & xmask, tmp7_m2_next, tmp7_m2)
+        tmp7_weight = tl.where(r0_mask & xmask, tmp7_weight_next, tmp7_weight)
+        tl.store(out_ptr0 + (r0_1 + 4096*x0), tmp4, r0_mask & xmask)
+    tmp8, tmp9, tmp10 = triton_helpers.welford(tmp7_mean, tmp7_m2, tmp7_weight, 1)
+    tmp7 = tmp8[:, None]
+    tmp11 = tmp9[:, None]
+    tmp12 = tmp10[:, None]
+    for r0_offset in tl.range(0, r0_numel, R0_BLOCK):
+        r0_index = r0_offset + r0_base
+        r0_mask = r0_index < r0_numel
+        roffset = r0_offset
+        rindex = r0_index
+        r0_1 = r0_index
+        tmp13 = tl.load(out_ptr0 + (r0_1 + 4096*x0), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+        tmp23 = tl.load(in_ptr3 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp27 = tl.load(in_ptr4 + (r0_1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+        tmp14 = tmp13.to(tl.float32)
+        tmp15 = tmp14 - tmp7
+        tmp16 = 4096.0
+        tmp17 = (tmp11 / tmp16)
+        tmp18 = 1e-06
+        tmp19 = tmp17 + tmp18
+        tmp20 = libdevice.rsqrt(tmp19)
+        tmp21 = tmp15 * tmp20
+        tmp22 = tmp21.to(tl.float32)
+        tmp24 = 1.0
+        tmp25 = tmp23 + tmp24
+        tmp26 = tmp22 * tmp25
+        tmp28 = tmp26 + tmp27
+        tl.store(out_ptr3 + (r0_1 + 4096*x0), tmp28, r0_mask & xmask)
+''', device_str='cuda')
+
+
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py
+# Topologically Sorted Source Nodes: [mul, hidden_states], Original ATen: [aten.mul, aten.add]
+# Source node to ATen node mapping:
+#   hidden_states => add
+#   mul => mul
+# Graph fragment:
+#   %arg2_1 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0" = PlaceHolder[target=arg2_1]
+#   %arg0_1 : Tensor "bf16[1, 1, 4096][24576, 24576, 1]cuda:0" = PlaceHolder[target=arg0_1]
+#   %arg1_1 : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0" = PlaceHolder[target=arg1_1]
+#   %mul : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%arg0_1, %arg1_1), kwargs = {})
+#   %add : Tensor "bf16[1, 2048, 4096][8388608, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg2_1, %mul), kwargs = {})
+#   return %add
+triton_poi_fused_add_mul_1 = async_compile.triton('triton_poi_fused_add_mul_1', '''
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 8388608}, 
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_mul_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 3, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 67117056}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_add_mul_1(in_ptr0, in_ptr1, in_ptr2, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+    xnumel = 8388608
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)
+    x2 = xindex
+    x0 = (xindex % 4096)
+    tmp0 = tl.load(in_ptr0 + (x2), None).to(tl.float32)
+    tmp1 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last').to(tl.float32)
+    tmp2 = tl.load(in_ptr2 + (x2), None).to(tl.float32)
+    tmp3 = tmp1 * tmp2
+    tmp4 = tmp0 + tmp3
+    tl.store(out_ptr0 + (x2), tmp4, None)
+''', device_str='cuda')
+
+
+async_compile.wait(globals())
+del async_compile
+
+class Runner:
+    def __init__(self, partitions):
+        self.partitions = partitions
+
+    def recursively_apply_fns(self, fns):
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):
+        arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1 = args
+        args.clear()
+        assert_size_stride(arg0_1, (1, 1, 4096), (24576, 24576, 1))
+        assert_size_stride(arg1_1, (1, 2048, 4096), (8388608, 4096, 1))
+        assert_size_stride(arg2_1, (1, 2048, 4096), (8388608, 4096, 1))
+        assert_size_stride(arg3_1, (1, 1, 4096), (24576, 24576, 1))
+        assert_size_stride(arg4_1, (1, 256, 4096), (1048576, 4096, 1))
+        assert_size_stride(arg5_1, (1, 256, 4096), (1048576, 4096, 1))
+        assert_size_stride(arg6_1, (1, 1, 4096), (24576, 24576, 1))
+        assert_size_stride(arg7_1, (1, 1, 4096), (24576, 24576, 1))
+        with torch.cuda._DeviceGuard(0):
+            torch.cuda.set_device(0)
+            buf0 = empty_strided_cuda((1, 256, 4096), (1048576, 4096, 1), torch.bfloat16)
+            buf4 = empty_strided_cuda((1, 256, 4096), (1048576, 4096, 1), torch.bfloat16)
+            # Topologically Sorted Source Nodes: [context_attn_output, encoder_hidden_states, norm_encoder_hidden_states, add_2, mul_2, norm_encoder_hidden_states_1], Original ATen: [aten.mul, aten.add, aten.native_layer_norm]
+            stream0 = get_raw_stream(0)
+            triton_red_fused_add_mul_native_layer_norm_0.run(arg5_1, arg3_1, arg4_1, arg6_1, arg7_1, buf0, buf4, 256, 4096, stream=stream0)
+            del arg3_1
+            del arg4_1
+            del arg5_1
+            del arg6_1
+            del arg7_1
+            buf5 = empty_strided_cuda((1, 2048, 4096), (8388608, 4096, 1), torch.bfloat16)
+            # Topologically Sorted Source Nodes: [mul, hidden_states], Original ATen: [aten.mul, aten.add]
+            stream0 = get_raw_stream(0)
+            triton_poi_fused_add_mul_1.run(arg2_1, arg0_1, arg1_1, buf5, 8388608, stream=stream0)
+            del arg0_1
+            del arg1_1
+            del arg2_1
+        return (buf4, buf5, buf0, )
+
+runner = Runner(partitions=[])
+call = runner.call
+recursively_apply_fns = runner.recursively_apply_fns
+
+
+def benchmark_compiled_module(times=10, repeat=10):
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg1_1 = rand_strided((1, 2048, 4096), (8388608, 4096, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg2_1 = rand_strided((1, 2048, 4096), (8388608, 4096, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg3_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg4_1 = rand_strided((1, 256, 4096), (1048576, 4096, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg5_1 = rand_strided((1, 256, 4096), (1048576, 4096, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg6_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg7_1 = rand_strided((1, 1, 4096), (24576, 24576, 1), device='cuda:0', dtype=torch.bfloat16)
+    fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1])
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)
diff --git a/torchinductor/yl/cylzp3e6x3k4g5ipjr32zqvdf6mwtqwl2kcqrlshji7fgzldiits.py b/torchinductor/yl/cylzp3e6x3k4g5ipjr32zqvdf6mwtqwl2kcqrlshji7fgzldiits.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a39a3a7b4c5fbbf12f9f7105a218296fb4ebff4
--- /dev/null
+++ b/torchinductor/yl/cylzp3e6x3k4g5ipjr32zqvdf6mwtqwl2kcqrlshji7fgzldiits.py
@@ -0,0 +1,67 @@
+# AOT ID: ['4_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+
+aten = torch.ops.aten
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+async_compile.wait(globals())
+del async_compile
+
+class Runner:
+    def __init__(self, partitions):
+        self.partitions = partitions
+
+    def recursively_apply_fns(self, fns):
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):
+        arg0_1, = args
+        args.clear()
+        assert_size_stride(arg0_1, (1, 2048, 12288), (25165824, 12288, 1))
+        return (reinterpret_tensor(arg0_1, (1, 2048, 4096), (25165824, 12288, 1), 0), reinterpret_tensor(arg0_1, (1, 2048, 4096), (25165824, 12288, 1), 4096), reinterpret_tensor(arg0_1, (1, 2048, 4096), (25165824, 12288, 1), 8192), )
+
+runner = Runner(partitions=[])
+call = runner.call
+recursively_apply_fns = runner.recursively_apply_fns
+
+
+def benchmark_compiled_module(times=10, repeat=10):
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = rand_strided((1, 2048, 12288), (25165824, 12288, 1), device='cuda:0', dtype=torch.bfloat16)
+    fn = lambda: call([arg0_1])
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)
diff --git a/torchinductor/zk/czkiayrorhnzt4u2yqgtjl6bgtgtbc3y2p2rzrorom7xd23lxjcj.py b/torchinductor/zk/czkiayrorhnzt4u2yqgtjl6bgtgtbc3y2p2rzrorom7xd23lxjcj.py
new file mode 100644
index 0000000000000000000000000000000000000000..1581f87dd8d0e90738aee97b717fd6737170ab77
--- /dev/null
+++ b/torchinductor/zk/czkiayrorhnzt4u2yqgtjl6bgtgtbc3y2p2rzrorom7xd23lxjcj.py
@@ -0,0 +1,75 @@
+# AOT ID: ['10_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+
+aten = torch.ops.aten
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+async_compile.wait(globals())
+del async_compile
+
+class Runner:
+    def __init__(self, partitions):
+        self.partitions = partitions
+
+    def recursively_apply_fns(self, fns):
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):
+        arg0_1, arg1_1 = args
+        args.clear()
+        assert_size_stride(arg0_1, (1, 2304, 32, 128), (9437184, 4096, 128, 1))
+        assert_size_stride(arg1_1, (4096, 4096), (4096, 1))
+        with torch.cuda._DeviceGuard(0):
+            torch.cuda.set_device(0)
+            buf0 = empty_strided_cuda((256, 4096), (4096, 1), torch.bfloat16)
+            # Topologically Sorted Source Nodes: [hidden_states, split_with_sizes, encoder_hidden_states_1], Original ATen: [aten.view, aten.split_with_sizes, aten.t, aten.mm]
+            extern_kernels.mm(reinterpret_tensor(arg0_1, (256, 4096), (4096, 1), 0), reinterpret_tensor(arg1_1, (4096, 4096), (1, 4096), 0), out=buf0)
+            del arg1_1
+        return (reinterpret_tensor(arg0_1, (1, 2048, 4096), (9437184, 4096, 1), 1048576), reinterpret_tensor(buf0, (1, 256, 4096), (1048576, 4096, 1), 0), )
+
+runner = Runner(partitions=[])
+call = runner.call
+recursively_apply_fns = runner.recursively_apply_fns
+
+
+def benchmark_compiled_module(times=10, repeat=10):
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = rand_strided((1, 2304, 32, 128), (9437184, 4096, 128, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg1_1 = rand_strided((4096, 4096), (4096, 1), device='cuda:0', dtype=torch.bfloat16)
+    fn = lambda: call([arg0_1, arg1_1])
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)
diff --git a/torchinductor/zu/czuvgwm66nfvn4y5p22chgolcyxckgcjsedtcb5l3poxbxvq37ok.py b/torchinductor/zu/czuvgwm66nfvn4y5p22chgolcyxckgcjsedtcb5l3poxbxvq37ok.py
new file mode 100644
index 0000000000000000000000000000000000000000..5581a26ccd6a1dd3329cde85d19c0b96207834ef
--- /dev/null
+++ b/torchinductor/zu/czuvgwm66nfvn4y5p22chgolcyxckgcjsedtcb5l3poxbxvq37ok.py
@@ -0,0 +1,149 @@
+# AOT ID: ['27_inference']
+from ctypes import c_void_p, c_long, c_int
+import torch
+import math
+import random
+import os
+import tempfile
+from math import inf, nan
+from cmath import nanj
+from torch._inductor.hooks import run_intermediate_hooks
+from torch._inductor.utils import maybe_profile
+from torch._inductor.codegen.memory_planning import _align as align
+from torch import device, empty_strided
+from torch._inductor.async_compile import AsyncCompile
+from torch._inductor.select_algorithm import extern_kernels
+import triton
+import triton.language as tl
+from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+
+aten = torch.ops.aten
+inductor_ops = torch.ops.inductor
+_quantized = torch.ops._quantized
+assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+assert_alignment = torch._C._dynamo.guards.assert_alignment
+empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
+empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
+reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+alloc_from_pool = torch.ops.inductor._alloc_from_pool
+async_compile = AsyncCompile()
+empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+# kernel path: /app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py
+# Topologically Sorted Source Nodes: [chunk, hidden_states, silu, x, hidden_states_2], Original ATen: [aten.split, aten.view, aten.silu, aten.mul, aten.cat]
+# Source node to ATen node mapping:
+#   chunk => split
+#   hidden_states => view
+#   hidden_states_2 => cat
+#   silu => convert_element_type, convert_element_type_1, mul, sigmoid
+#   x => mul_1
+# Graph fragment:
+#   %arg0_1 : Tensor "bf16[1, 2304, 32, 128][9437184, 4096, 128, 1]cuda:0" = PlaceHolder[target=arg0_1]
+#   %arg1_1 : Tensor "bf16[1, 2304, 24576][84934656, 36864, 1]cuda:0" = PlaceHolder[target=arg1_1]
+#   %split : [num_users=2] = call_function[target=torch.ops.aten.split.Tensor](args = (%arg1_1, 12288, -1), kwargs = {})
+#   %view : Tensor "bf16[1, 2304, 4096][9437184, 4096, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%arg0_1, [1, 2304, 4096]), kwargs = {})
+#   %convert_element_type : Tensor "f32[1, 2304, 12288][28311552, 12288, 1]cuda:0"[num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%getitem, torch.float32), kwargs = {})
+#   %sigmoid : Tensor "f32[1, 2304, 12288][28311552, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type,), kwargs = {})
+#   %mul : Tensor "f32[1, 2304, 12288][28311552, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type, %sigmoid), kwargs = {})
+#   %convert_element_type_1 : Tensor "bf16[1, 2304, 12288][28311552, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul, torch.bfloat16), kwargs = {})
+#   %mul_1 : Tensor "bf16[1, 2304, 12288][28311552, 12288, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_1, %getitem_1), kwargs = {})
+#   %cat : Tensor "bf16[1, 2304, 16384][37748736, 16384, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%view, %mul_1], -1), kwargs = {})
+#   return %cat
+triton_poi_fused_cat_mul_silu_split_view_0 = async_compile.triton('triton_poi_fused_cat_mul_silu_split_view_0', '''
+import triton
+import triton.language as tl
+
+from torch._inductor.runtime import triton_helpers, triton_heuristics
+from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+triton_helpers.set_driver_to_gpu()
+
+@triton_heuristics.pointwise(
+    size_hints={'x': 67108864}, 
+    filename=__file__,
+    triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=128, cc=89, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=1536, warp_size=32), 'constants': {}, 'native_matmul': False, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}], 'enable_fp_fusion': True},
+    inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_cat_mul_silu_split_view_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'atomic_add_found': False, 'num_load': 3, 'num_store': 1, 'num_reduction': 0, 'backend_hash': '139C22A3A3C364569C9941DE9469DCB674B7A631E094782CBD415193800462F6', 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'deterministic': False, 'force_filter_reduction_configs': False, 'are_deterministic_algorithms_enabled': False, 'tiling_scores': {'x': 377487360}},
+    min_elem_per_thread=0
+)
+@triton.jit
+def triton_poi_fused_cat_mul_silu_split_view_0(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+    xnumel = 37748736
+    xoffset = tl.program_id(0) * XBLOCK
+    xindex = xoffset + tl.arange(0, XBLOCK)[:]
+    xmask = tl.full([XBLOCK], True, tl.int1)
+    x0 = (xindex % 16384)
+    x1 = xindex // 16384
+    x2 = xindex
+    tmp0 = x0
+    tmp1 = tl.full([1], 0, tl.int64)
+    tmp2 = tmp0 >= tmp1
+    tmp3 = tl.full([1], 4096, tl.int64)
+    tmp4 = tmp0 < tmp3
+    tmp5 = tl.load(in_ptr0 + (4096*x1 + (x0)), tmp4, eviction_policy='evict_last', other=0.0).to(tl.float32)
+    tmp6 = tmp0 >= tmp3
+    tmp7 = tl.full([1], 16384, tl.int64)
+    tmp8 = tmp0 < tmp7
+    tmp9 = tl.load(in_ptr1 + (36864*x1 + ((-4096) + x0)), tmp6, eviction_policy='evict_last', other=0.0).to(tl.float32)
+    tmp10 = tmp9.to(tl.float32)
+    tmp11 = tl.sigmoid(tmp10)
+    tmp12 = tmp10 * tmp11
+    tmp13 = tmp12.to(tl.float32)
+    tmp14 = tl.load(in_ptr1 + (12288 + 36864*x1 + ((-4096) + x0)), tmp6, eviction_policy='evict_last', other=0.0).to(tl.float32)
+    tmp15 = tmp13 * tmp14
+    tmp16 = tl.full(tmp15.shape, 0.0, tmp15.dtype)
+    tmp17 = tl.where(tmp6, tmp15, tmp16)
+    tmp18 = tl.where(tmp4, tmp5, tmp17)
+    tl.store(out_ptr0 + (x2), tmp18, None)
+''', device_str='cuda')
+
+
+async_compile.wait(globals())
+del async_compile
+
+class Runner:
+    def __init__(self, partitions):
+        self.partitions = partitions
+
+    def recursively_apply_fns(self, fns):
+        new_callables = []
+        for fn, c in zip(fns, self.partitions):
+            new_callables.append(fn(c))
+        self.partitions = new_callables
+
+    def call(self, args):
+        arg0_1, arg1_1 = args
+        args.clear()
+        assert_size_stride(arg0_1, (1, 2304, 32, 128), (9437184, 4096, 128, 1))
+        assert_size_stride(arg1_1, (1, 2304, 24576), (84934656, 36864, 1))
+        with torch.cuda._DeviceGuard(0):
+            torch.cuda.set_device(0)
+            buf0 = empty_strided_cuda((1, 2304, 16384), (37748736, 16384, 1), torch.bfloat16)
+            # Topologically Sorted Source Nodes: [chunk, hidden_states, silu, x, hidden_states_2], Original ATen: [aten.split, aten.view, aten.silu, aten.mul, aten.cat]
+            stream0 = get_raw_stream(0)
+            triton_poi_fused_cat_mul_silu_split_view_0.run(arg0_1, arg1_1, buf0, 37748736, stream=stream0)
+            del arg0_1
+            del arg1_1
+        return (buf0, )
+
+runner = Runner(partitions=[])
+call = runner.call
+recursively_apply_fns = runner.recursively_apply_fns
+
+
+def benchmark_compiled_module(times=10, repeat=10):
+    from torch._dynamo.testing import rand_strided
+    from torch._inductor.utils import print_performance
+    arg0_1 = rand_strided((1, 2304, 32, 128), (9437184, 4096, 128, 1), device='cuda:0', dtype=torch.bfloat16)
+    arg1_1 = rand_strided((1, 2304, 24576), (84934656, 36864, 1), device='cuda:0', dtype=torch.bfloat16)
+    fn = lambda: call([arg0_1, arg1_1])
+    return print_performance(fn, times=times, repeat=repeat)
+
+
+if __name__ == "__main__":
+    from torch._inductor.wrapper_benchmark import compiled_module_main
+    compiled_module_main('None', benchmark_compiled_module)
diff --git a/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/__grp__triton_red_fused__fused_rms_norm_view_1.json b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/__grp__triton_red_fused__fused_rms_norm_view_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..a5edc15f871f8dbba4d3ec899e0f0834afcbd7ee
--- /dev/null
+++ b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/__grp__triton_red_fused__fused_rms_norm_view_1.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused__fused_rms_norm_view_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.source", "triton_red_fused__fused_rms_norm_view_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.ttir", "triton_red_fused__fused_rms_norm_view_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.ttgir", "triton_red_fused__fused_rms_norm_view_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.llir", "triton_red_fused__fused_rms_norm_view_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.ptx", "triton_red_fused__fused_rms_norm_view_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.cubin", "triton_red_fused__fused_rms_norm_view_1.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.json"}}
\ No newline at end of file
diff --git a/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.cubin b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..79c64d9c3967bcb0ac58897b9f0c2e757a8d2a4b
Binary files /dev/null and b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.cubin differ
diff --git a/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.json b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..ff3a7c01c258341e08008e5565140185ef0802f1
--- /dev/null
+++ b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.json
@@ -0,0 +1 @@
+{"hash": "d74f134c95c0d6dfc5a74151e2823c30e5664d0592fbb6b3bd01bbcf98b13272", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 8, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm_view_1"}
\ No newline at end of file
diff --git a/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.llir b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.llir
new file mode 100644
index 0000000000000000000000000000000000000000..d5c9c0620201d932a29266ca8a946ba9e4fd23fa
--- /dev/null
+++ b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.llir
@@ -0,0 +1,136 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused__fused_rms_norm_view_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 {
+  %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %8 = shl nuw i32 %7, 1, !dbg !8
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %10 = and i32 %9, 32, !dbg !9
+  %.lobit = lshr exact i32 %10, 5, !dbg !9
+  %11 = and i32 %9, 1, !dbg !9
+  %12 = or disjoint i32 %.lobit, %8, !dbg !10
+  %13 = or disjoint i32 %8, %11, !dbg !10
+  %14 = shl nuw nsw i32 %9, 2, !dbg !11
+  %15 = and i32 %14, 124, !dbg !11
+  %16 = sdiv i32 %12, 32, !dbg !12
+  %17 = mul i32 %16, 32, !dbg !13
+  %.decomposed = sub i32 %12, %17, !dbg !13
+  %18 = shl nsw i32 %.decomposed, 7, !dbg !14
+  %19 = or disjoint i32 %18, %15, !dbg !15
+  %20 = mul i32 %16, 12288, !dbg !16
+  %21 = add i32 %19, %20, !dbg !17
+  %22 = sext i32 %21 to i64, !dbg !18
+  %23 = getelementptr bfloat, ptr addrspace(1) %0, i64 %22, !dbg !18
+  %24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !19
+  %25 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %23, i64 %24, i1 true) #4, !dbg !19
+  %26 = extractvalue { i32, i32 } %25, 0, !dbg !19
+  %27 = bitcast i32 %26 to <2 x bfloat>, !dbg !19
+  %28 = extractvalue { i32, i32 } %25, 1, !dbg !19
+  %29 = bitcast i32 %28 to <2 x bfloat>, !dbg !19
+  %30 = extractelement <2 x bfloat> %27, i64 0, !dbg !19
+  %31 = extractelement <2 x bfloat> %27, i64 1, !dbg !19
+  %32 = extractelement <2 x bfloat> %29, i64 0, !dbg !19
+  %33 = extractelement <2 x bfloat> %29, i64 1, !dbg !19
+  %34 = fpext bfloat %30 to float, !dbg !20
+  %35 = fpext bfloat %31 to float, !dbg !20
+  %36 = fpext bfloat %32 to float, !dbg !20
+  %37 = fpext bfloat %33 to float, !dbg !20
+  %38 = fmul float %34, %34, !dbg !21
+  %39 = fmul float %35, %35, !dbg !21
+  %40 = fmul float %36, %36, !dbg !21
+  %41 = fmul float %37, %37, !dbg !21
+  %42 = fadd float %38, %39, !dbg !22
+  %43 = fadd float %40, %42, !dbg !22
+  %44 = fadd float %41, %43, !dbg !22
+  %45 = bitcast float %44 to i32, !dbg !25
+  %46 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %45, i32 16, i32 31), !dbg !25
+  %47 = bitcast i32 %46 to float, !dbg !25
+  %48 = fadd float %44, %47, !dbg !22
+  %49 = bitcast float %48 to i32, !dbg !25
+  %50 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %49, i32 8, i32 31), !dbg !25
+  %51 = bitcast i32 %50 to float, !dbg !25
+  %52 = fadd float %48, %51, !dbg !22
+  %53 = bitcast float %52 to i32, !dbg !25
+  %54 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %53, i32 4, i32 31), !dbg !25
+  %55 = bitcast i32 %54 to float, !dbg !25
+  %56 = fadd float %52, %55, !dbg !22
+  %57 = bitcast float %56 to i32, !dbg !25
+  %58 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %57, i32 2, i32 31), !dbg !25
+  %59 = bitcast i32 %58 to float, !dbg !25
+  %60 = fadd float %56, %59, !dbg !22
+  %61 = bitcast float %60 to i32, !dbg !25
+  %62 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %61, i32 1, i32 31), !dbg !25
+  %63 = bitcast i32 %62 to float, !dbg !25
+  %64 = fadd float %60, %63, !dbg !22
+  %65 = lshr exact i32 %10, 3, !dbg !28
+  %66 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %65, !dbg !28
+  store float %64, ptr addrspace(3) %66, align 4, !dbg !28
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28
+  %67 = shl nuw nsw i32 %11, 2, !dbg !28
+  %68 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %67, !dbg !28
+  %69 = load i32, ptr addrspace(3) %68, align 4, !dbg !28
+  %70 = sext i32 %13 to i64, !dbg !29
+  %71 = getelementptr float, ptr addrspace(1) %1, i64 %70, !dbg !29
+  %72 = and i32 %9, 62, !dbg !30
+  %73 = icmp eq i32 %72, 0, !dbg !30
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %69, ptr addrspace(1) %71, i1 %73) #4, !dbg !30
+  ret void, !dbg !31
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
+
+attributes #0 = { nounwind "nvvm.reqntid"="64" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { convergent nocallback nounwind }
+attributes #4 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm_view_1", linkageName: "triton_red_fused__fused_rms_norm_view_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 23, column: 28, scope: !4)
+!8 = !DILocation(line: 23, column: 33, scope: !4)
+!9 = !DILocation(line: 24, column: 44, scope: !4)
+!10 = !DILocation(line: 24, column: 23, scope: !4)
+!11 = !DILocation(line: 26, column: 37, scope: !4)
+!12 = !DILocation(line: 29, column: 19, scope: !4)
+!13 = !DILocation(line: 28, column: 19, scope: !4)
+!14 = !DILocation(line: 38, column: 45, scope: !4)
+!15 = !DILocation(line: 38, column: 41, scope: !4)
+!16 = !DILocation(line: 38, column: 56, scope: !4)
+!17 = !DILocation(line: 38, column: 50, scope: !4)
+!18 = !DILocation(line: 38, column: 34, scope: !4)
+!19 = !DILocation(line: 38, column: 61, scope: !4)
+!20 = !DILocation(line: 38, column: 115, scope: !4)
+!21 = !DILocation(line: 40, column: 22, scope: !4)
+!22 = !DILocation(line: 263, column: 15, scope: !23, inlinedAt: !25)
+!23 = distinct !DILexicalBlockFile(scope: !4, file: !24, discriminator: 0)
+!24 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!25 = !DILocation(line: 293, column: 36, scope: !23, inlinedAt: !26)
+!26 = !DILocation(line: 44, column: 25, scope: !27)
+!27 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0)
+!28 = !DILocation(line: 44, column: 28, scope: !4)
+!29 = !DILocation(line: 45, column: 25, scope: !4)
+!30 = !DILocation(line: 45, column: 36, scope: !4)
+!31 = !DILocation(line: 45, column: 4, scope: !4)
diff --git a/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.ptx b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..87e167b9babc8e2c5bfd09dae9831acb29db0fff
--- /dev/null
+++ b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.ptx
@@ -0,0 +1,506 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused__fused_rms_norm_view_1 // -- Begin function triton_red_fused__fused_rms_norm_view_1
+.extern .shared .align 16 .b8 global_smem[];
+                                        // @triton_red_fused__fused_rms_norm_view_1
+.visible .entry triton_red_fused__fused_rms_norm_view_1(
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_1,
+	.param .u32 triton_red_fused__fused_rms_norm_view_1_param_2,
+	.param .u32 triton_red_fused__fused_rms_norm_view_1_param_3,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_4,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_5
+)
+.reqntid 64
+{
+	.reg .pred 	%p<3>;
+	.reg .b16 	%rs<5>;
+	.reg .b32 	%r<48>;
+	.reg .b64 	%rd<6>;
+	.loc	1 18 0                          // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd4, [triton_red_fused__fused_rms_norm_view_1_param_0];
+	ld.param.b64 	%rd5, [triton_red_fused__fused_rms_norm_view_1_param_1];
+$L__tmp0:
+	.loc	1 23 28                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:23:28
+	mov.u32 	%r5, %ctaid.x;
+	.loc	1 23 33                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:23:33
+	shl.b32 	%r6, %r5, 1;
+	.loc	1 24 44                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:44
+	mov.u32 	%r7, %tid.x;
+	and.b32 	%r8, %r7, 32;
+	bfe.u32 	%r9, %r7, 5, 1;
+	and.b32 	%r10, %r7, 1;
+	.loc	1 24 23                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:23
+	or.b32 	%r11, %r9, %r6;
+	or.b32 	%r12, %r6, %r10;
+	.loc	1 26 37                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:26:37
+	shl.b32 	%r13, %r7, 2;
+	and.b32 	%r14, %r13, 124;
+	.loc	1 29 19                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:29:19
+	bfe.s32 	%r15, %r5, 30, 1;
+	shr.u32 	%r16, %r15, 27;
+	add.s32 	%r17, %r11, %r16;
+	shr.u32 	%r18, %r17, 5;
+	.loc	1 28 19                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:28:19
+	and.b32 	%r19, %r17, 33554400;
+	sub.s32 	%r20, %r11, %r19;
+	.loc	1 38 45                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:45
+	shl.b32 	%r21, %r20, 7;
+	.loc	1 38 41                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:41
+	or.b32 	%r22, %r21, %r14;
+	.loc	1 38 50                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:50
+	mad.lo.s32 	%r23, %r18, 12288, %r22;
+	.loc	1 38 34                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:34
+	mad.wide.s32 	%rd1, %r23, 2, %rd4;
+	.loc	1 38 61                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:61
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r3, 0;
+	mov.pred 	%p1, -1;
+	// begin inline asm
+	mov.u32 %r1, %r3;
+	mov.u32 %r2, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	mov.b32 	{%rs1, %rs2}, %r1;
+	mov.b32 	{%rs3, %rs4}, %r2;
+	.loc	1 38 115                        // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:115
+	cvt.f32.bf16 	%r24, %rs1;
+	cvt.f32.bf16 	%r25, %rs2;
+	cvt.f32.bf16 	%r26, %rs3;
+	cvt.f32.bf16 	%r27, %rs4;
+	.loc	1 40 22                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:40:22
+	mul.f32 	%r28, %r25, %r25;
+$L__tmp1:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	fma.rn.f32 	%r29, %r24, %r24, %r28;
+	fma.rn.f32 	%r30, %r26, %r26, %r29;
+	fma.rn.f32 	%r31, %r27, %r27, %r30;
+$L__tmp2:
+	.loc	2 293 36                        // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ]
+	shfl.sync.bfly.b32 	%r32, %r31, 16, 31, -1;
+$L__tmp3:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	add.f32 	%r33, %r31, %r32;
+$L__tmp4:
+	.loc	2 293 36                        // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ]
+	shfl.sync.bfly.b32 	%r34, %r33, 8, 31, -1;
+$L__tmp5:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	add.f32 	%r35, %r33, %r34;
+$L__tmp6:
+	.loc	2 293 36                        // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ]
+	shfl.sync.bfly.b32 	%r36, %r35, 4, 31, -1;
+$L__tmp7:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	add.f32 	%r37, %r35, %r36;
+$L__tmp8:
+	.loc	2 293 36                        // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ]
+	shfl.sync.bfly.b32 	%r38, %r37, 2, 31, -1;
+$L__tmp9:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	add.f32 	%r39, %r37, %r38;
+$L__tmp10:
+	.loc	2 293 36                        // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ]
+	shfl.sync.bfly.b32 	%r40, %r39, 1, 31, -1;
+$L__tmp11:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	add.f32 	%r41, %r39, %r40;
+$L__tmp12:
+	.loc	1 44 28                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:28
+	shr.u32 	%r42, %r8, 3;
+	mov.b32 	%r43, global_smem;
+	add.s32 	%r44, %r43, %r42;
+	st.shared.b32 	[%r44], %r41;
+	bar.sync 	0;
+	shl.b32 	%r45, %r10, 2;
+	add.s32 	%r46, %r43, %r45;
+	ld.shared.b32 	%r4, [%r46];
+	.loc	1 45 25                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:25
+	mad.wide.s32 	%rd3, %r12, 4, %rd5;
+	.loc	1 45 36                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:36
+	and.b32 	%r47, %r7, 62;
+	setp.eq.b32 	%p2, %r47, 0;
+	// begin inline asm
+	@%p2 st.global.b32 [ %rd3 + 0 ], { %r4 };
+	// end inline asm
+	.loc	1 45 4                          // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:4
+	ret;
+$L__tmp13:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 5                                   // DW_FORM_data2
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 339                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x14c DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 113
+.b8 105
+.b8 116
+.b8 120
+.b8 53
+.b8 104
+.b8 119
+.b8 117
+.b8 112
+.b8 107
+.b8 98
+.b8 106
+.b8 109
+.b8 99
+.b8 115
+.b8 111
+.b8 121
+.b8 107
+.b8 113
+.b8 101
+.b8 112
+.b8 122
+.b8 113
+.b8 99
+.b8 55
+.b8 122
+.b8 99
+.b8 120
+.b8 106
+.b8 99
+.b8 98
+.b8 53
+.b8 97
+.b8 99
+.b8 113
+.b8 107
+.b8 105
+.b8 55
+.b8 122
+.b8 99
+.b8 115
+.b8 106
+.b8 105
+.b8 102
+.b8 114
+.b8 110
+.b8 114
+.b8 122
+.b8 99
+.b8 114
+.b8 114
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 113
+.b8 105
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x2a DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 114
+.b8 109
+.b8 115
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 118
+.b8 105
+.b8 101
+.b8 119
+.b8 95
+.b8 49
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x10e:0x48 DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x123:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp12                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 44                                  // DW_AT_call_line
+.b8 25                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x13b:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp12                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.source b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.source
new file mode 100644
index 0000000000000000000000000000000000000000..21b1b4c3a3a4ad24327b73f7cacdb6bdcf03a3c1
--- /dev/null
+++ b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.source
@@ -0,0 +1,167 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0)
+#loc35 = loc(unknown)
+#loc38 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0)
+#loc42 = loc("in_ptr0"(#loc))
+#loc43 = loc("out_ptr0"(#loc))
+#loc44 = loc("xnumel"(#loc))
+#loc45 = loc("r0_numel"(#loc))
+#loc74 = loc("input"(#loc33))
+#loc75 = loc("a"(#loc38))
+#loc76 = loc("b"(#loc38))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 65536 : i32 loc(#loc46)
+    %r0_numel_1 = arith.constant 128 : i32 loc(#loc47)
+    %xoffset = tt.get_program_id x : i32 loc(#loc48)
+    %xoffset_2 = arith.constant 2 : i32 loc(#loc49)
+    %xoffset_3 = arith.constant 2 : i32 loc(#loc49)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc49)
+    %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc50)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32> -> tensor<2x1xi32> loc(#loc51)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<2x1xi32> loc(#loc52)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<2x1xi32> loc(#loc52)
+    %xmask = arith.constant true loc(#loc53)
+    %xmask_8 = arith.constant dense<true> : tensor<2x128xi1> loc(#loc53)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc54)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc55)
+    %x0 = arith.constant 32 : i32 loc(#loc56)
+    %x0_10 = arith.constant 32 : i32 loc(#loc56)
+    %x0_11 = arith.constant dense<32> : tensor<2x1xi32> loc(#loc56)
+    %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<2x1xi32> loc(#loc56)
+    %x1 = arith.constant 32 : i32 loc(#loc57)
+    %x1_13 = arith.constant 32 : i32 loc(#loc57)
+    %x1_14 = arith.constant dense<32> : tensor<2x1xi32> loc(#loc57)
+    %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<2x1xi32> loc(#loc57)
+    %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc58)
+    %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc58)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc14)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc14)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14)
+    %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc14)
+    %3 = ub.poison : i32 loc(#loc14)
+    %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_19 = %_tmp4_16) -> (tensor<2x128xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc60)
+      %r0_index_20 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc60)
+      %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc61)
+      %r0_mask_21 = arith.cmpi slt, %r0_index_20, %r0_mask : tensor<1x128xi32> loc(#loc61)
+      %tmp0 = arith.constant 128 : i32 loc(#loc62)
+      %tmp0_22 = arith.constant 128 : i32 loc(#loc62)
+      %tmp0_23 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc62)
+      %tmp0_24 = arith.muli %tmp0_23, %x0_12 : tensor<2x1xi32> loc(#loc62)
+      %tmp0_25 = tt.broadcast %r0_index_20 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc63)
+      %tmp0_26 = tt.broadcast %tmp0_24 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc63)
+      %tmp0_27 = arith.addi %tmp0_25, %tmp0_26 : tensor<2x128xi32> loc(#loc63)
+      %tmp0_28 = arith.constant 12288 : i32 loc(#loc64)
+      %tmp0_29 = arith.constant 12288 : i32 loc(#loc64)
+      %tmp0_30 = arith.constant dense<12288> : tensor<2x1xi32> loc(#loc64)
+      %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<2x1xi32> loc(#loc64)
+      %tmp0_32 = tt.broadcast %tmp0_31 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc65)
+      %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<2x128xi32> loc(#loc65)
+      %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>> loc(#loc66)
+      %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc66)
+      %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc67)
+      %tmp0_37 = tt.broadcast %r0_mask_21 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc67)
+      %tmp0_38 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc67)
+      %tmp0_39 = arith.truncf %tmp0_38 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc67)
+      %tmp0_40 = tt.load %tmp0_35, %tmp0_37, %tmp0_39 evictionPolicy = evict_first : tensor<2x128x!tt.ptr<bf16>> loc(#loc67)
+      %tmp0_41 = arith.extf %tmp0_40 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc68)
+      %tmp2 = arith.mulf %tmp0_41, %tmp0_41 : tensor<2x128xf32> loc(#loc69)
+      %tmp5 = arith.addf %_tmp4_19, %tmp2 : tensor<2x128xf32> loc(#loc70)
+      %_tmp4_42 = tt.broadcast %r0_mask_21 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc71)
+      %_tmp4_43 = arith.select %_tmp4_42, %tmp5, %_tmp4_19 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc71)
+      scf.yield %_tmp4_43 : tensor<2x128xf32> loc(#loc27)
+    } loc(#loc59)
+    %tmp4 = tt.call @"triton.language.standard.sum__fp32S2_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<2x128xf32>) -> tensor<2xf32> loc(#loc72)
+    %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<2xf32> -> tensor<2x1xf32> loc(#loc73)
+    %4 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<2x1x!tt.ptr<f32>> loc(#loc30)
+    %5 = tt.addptr %4, %xindex_7 : tensor<2x1x!tt.ptr<f32>>, tensor<2x1xi32> loc(#loc30)
+    tt.store %5, %tmp4_18 : tensor<2x1x!tt.ptr<f32>> loc(#loc31)
+    tt.return loc(#loc32)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.sum__fp32S2_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<2x128xf32> loc("input"(#loc33))) -> tensor<2xf32> attributes {noinline = false} {
+    %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc34)
+      tt.reduce.return %2 : f32 loc(#loc34)
+    }) : (tensor<2x128xf32>) -> tensor<2xf32> loc(#loc34)
+    tt.return %0 : tensor<2xf32> loc(#loc36)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<2xf32> loc(#loc37)
+    tt.return %1 : tensor<2xf32> loc(#loc37)
+  } loc(#loc33)
+  tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc38)), %b: f32 loc("b"(#loc38))) -> f32 attributes {noinline = false} {
+    %0 = arith.addf %a, %b : f32 loc(#loc39)
+    tt.return %0 : f32 loc(#loc40)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : f32 loc(#loc41)
+    tt.return %1 : f32 loc(#loc41)
+  } loc(#loc38)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":25:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":30:43)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:8)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11)
+#loc37 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4)
+#loc39 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc40 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11)
+#loc41 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4)
+#loc46 = loc("xnumel"(#loc1))
+#loc47 = loc("r0_numel"(#loc2))
+#loc48 = loc("xoffset"(#loc3))
+#loc49 = loc("xoffset"(#loc4))
+#loc50 = loc("xindex"(#loc5))
+#loc51 = loc("xindex"(#loc6))
+#loc52 = loc("xindex"(#loc7))
+#loc53 = loc("xmask"(#loc8))
+#loc54 = loc("r0_base"(#loc9))
+#loc55 = loc("r0_base"(#loc10))
+#loc56 = loc("x0"(#loc11))
+#loc57 = loc("x1"(#loc12))
+#loc58 = loc("_tmp4"(#loc13))
+#loc59 = loc("_tmp4"(#loc14))
+#loc60 = loc("r0_index"(#loc15))
+#loc61 = loc("r0_mask"(#loc16))
+#loc62 = loc("tmp0"(#loc17))
+#loc63 = loc("tmp0"(#loc18))
+#loc64 = loc("tmp0"(#loc19))
+#loc65 = loc("tmp0"(#loc20))
+#loc66 = loc("tmp0"(#loc21))
+#loc67 = loc("tmp0"(#loc22))
+#loc68 = loc("tmp0"(#loc23))
+#loc69 = loc("tmp2"(#loc24))
+#loc70 = loc("tmp5"(#loc25))
+#loc71 = loc("_tmp4"(#loc26))
+#loc72 = loc("tmp4"(#loc28))
+#loc73 = loc("tmp4"(#loc29))
diff --git a/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.ttgir b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..66c29e1103cd2585e55b8e2980219703a9faa89b
--- /dev/null
+++ b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.ttgir
@@ -0,0 +1,108 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [1, 2], order = [0, 1]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0)
+#loc1 = loc(unknown)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25)
+#loc27 = loc("in_ptr0"(#loc))
+#loc28 = loc("out_ptr0"(#loc))
+#loc29 = loc("xnumel"(#loc))
+#loc30 = loc("r0_numel"(#loc))
+#loc49 = loc("tmp4"(#loc21))
+#loc52 = loc(callsite(#loc1 at #loc49))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<128> : tensor<2x1xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<12288> : tensor<2x1xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<32> : tensor<2x1xi32, #blocked> loc(#loc1)
+    %c2_i32 = arith.constant 2 : i32 loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<2x128xbf16, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<2x128xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc31)
+    %xoffset_5 = arith.muli %xoffset, %c2_i32 : i32 loc(#loc32)
+    %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc33)
+    %xindex_6 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc33)
+    %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<2x1xi32, #blocked> loc(#loc33)
+    %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<2x1xi32, #blocked1> loc(#loc33)
+    %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<2x1xi32, #blocked> loc(#loc34)
+    %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<2x1xi32, #blocked1> loc(#loc34)
+    %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<2x1xi32, #blocked> loc(#loc34)
+    %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<2x1xi32, #blocked1> loc(#loc34)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc35)
+    %r0_base_13 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc35)
+    %x0 = arith.remsi %xindex_11, %cst_2 : tensor<2x1xi32, #blocked> loc(#loc36)
+    %x1 = arith.divsi %xindex_11, %cst_2 : tensor<2x1xi32, #blocked> loc(#loc37)
+    %r0_mask = arith.cmpi slt, %r0_base_13, %cst : tensor<1x128xi32, #blocked> loc(#loc38)
+    %tmp0 = arith.muli %x0, %cst_0 : tensor<2x1xi32, #blocked> loc(#loc39)
+    %tmp0_14 = tt.broadcast %r0_base_13 : tensor<1x128xi32, #blocked> -> tensor<2x128xi32, #blocked> loc(#loc40)
+    %tmp0_15 = tt.broadcast %tmp0 : tensor<2x1xi32, #blocked> -> tensor<2x128xi32, #blocked> loc(#loc40)
+    %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<2x128xi32, #blocked> loc(#loc40)
+    %tmp0_17 = arith.muli %x1, %cst_1 : tensor<2x1xi32, #blocked> loc(#loc41)
+    %tmp0_18 = tt.broadcast %tmp0_17 : tensor<2x1xi32, #blocked> -> tensor<2x128xi32, #blocked> loc(#loc42)
+    %tmp0_19 = arith.addi %tmp0_16, %tmp0_18 : tensor<2x128xi32, #blocked> loc(#loc42)
+    %tmp0_20 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>, #blocked> loc(#loc43)
+    %tmp0_21 = tt.addptr %tmp0_20, %tmp0_19 : tensor<2x128x!tt.ptr<bf16>, #blocked>, tensor<2x128xi32, #blocked> loc(#loc43)
+    %tmp0_22 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked> -> tensor<2x128xi1, #blocked> loc(#loc44)
+    %tmp0_23 = tt.load %tmp0_21, %tmp0_22, %cst_3 evictionPolicy = evict_first : tensor<2x128x!tt.ptr<bf16>, #blocked> loc(#loc44)
+    %tmp0_24 = arith.extf %tmp0_23 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked> loc(#loc45)
+    %tmp2 = arith.mulf %tmp0_24, %tmp0_24 : tensor<2x128xf32, #blocked> loc(#loc46)
+    %tmp5 = arith.addf %tmp2, %cst_4 : tensor<2x128xf32, #blocked> loc(#loc47)
+    %_tmp4 = arith.select %tmp0_22, %tmp5, %cst_4 : tensor<2x128xi1, #blocked>, tensor<2x128xf32, #blocked> loc(#loc48)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_27: f32 loc(callsite(#loc1 at #loc49)), %tmp4_28: f32 loc(callsite(#loc1 at #loc49))):
+      %tmp4_29 = arith.addf %tmp4_27, %tmp4_28 : f32 loc(#loc53)
+      tt.reduce.return %tmp4_29 : f32 loc(#loc51)
+    }) : (tensor<2x128xf32, #blocked>) -> tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc51)
+    %tmp4_25 = ttg.convert_layout %tmp4 : tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc50)
+    %tmp4_26 = tt.expand_dims %tmp4_25 {axis = 1 : i32} : tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<2x1xf32, #blocked1> loc(#loc50)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<2x1x!tt.ptr<f32>, #blocked1> loc(#loc24)
+    %1 = tt.addptr %0, %xindex_12 : tensor<2x1x!tt.ptr<f32>, #blocked1>, tensor<2x1xi32, #blocked1> loc(#loc24)
+    tt.store %1, %tmp4_26 : tensor<2x1x!tt.ptr<f32>, #blocked1> loc(#loc25)
+    tt.return loc(#loc26)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4)
+#loc31 = loc("xoffset"(#loc2))
+#loc32 = loc("xoffset"(#loc3))
+#loc33 = loc("xindex"(#loc4))
+#loc34 = loc("xindex"(#loc5))
+#loc35 = loc("r0_base"(#loc6))
+#loc36 = loc("x0"(#loc7))
+#loc37 = loc("x1"(#loc8))
+#loc38 = loc("r0_mask"(#loc9))
+#loc39 = loc("tmp0"(#loc10))
+#loc40 = loc("tmp0"(#loc11))
+#loc41 = loc("tmp0"(#loc12))
+#loc42 = loc("tmp0"(#loc13))
+#loc43 = loc("tmp0"(#loc14))
+#loc44 = loc("tmp0"(#loc15))
+#loc45 = loc("tmp0"(#loc16))
+#loc46 = loc("tmp2"(#loc17))
+#loc47 = loc("tmp5"(#loc18))
+#loc48 = loc("_tmp4"(#loc19))
+#loc50 = loc("tmp4"(#loc23))
+#loc51 = loc(callsite(#loc20 at #loc49))
+#loc53 = loc(callsite(#loc22 at #loc51))
diff --git a/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.ttir b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..44e28b7eb9c38afd708cdb430c86636f80325231
--- /dev/null
+++ b/triton/25HRGTEVYDLN7RNHIFI6FAR4GDSWMTIFSL53NM55AG547GFRGJZA/triton_red_fused__fused_rms_norm_view_1.ttir
@@ -0,0 +1,105 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0)
+#loc2 = loc(unknown)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25)
+#loc29 = loc("in_ptr0"(#loc))
+#loc30 = loc("out_ptr0"(#loc))
+#loc31 = loc("xnumel"(#loc))
+#loc32 = loc("r0_numel"(#loc))
+#loc53 = loc("tmp4"(#loc23))
+#loc56 = loc(callsite(#loc2 at #loc53))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %tmp0 = arith.constant dense<0.000000e+00> : tensor<2x128xbf16> loc(#loc33)
+    %cst = arith.constant dense<12288> : tensor<2x1xi32> loc(#loc2)
+    %cst_0 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc2)
+    %cst_1 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc2)
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc2)
+    %cst_3 = arith.constant dense<32> : tensor<2x1xi32> loc(#loc2)
+    %c2_i32 = arith.constant 2 : i32 loc(#loc2)
+    %xoffset = tt.get_program_id x : i32 loc(#loc34)
+    %xoffset_4 = arith.muli %xoffset, %c2_i32 : i32 loc(#loc35)
+    %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc36)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32> -> tensor<2x1xi32> loc(#loc37)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<2x1xi32> loc(#loc38)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<2x1xi32> loc(#loc38)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc39)
+    %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc40)
+    %x0 = arith.remsi %xindex_7, %cst_3 : tensor<2x1xi32> loc(#loc41)
+    %x1 = arith.divsi %xindex_7, %cst_3 : tensor<2x1xi32> loc(#loc42)
+    %r0_mask = arith.cmpi slt, %r0_base_8, %cst_1 : tensor<1x128xi32> loc(#loc43)
+    %tmp0_9 = arith.muli %x0, %cst_0 : tensor<2x1xi32> loc(#loc44)
+    %tmp0_10 = tt.broadcast %r0_base_8 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc45)
+    %tmp0_11 = tt.broadcast %tmp0_9 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc45)
+    %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<2x128xi32> loc(#loc45)
+    %tmp0_13 = arith.muli %x1, %cst : tensor<2x1xi32> loc(#loc46)
+    %tmp0_14 = tt.broadcast %tmp0_13 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc47)
+    %tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<2x128xi32> loc(#loc47)
+    %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>> loc(#loc48)
+    %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc48)
+    %tmp0_18 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc33)
+    %tmp0_19 = tt.load %tmp0_17, %tmp0_18, %tmp0 evictionPolicy = evict_first : tensor<2x128x!tt.ptr<bf16>> loc(#loc33)
+    %tmp0_20 = arith.extf %tmp0_19 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc49)
+    %tmp2 = arith.mulf %tmp0_20, %tmp0_20 : tensor<2x128xf32> loc(#loc50)
+    %tmp5 = arith.addf %tmp2, %cst_2 : tensor<2x128xf32> loc(#loc51)
+    %_tmp4 = arith.select %tmp0_18, %tmp5, %cst_2 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc52)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_22: f32 loc(callsite(#loc2 at #loc53)), %tmp4_23: f32 loc(callsite(#loc2 at #loc53))):
+      %tmp4_24 = arith.addf %tmp4_22, %tmp4_23 : f32 loc(#loc57)
+      tt.reduce.return %tmp4_24 : f32 loc(#loc55)
+    }) : (tensor<2x128xf32>) -> tensor<2xf32> loc(#loc55)
+    %tmp4_21 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<2xf32> -> tensor<2x1xf32> loc(#loc54)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<2x1x!tt.ptr<f32>> loc(#loc26)
+    %1 = tt.addptr %0, %xindex_7 : tensor<2x1x!tt.ptr<f32>>, tensor<2x1xi32> loc(#loc26)
+    tt.store %1, %tmp4_21 : tensor<2x1x!tt.ptr<f32>> loc(#loc27)
+    tt.return loc(#loc28)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:27)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4)
+#loc33 = loc("tmp0"(#loc1))
+#loc34 = loc("xoffset"(#loc3))
+#loc35 = loc("xoffset"(#loc4))
+#loc36 = loc("xindex"(#loc5))
+#loc37 = loc("xindex"(#loc6))
+#loc38 = loc("xindex"(#loc7))
+#loc39 = loc("r0_base"(#loc8))
+#loc40 = loc("r0_base"(#loc9))
+#loc41 = loc("x0"(#loc10))
+#loc42 = loc("x1"(#loc11))
+#loc43 = loc("r0_mask"(#loc12))
+#loc44 = loc("tmp0"(#loc13))
+#loc45 = loc("tmp0"(#loc14))
+#loc46 = loc("tmp0"(#loc15))
+#loc47 = loc("tmp0"(#loc16))
+#loc48 = loc("tmp0"(#loc17))
+#loc49 = loc("tmp0"(#loc18))
+#loc50 = loc("tmp2"(#loc19))
+#loc51 = loc("tmp5"(#loc20))
+#loc52 = loc("_tmp4"(#loc21))
+#loc54 = loc("tmp4"(#loc25))
+#loc55 = loc(callsite(#loc22 at #loc53))
+#loc57 = loc(callsite(#loc24 at #loc55))
diff --git a/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/__grp__triton_red_fused__fused_rms_norm_view_1.json b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/__grp__triton_red_fused__fused_rms_norm_view_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..dd1f0f6ba8c6fbbe3583f5146e7473ad6cdf12c1
--- /dev/null
+++ b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/__grp__triton_red_fused__fused_rms_norm_view_1.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused__fused_rms_norm_view_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.source", "triton_red_fused__fused_rms_norm_view_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.ttir", "triton_red_fused__fused_rms_norm_view_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.ttgir", "triton_red_fused__fused_rms_norm_view_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.llir", "triton_red_fused__fused_rms_norm_view_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.ptx", "triton_red_fused__fused_rms_norm_view_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.cubin", "triton_red_fused__fused_rms_norm_view_1.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.json"}}
\ No newline at end of file
diff --git a/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.cubin b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..e87070faca30a45589ed7d5d5cd5916011544f24
Binary files /dev/null and b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.cubin differ
diff --git a/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.json b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..8fdca38b3acaf627358eb68862191313b7f624e8
--- /dev/null
+++ b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.json
@@ -0,0 +1 @@
+{"hash": "d8e0c84cc73610ffde4fe60a570ac315dcce568ed160cf675e002966ca2b3448", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm_view_1"}
\ No newline at end of file
diff --git a/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.llir b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.llir
new file mode 100644
index 0000000000000000000000000000000000000000..fc51d50082194daceaae04fbf4382e738745fc48
--- /dev/null
+++ b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.llir
@@ -0,0 +1,140 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused__fused_rms_norm_view_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 {
+  %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %8 = shl i32 %7, 6, !dbg !8
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %10 = and i32 %9, 126, !dbg !9
+  %11 = lshr exact i32 %10, 1, !dbg !9
+  %12 = or disjoint i32 %11, %8, !dbg !10
+  %13 = shl nuw nsw i32 %9, 2, !dbg !11
+  %14 = and i32 %13, 4, !dbg !11
+  %15 = sdiv i32 %12, 32, !dbg !12
+  %16 = mul i32 %15, 32, !dbg !13
+  %.decomposed = sub i32 %12, %16, !dbg !13
+  %17 = shl nsw i32 %.decomposed, 7, !dbg !14
+  %18 = mul i32 %15, 12288, !dbg !15
+  %19 = or disjoint i32 %17, %14
+  %20 = add i32 %19, %18
+  br label %21, !dbg !16
+
+21:                                               ; preds = %6, %21
+  %indvars.iv = phi i64 [ 0, %6 ], [ %indvars.iv.next, %21 ]
+  %22 = phi float [ 0.000000e+00, %6 ], [ %48, %21 ]
+  %23 = phi float [ 0.000000e+00, %6 ], [ %49, %21 ]
+  %24 = phi float [ 0.000000e+00, %6 ], [ %50, %21 ]
+  %25 = phi float [ 0.000000e+00, %6 ], [ %51, %21 ]
+  %26 = trunc nuw nsw i64 %indvars.iv to i32, !dbg !17
+  %27 = add i32 %20, %26, !dbg !17
+  %28 = sext i32 %27 to i64, !dbg !18
+  %29 = getelementptr bfloat, ptr addrspace(1) %0, i64 %28, !dbg !18
+  %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !19
+  %31 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %29, i64 %30, i1 true) #4, !dbg !19
+  %32 = extractvalue { i32, i32 } %31, 0, !dbg !19
+  %33 = bitcast i32 %32 to <2 x bfloat>, !dbg !19
+  %34 = extractvalue { i32, i32 } %31, 1, !dbg !19
+  %35 = bitcast i32 %34 to <2 x bfloat>, !dbg !19
+  %36 = extractelement <2 x bfloat> %33, i64 0, !dbg !19
+  %37 = extractelement <2 x bfloat> %33, i64 1, !dbg !19
+  %38 = extractelement <2 x bfloat> %35, i64 0, !dbg !19
+  %39 = extractelement <2 x bfloat> %35, i64 1, !dbg !19
+  %40 = fpext bfloat %36 to float, !dbg !20
+  %41 = fpext bfloat %37 to float, !dbg !20
+  %42 = fpext bfloat %38 to float, !dbg !20
+  %43 = fpext bfloat %39 to float, !dbg !20
+  %44 = fmul float %40, %40, !dbg !21
+  %45 = fmul float %41, %41, !dbg !21
+  %46 = fmul float %42, %42, !dbg !21
+  %47 = fmul float %43, %43, !dbg !21
+  %48 = fadd float %22, %44, !dbg !22
+  %49 = fadd float %23, %45, !dbg !22
+  %50 = fadd float %24, %46, !dbg !22
+  %51 = fadd float %25, %47, !dbg !22
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8, !dbg !16
+  %52 = icmp samesign ult i64 %indvars.iv, 120, !dbg !16
+  br i1 %52, label %21, label %53, !dbg !16
+
+53:                                               ; preds = %21
+  %54 = and i32 %9, 63, !dbg !9
+  %55 = or disjoint i32 %8, %54, !dbg !10
+  %56 = fadd float %48, %49, !dbg !23
+  %57 = fadd float %50, %56, !dbg !23
+  %58 = fadd float %51, %57, !dbg !23
+  %59 = bitcast float %58 to i32, !dbg !26
+  %60 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %59, i32 1, i32 31), !dbg !26
+  %61 = bitcast i32 %60 to float, !dbg !26
+  %62 = fadd float %58, %61, !dbg !23
+  %63 = shl nuw nsw i32 %10, 1, !dbg !29
+  %64 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %63, !dbg !29
+  store float %62, ptr addrspace(3) %64, align 4, !dbg !29
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !29
+  %65 = shl nuw nsw i32 %54, 2, !dbg !29
+  %66 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %65, !dbg !29
+  %67 = load i32, ptr addrspace(3) %66, align 4, !dbg !29
+  %68 = sext i32 %55 to i64, !dbg !30
+  %69 = getelementptr float, ptr addrspace(1) %1, i64 %68, !dbg !30
+  %70 = and i32 %9, 64, !dbg !31
+  %71 = icmp eq i32 %70, 0, !dbg !31
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %67, ptr addrspace(1) %69, i1 %71) #4, !dbg !31
+  ret void, !dbg !32
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
+
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { convergent nocallback nounwind }
+attributes #4 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm_view_1", linkageName: "triton_red_fused__fused_rms_norm_view_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 23, column: 28, scope: !4)
+!8 = !DILocation(line: 23, column: 33, scope: !4)
+!9 = !DILocation(line: 24, column: 44, scope: !4)
+!10 = !DILocation(line: 24, column: 23, scope: !4)
+!11 = !DILocation(line: 26, column: 37, scope: !4)
+!12 = !DILocation(line: 29, column: 19, scope: !4)
+!13 = !DILocation(line: 28, column: 19, scope: !4)
+!14 = !DILocation(line: 38, column: 45, scope: !4)
+!15 = !DILocation(line: 38, column: 56, scope: !4)
+!16 = !DILocation(line: 32, column: 43, scope: !4)
+!17 = !DILocation(line: 38, column: 50, scope: !4)
+!18 = !DILocation(line: 38, column: 34, scope: !4)
+!19 = !DILocation(line: 38, column: 61, scope: !4)
+!20 = !DILocation(line: 38, column: 115, scope: !4)
+!21 = !DILocation(line: 40, column: 22, scope: !4)
+!22 = !DILocation(line: 42, column: 23, scope: !4)
+!23 = !DILocation(line: 263, column: 15, scope: !24, inlinedAt: !26)
+!24 = distinct !DILexicalBlockFile(scope: !4, file: !25, discriminator: 0)
+!25 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!26 = !DILocation(line: 293, column: 36, scope: !24, inlinedAt: !27)
+!27 = !DILocation(line: 44, column: 25, scope: !28)
+!28 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0)
+!29 = !DILocation(line: 44, column: 28, scope: !4)
+!30 = !DILocation(line: 45, column: 25, scope: !4)
+!31 = !DILocation(line: 45, column: 36, scope: !4)
+!32 = !DILocation(line: 45, column: 4, scope: !4)
diff --git a/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.ptx b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..9a23b53fb69f273a74c936c42a0d5c1c69b6691d
--- /dev/null
+++ b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.ptx
@@ -0,0 +1,499 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused__fused_rms_norm_view_1 // -- Begin function triton_red_fused__fused_rms_norm_view_1
+.extern .shared .align 16 .b8 global_smem[];
+                                        // @triton_red_fused__fused_rms_norm_view_1
+.visible .entry triton_red_fused__fused_rms_norm_view_1(
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_1,
+	.param .u32 triton_red_fused__fused_rms_norm_view_1_param_2,
+	.param .u32 triton_red_fused__fused_rms_norm_view_1_param_3,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_4,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_5
+)
+.reqntid 128
+{
+	.reg .pred 	%p<4>;
+	.reg .b16 	%rs<5>;
+	.reg .b32 	%r<45>;
+	.reg .b64 	%rd<9>;
+	.loc	1 18 0                          // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd3, [triton_red_fused__fused_rms_norm_view_1_param_1];
+	ld.param.b64 	%rd2, [triton_red_fused__fused_rms_norm_view_1_param_0];
+$L__tmp0:
+	.loc	1 23 28                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:23:28
+	mov.u32 	%r4, %ctaid.x;
+	.loc	1 23 33                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:23:33
+	shl.b32 	%r1, %r4, 6;
+	.loc	1 24 44                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:44
+	mov.u32 	%r2, %tid.x;
+	and.b32 	%r3, %r2, 126;
+	bfe.u32 	%r5, %r2, 1, 6;
+	.loc	1 24 23                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:23
+	or.b32 	%r6, %r5, %r1;
+	.loc	1 26 37                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:26:37
+	shl.b32 	%r7, %r2, 2;
+	and.b32 	%r8, %r7, 4;
+	.loc	1 29 19                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:29:19
+	bfe.s32 	%r9, %r4, 25, 1;
+	shr.u32 	%r10, %r9, 27;
+	add.s32 	%r11, %r6, %r10;
+	shr.u32 	%r12, %r11, 5;
+	.loc	1 32 43                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:32:43
+	add.s32 	%r13, %r4, %r12;
+	shl.b32 	%r14, %r13, 13;
+	shl.b32 	%r15, %r5, 7;
+	or.b32 	%r16, %r14, %r15;
+	or.b32 	%r17, %r16, %r8;
+	cvt.u64.u32 	%rd1, %r17;
+	mov.b32 	%r41, 0f00000000;
+	mov.b64 	%rd8, -8;
+	mov.b32 	%r42, %r41;
+	mov.b32 	%r43, %r41;
+	mov.b32 	%r44, %r41;
+$L__BB0_1:                              // =>This Inner Loop Header: Depth=1
+	.loc	1 38 34                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:34
+	add.s64 	%rd6, %rd1, %rd8;
+	cvt.u32.u64 	%r21, %rd6;
+	add.s32 	%r22, %r21, 8;
+	mad.wide.s32 	%rd5, %r22, 2, %rd2;
+	.loc	1 38 61                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:61
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd4, 1.0;
+	// end inline asm
+	mov.b32 	%r20, 0;
+	mov.pred 	%p1, -1;
+	// begin inline asm
+	mov.u32 %r18, %r20;
+	mov.u32 %r19, %r20;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r18, %r19 }, [ %rd5 + 0 ], %rd4;
+	// end inline asm
+	mov.b32 	{%rs1, %rs2}, %r18;
+	mov.b32 	{%rs3, %rs4}, %r19;
+	.loc	1 38 115                        // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:115
+	cvt.f32.bf16 	%r23, %rs1;
+	cvt.f32.bf16 	%r24, %rs2;
+	cvt.f32.bf16 	%r25, %rs3;
+	cvt.f32.bf16 	%r26, %rs4;
+	.loc	1 42 23                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:42:23
+	fma.rn.f32 	%r41, %r23, %r23, %r41;
+	fma.rn.f32 	%r42, %r24, %r24, %r42;
+	fma.rn.f32 	%r43, %r25, %r25, %r43;
+	fma.rn.f32 	%r44, %r26, %r26, %r44;
+	.loc	1 32 43                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:32:43
+	add.s64 	%rd8, %rd8, 8;
+	setp.lt.u64 	%p2, %rd8, 120;
+	@%p2 bra 	$L__BB0_1;
+// %bb.2:
+	.loc	1 24 44                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:44
+	and.b32 	%r28, %r2, 63;
+	.loc	1 24 23                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:23
+	or.b32 	%r29, %r1, %r28;
+$L__tmp1:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	add.f32 	%r30, %r41, %r42;
+	add.f32 	%r31, %r43, %r30;
+	add.f32 	%r32, %r44, %r31;
+$L__tmp2:
+	.loc	2 293 36                        // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ]
+	shfl.sync.bfly.b32 	%r33, %r32, 1, 31, -1;
+$L__tmp3:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	add.f32 	%r34, %r32, %r33;
+$L__tmp4:
+	.loc	1 44 28                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:28
+	shl.b32 	%r35, %r3, 1;
+	mov.b32 	%r36, global_smem;
+	add.s32 	%r37, %r36, %r35;
+	st.shared.b32 	[%r37], %r34;
+	bar.sync 	0;
+	shl.b32 	%r38, %r28, 2;
+	add.s32 	%r39, %r36, %r38;
+	ld.shared.b32 	%r27, [%r39];
+	.loc	1 45 25                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:25
+	mad.wide.s32 	%rd7, %r29, 4, %rd3;
+	.loc	1 45 36                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:36
+	and.b32 	%r40, %r2, 64;
+	setp.eq.b32 	%p3, %r40, 0;
+	// begin inline asm
+	@%p3 st.global.b32 [ %rd7 + 0 ], { %r27 };
+	// end inline asm
+	.loc	1 45 4                          // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:4
+	ret;
+$L__tmp5:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 5                                   // DW_FORM_data2
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 339                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x14c DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 113
+.b8 105
+.b8 116
+.b8 120
+.b8 53
+.b8 104
+.b8 119
+.b8 117
+.b8 112
+.b8 107
+.b8 98
+.b8 106
+.b8 109
+.b8 99
+.b8 115
+.b8 111
+.b8 121
+.b8 107
+.b8 113
+.b8 101
+.b8 112
+.b8 122
+.b8 113
+.b8 99
+.b8 55
+.b8 122
+.b8 99
+.b8 120
+.b8 106
+.b8 99
+.b8 98
+.b8 53
+.b8 97
+.b8 99
+.b8 113
+.b8 107
+.b8 105
+.b8 55
+.b8 122
+.b8 99
+.b8 115
+.b8 106
+.b8 105
+.b8 102
+.b8 114
+.b8 110
+.b8 114
+.b8 122
+.b8 99
+.b8 114
+.b8 114
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 113
+.b8 105
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x2a DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 114
+.b8 109
+.b8 115
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 118
+.b8 105
+.b8 101
+.b8 119
+.b8 95
+.b8 49
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x10e:0x48 DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x123:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp4                           // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 44                                  // DW_AT_call_line
+.b8 25                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x13b:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp4                           // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.source b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.source
new file mode 100644
index 0000000000000000000000000000000000000000..4f065d51891ed493c167e6a6ca7939ec6c30d04e
--- /dev/null
+++ b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.source
@@ -0,0 +1,167 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0)
+#loc35 = loc(unknown)
+#loc38 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0)
+#loc42 = loc("in_ptr0"(#loc))
+#loc43 = loc("out_ptr0"(#loc))
+#loc44 = loc("xnumel"(#loc))
+#loc45 = loc("r0_numel"(#loc))
+#loc74 = loc("input"(#loc33))
+#loc75 = loc("a"(#loc38))
+#loc76 = loc("b"(#loc38))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 65536 : i32 loc(#loc46)
+    %r0_numel_1 = arith.constant 128 : i32 loc(#loc47)
+    %xoffset = tt.get_program_id x : i32 loc(#loc48)
+    %xoffset_2 = arith.constant 64 : i32 loc(#loc49)
+    %xoffset_3 = arith.constant 64 : i32 loc(#loc49)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc49)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc50)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc51)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc52)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc52)
+    %xmask = arith.constant true loc(#loc53)
+    %xmask_8 = arith.constant dense<true> : tensor<64x8xi1> loc(#loc53)
+    %r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc54)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32> -> tensor<1x8xi32> loc(#loc55)
+    %x0 = arith.constant 32 : i32 loc(#loc56)
+    %x0_10 = arith.constant 32 : i32 loc(#loc56)
+    %x0_11 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc56)
+    %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc56)
+    %x1 = arith.constant 32 : i32 loc(#loc57)
+    %x1_13 = arith.constant 32 : i32 loc(#loc57)
+    %x1_14 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc57)
+    %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc57)
+    %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc58)
+    %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc58)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc14)
+    %c8_i32 = arith.constant 8 : i32 loc(#loc14)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14)
+    %2 = arith.bitcast %c8_i32 : i32 to i32 loc(#loc14)
+    %3 = ub.poison : i32 loc(#loc14)
+    %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_19 = %_tmp4_16) -> (tensor<64x8xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x8xi32> loc(#loc60)
+      %r0_index_20 = arith.addi %r0_index, %r0_base_9 : tensor<1x8xi32> loc(#loc60)
+      %r0_mask = arith.constant dense<128> : tensor<1x8xi32> loc(#loc61)
+      %r0_mask_21 = arith.cmpi slt, %r0_index_20, %r0_mask : tensor<1x8xi32> loc(#loc61)
+      %tmp0 = arith.constant 128 : i32 loc(#loc62)
+      %tmp0_22 = arith.constant 128 : i32 loc(#loc62)
+      %tmp0_23 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc62)
+      %tmp0_24 = arith.muli %tmp0_23, %x0_12 : tensor<64x1xi32> loc(#loc62)
+      %tmp0_25 = tt.broadcast %r0_index_20 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc63)
+      %tmp0_26 = tt.broadcast %tmp0_24 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc63)
+      %tmp0_27 = arith.addi %tmp0_25, %tmp0_26 : tensor<64x8xi32> loc(#loc63)
+      %tmp0_28 = arith.constant 12288 : i32 loc(#loc64)
+      %tmp0_29 = arith.constant 12288 : i32 loc(#loc64)
+      %tmp0_30 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc64)
+      %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<64x1xi32> loc(#loc64)
+      %tmp0_32 = tt.broadcast %tmp0_31 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc65)
+      %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<64x8xi32> loc(#loc65)
+      %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc66)
+      %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc66)
+      %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc67)
+      %tmp0_37 = tt.broadcast %r0_mask_21 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc67)
+      %tmp0_38 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc67)
+      %tmp0_39 = arith.truncf %tmp0_38 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc67)
+      %tmp0_40 = tt.load %tmp0_35, %tmp0_37, %tmp0_39 evictionPolicy = evict_first : tensor<64x8x!tt.ptr<bf16>> loc(#loc67)
+      %tmp0_41 = arith.extf %tmp0_40 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc68)
+      %tmp2 = arith.mulf %tmp0_41, %tmp0_41 : tensor<64x8xf32> loc(#loc69)
+      %tmp5 = arith.addf %_tmp4_19, %tmp2 : tensor<64x8xf32> loc(#loc70)
+      %_tmp4_42 = tt.broadcast %r0_mask_21 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc71)
+      %_tmp4_43 = arith.select %_tmp4_42, %tmp5, %_tmp4_19 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc71)
+      scf.yield %_tmp4_43 : tensor<64x8xf32> loc(#loc27)
+    } loc(#loc59)
+    %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc72)
+    %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc73)
+    %4 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc30)
+    %5 = tt.addptr %4, %xindex_7 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc30)
+    tt.store %5, %tmp4_18 : tensor<64x1x!tt.ptr<f32>> loc(#loc31)
+    tt.return loc(#loc32)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.sum__fp32S64_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x8xf32> loc("input"(#loc33))) -> tensor<64xf32> attributes {noinline = false} {
+    %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc34)
+      tt.reduce.return %2 : f32 loc(#loc34)
+    }) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc34)
+    tt.return %0 : tensor<64xf32> loc(#loc36)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<64xf32> loc(#loc37)
+    tt.return %1 : tensor<64xf32> loc(#loc37)
+  } loc(#loc33)
+  tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc38)), %b: f32 loc("b"(#loc38))) -> f32 attributes {noinline = false} {
+    %0 = arith.addf %a, %b : f32 loc(#loc39)
+    tt.return %0 : f32 loc(#loc40)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : f32 loc(#loc41)
+    tt.return %1 : f32 loc(#loc41)
+  } loc(#loc38)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":25:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":30:43)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:8)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11)
+#loc37 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4)
+#loc39 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc40 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11)
+#loc41 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4)
+#loc46 = loc("xnumel"(#loc1))
+#loc47 = loc("r0_numel"(#loc2))
+#loc48 = loc("xoffset"(#loc3))
+#loc49 = loc("xoffset"(#loc4))
+#loc50 = loc("xindex"(#loc5))
+#loc51 = loc("xindex"(#loc6))
+#loc52 = loc("xindex"(#loc7))
+#loc53 = loc("xmask"(#loc8))
+#loc54 = loc("r0_base"(#loc9))
+#loc55 = loc("r0_base"(#loc10))
+#loc56 = loc("x0"(#loc11))
+#loc57 = loc("x1"(#loc12))
+#loc58 = loc("_tmp4"(#loc13))
+#loc59 = loc("_tmp4"(#loc14))
+#loc60 = loc("r0_index"(#loc15))
+#loc61 = loc("r0_mask"(#loc16))
+#loc62 = loc("tmp0"(#loc17))
+#loc63 = loc("tmp0"(#loc18))
+#loc64 = loc("tmp0"(#loc19))
+#loc65 = loc("tmp0"(#loc20))
+#loc66 = loc("tmp0"(#loc21))
+#loc67 = loc("tmp0"(#loc22))
+#loc68 = loc("tmp0"(#loc23))
+#loc69 = loc("tmp2"(#loc24))
+#loc70 = loc("tmp5"(#loc25))
+#loc71 = loc("_tmp4"(#loc26))
+#loc72 = loc("tmp4"(#loc28))
+#loc73 = loc("tmp4"(#loc29))
diff --git a/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.ttgir b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..d4a9e6bec47fe30fb029ebd51cb004904dd8e618
--- /dev/null
+++ b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.ttgir
@@ -0,0 +1,121 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0)
+#loc1 = loc(unknown)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25)
+#loc30 = loc("in_ptr0"(#loc))
+#loc31 = loc("out_ptr0"(#loc))
+#loc32 = loc("xnumel"(#loc))
+#loc33 = loc("r0_numel"(#loc))
+#loc54 = loc("tmp4"(#loc24))
+#loc57 = loc(callsite(#loc1 at #loc54))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<128> : tensor<1x8xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<12288> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked> loc(#loc1)
+    %c8_i32 = arith.constant 8 : i32 loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc34)
+    %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc35)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc36)
+    %xindex_6 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc36)
+    %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc36)
+    %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc36)
+    %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked> loc(#loc37)
+    %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc37)
+    %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<64x1xi32, #blocked> loc(#loc37)
+    %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<64x1xi32, #blocked1> loc(#loc37)
+    %r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc38)
+    %r0_base_13 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x8xi32, #blocked> loc(#loc38)
+    %x0 = arith.remsi %xindex_11, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc39)
+    %x1 = arith.divsi %xindex_11, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc40)
+    %tmp0 = arith.muli %x0, %cst_0 : tensor<64x1xi32, #blocked> loc(#loc41)
+    %tmp0_14 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked> -> tensor<64x8xi32, #blocked> loc(#loc42)
+    %tmp0_15 = arith.muli %x1, %cst_1 : tensor<64x1xi32, #blocked> loc(#loc43)
+    %tmp0_16 = tt.broadcast %tmp0_15 : tensor<64x1xi32, #blocked> -> tensor<64x8xi32, #blocked> loc(#loc44)
+    %tmp0_17 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>, #blocked> loc(#loc45)
+    %_tmp4 = scf.for %_tmp4_20 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg5 = %cst_4) -> (tensor<64x8xf32, #blocked>)  : i32 {
+      %r0_index = tt.splat %_tmp4_20 : i32 -> tensor<1x8xi32, #blocked> loc(#loc47)
+      %r0_index_21 = arith.addi %r0_index, %r0_base_13 : tensor<1x8xi32, #blocked> loc(#loc47)
+      %r0_mask = arith.cmpi slt, %r0_index_21, %cst : tensor<1x8xi32, #blocked> loc(#loc48)
+      %tmp0_22 = tt.broadcast %r0_index_21 : tensor<1x8xi32, #blocked> -> tensor<64x8xi32, #blocked> loc(#loc42)
+      %tmp0_23 = arith.addi %tmp0_22, %tmp0_14 : tensor<64x8xi32, #blocked> loc(#loc42)
+      %tmp0_24 = arith.addi %tmp0_23, %tmp0_16 : tensor<64x8xi32, #blocked> loc(#loc44)
+      %tmp0_25 = tt.addptr %tmp0_17, %tmp0_24 : tensor<64x8x!tt.ptr<bf16>, #blocked>, tensor<64x8xi32, #blocked> loc(#loc45)
+      %tmp0_26 = tt.broadcast %r0_mask : tensor<1x8xi1, #blocked> -> tensor<64x8xi1, #blocked> loc(#loc49)
+      %tmp0_27 = tt.load %tmp0_25, %tmp0_26, %cst_3 evictionPolicy = evict_first : tensor<64x8x!tt.ptr<bf16>, #blocked> loc(#loc49)
+      %tmp0_28 = arith.extf %tmp0_27 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> loc(#loc50)
+      %tmp2 = arith.mulf %tmp0_28, %tmp0_28 : tensor<64x8xf32, #blocked> loc(#loc51)
+      %tmp5 = arith.addf %arg5, %tmp2 : tensor<64x8xf32, #blocked> loc(#loc52)
+      %_tmp4_29 = arith.select %tmp0_26, %tmp5, %arg5 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked> loc(#loc53)
+      scf.yield %_tmp4_29 : tensor<64x8xf32, #blocked> loc(#loc22)
+    } loc(#loc46)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_20: f32 loc(callsite(#loc1 at #loc54)), %tmp4_21: f32 loc(callsite(#loc1 at #loc54))):
+      %tmp4_22 = arith.addf %tmp4_20, %tmp4_21 : f32 loc(#loc58)
+      tt.reduce.return %tmp4_22 : f32 loc(#loc56)
+    }) : (tensor<64x8xf32, #blocked>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc56)
+    %tmp4_18 = ttg.convert_layout %tmp4 : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc55)
+    %tmp4_19 = tt.expand_dims %tmp4_18 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc55)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>, #blocked1> loc(#loc27)
+    %1 = tt.addptr %0, %xindex_12 : tensor<64x1x!tt.ptr<f32>, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc27)
+    tt.store %1, %tmp4_19 : tensor<64x1x!tt.ptr<f32>, #blocked1> loc(#loc28)
+    tt.return loc(#loc29)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:8)
+#loc23 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4)
+#loc34 = loc("xoffset"(#loc2))
+#loc35 = loc("xoffset"(#loc3))
+#loc36 = loc("xindex"(#loc4))
+#loc37 = loc("xindex"(#loc5))
+#loc38 = loc("r0_base"(#loc6))
+#loc39 = loc("x0"(#loc7))
+#loc40 = loc("x1"(#loc8))
+#loc41 = loc("tmp0"(#loc9))
+#loc42 = loc("tmp0"(#loc10))
+#loc43 = loc("tmp0"(#loc11))
+#loc44 = loc("tmp0"(#loc12))
+#loc45 = loc("tmp0"(#loc13))
+#loc46 = loc("_tmp4"(#loc14))
+#loc47 = loc("r0_index"(#loc15))
+#loc48 = loc("r0_mask"(#loc16))
+#loc49 = loc("tmp0"(#loc17))
+#loc50 = loc("tmp0"(#loc18))
+#loc51 = loc("tmp2"(#loc19))
+#loc52 = loc("tmp5"(#loc20))
+#loc53 = loc("_tmp4"(#loc21))
+#loc55 = loc("tmp4"(#loc26))
+#loc56 = loc(callsite(#loc23 at #loc54))
+#loc58 = loc(callsite(#loc25 at #loc56))
diff --git a/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.ttir b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..407429e35d55d7116140391b571e3ab6691d167b
--- /dev/null
+++ b/triton/3DQMQTGHGYIP7XSP4YFFOCWDCXOM4VUO2FQM6Z26AAUWNSRLGREA/triton_red_fused__fused_rms_norm_view_1.ttir
@@ -0,0 +1,118 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0)
+#loc1 = loc(unknown)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25)
+#loc32 = loc("in_ptr0"(#loc))
+#loc33 = loc("out_ptr0"(#loc))
+#loc34 = loc("xnumel"(#loc))
+#loc35 = loc("r0_numel"(#loc))
+#loc58 = loc("tmp4"(#loc26))
+#loc61 = loc(callsite(#loc1 at #loc58))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<64x8xbf16> loc(#loc1)
+    %c8_i32 = arith.constant 8 : i32 loc(#loc2)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc2)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc2)
+    %cst_0 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc1)
+    %cst_1 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1)
+    %cst_2 = arith.constant dense<128> : tensor<1x8xi32> loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc36)
+    %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc37)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc38)
+    %xindex_6 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc39)
+    %xindex_7 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32> loc(#loc40)
+    %xindex_8 = arith.addi %xindex_7, %xindex_6 : tensor<64x1xi32> loc(#loc40)
+    %r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc41)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32> -> tensor<1x8xi32> loc(#loc42)
+    %x0 = arith.remsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc43)
+    %x1 = arith.divsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc44)
+    %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%_tmp4_11 = %cst_3) -> (tensor<64x8xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x8xi32> loc(#loc46)
+      %r0_index_12 = arith.addi %r0_index, %r0_base_9 : tensor<1x8xi32> loc(#loc46)
+      %r0_mask = arith.cmpi slt, %r0_index_12, %cst_2 : tensor<1x8xi32> loc(#loc47)
+      %tmp0 = arith.muli %x0, %cst_1 : tensor<64x1xi32> loc(#loc48)
+      %tmp0_13 = tt.broadcast %r0_index_12 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc49)
+      %tmp0_14 = tt.broadcast %tmp0 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc49)
+      %tmp0_15 = arith.addi %tmp0_13, %tmp0_14 : tensor<64x8xi32> loc(#loc49)
+      %tmp0_16 = arith.muli %x1, %cst_0 : tensor<64x1xi32> loc(#loc50)
+      %tmp0_17 = tt.broadcast %tmp0_16 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc51)
+      %tmp0_18 = arith.addi %tmp0_15, %tmp0_17 : tensor<64x8xi32> loc(#loc51)
+      %tmp0_19 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc52)
+      %tmp0_20 = tt.addptr %tmp0_19, %tmp0_18 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc52)
+      %tmp0_21 = tt.broadcast %r0_mask : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc53)
+      %tmp0_22 = tt.load %tmp0_20, %tmp0_21, %cst evictionPolicy = evict_first : tensor<64x8x!tt.ptr<bf16>> loc(#loc53)
+      %tmp0_23 = arith.extf %tmp0_22 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc54)
+      %tmp2 = arith.mulf %tmp0_23, %tmp0_23 : tensor<64x8xf32> loc(#loc55)
+      %tmp5 = arith.addf %_tmp4_11, %tmp2 : tensor<64x8xf32> loc(#loc56)
+      %_tmp4_24 = arith.select %tmp0_21, %tmp5, %_tmp4_11 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc57)
+      scf.yield %_tmp4_24 : tensor<64x8xf32> loc(#loc24)
+    } loc(#loc45)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_11: f32 loc(callsite(#loc1 at #loc58)), %tmp4_12: f32 loc(callsite(#loc1 at #loc58))):
+      %tmp4_13 = arith.addf %tmp4_11, %tmp4_12 : f32 loc(#loc62)
+      tt.reduce.return %tmp4_13 : f32 loc(#loc60)
+    }) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc60)
+    %tmp4_10 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc59)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc29)
+    %1 = tt.addptr %0, %xindex_8 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc29)
+    tt.store %1, %tmp4_10 : tensor<64x1x!tt.ptr<f32>> loc(#loc30)
+    tt.return loc(#loc31)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":32:43)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:27)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":33:31)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:8)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc27 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4)
+#loc36 = loc("xoffset"(#loc3))
+#loc37 = loc("xoffset"(#loc4))
+#loc38 = loc("xindex"(#loc5))
+#loc39 = loc("xindex"(#loc6))
+#loc40 = loc("xindex"(#loc7))
+#loc41 = loc("r0_base"(#loc8))
+#loc42 = loc("r0_base"(#loc9))
+#loc43 = loc("x0"(#loc10))
+#loc44 = loc("x1"(#loc11))
+#loc45 = loc("_tmp4"(#loc2))
+#loc46 = loc("r0_index"(#loc12))
+#loc47 = loc("r0_mask"(#loc13))
+#loc48 = loc("tmp0"(#loc14))
+#loc49 = loc("tmp0"(#loc15))
+#loc50 = loc("tmp0"(#loc16))
+#loc51 = loc("tmp0"(#loc17))
+#loc52 = loc("tmp0"(#loc18))
+#loc53 = loc("tmp0"(#loc19))
+#loc54 = loc("tmp0"(#loc20))
+#loc55 = loc("tmp2"(#loc21))
+#loc56 = loc("tmp5"(#loc22))
+#loc57 = loc("_tmp4"(#loc23))
+#loc59 = loc("tmp4"(#loc28))
+#loc60 = loc(callsite(#loc25 at #loc58))
+#loc62 = loc(callsite(#loc27 at #loc60))
diff --git a/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/__grp__triton_red_fused__fused_rms_norm_view_0.json b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/__grp__triton_red_fused__fused_rms_norm_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..aa72ff5194566353c399d6f451f35fd9ce02e11c
--- /dev/null
+++ b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/__grp__triton_red_fused__fused_rms_norm_view_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused__fused_rms_norm_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.source", "triton_red_fused__fused_rms_norm_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.ttir", "triton_red_fused__fused_rms_norm_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.ttgir", "triton_red_fused__fused_rms_norm_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.llir", "triton_red_fused__fused_rms_norm_view_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.ptx", "triton_red_fused__fused_rms_norm_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.cubin", "triton_red_fused__fused_rms_norm_view_0.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.json"}}
\ No newline at end of file
diff --git a/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.cubin b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..221ae986331cf4f8808c1112afb0012189227783
Binary files /dev/null and b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.cubin differ
diff --git a/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.json b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..142d604f0dcd72975c1451ac2f22c38bfb06fe36
--- /dev/null
+++ b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.json
@@ -0,0 +1 @@
+{"hash": "dc06617612a5a0f6f63752ef76fdf496216dbb553b4f3e1b52ab156f05ed066b", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 16, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm_view_0"}
\ No newline at end of file
diff --git a/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.llir b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..f7f4c1002e9e214125923b9c68c6cdbaaebe6748
--- /dev/null
+++ b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.llir
@@ -0,0 +1,136 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused__fused_rms_norm_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 {
+  %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %8 = shl i32 %7, 2, !dbg !8
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %10 = and i32 %9, 96, !dbg !9
+  %11 = lshr exact i32 %10, 5, !dbg !9
+  %12 = and i32 %9, 3, !dbg !9
+  %13 = or disjoint i32 %11, %8, !dbg !10
+  %14 = or disjoint i32 %8, %12, !dbg !10
+  %15 = shl nuw nsw i32 %9, 2, !dbg !11
+  %16 = and i32 %15, 124, !dbg !11
+  %17 = sdiv i32 %13, 32, !dbg !12
+  %18 = mul i32 %17, 32, !dbg !13
+  %.decomposed = sub i32 %13, %18, !dbg !13
+  %19 = shl nsw i32 %.decomposed, 7, !dbg !14
+  %20 = or disjoint i32 %19, %16, !dbg !15
+  %21 = mul i32 %17, 12288, !dbg !16
+  %22 = add i32 %20, %21, !dbg !17
+  %23 = sext i32 %22 to i64, !dbg !18
+  %24 = getelementptr bfloat, ptr addrspace(1) %0, i64 %23, !dbg !18
+  %25 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !19
+  %26 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %24, i64 %25, i1 true) #4, !dbg !19
+  %27 = extractvalue { i32, i32 } %26, 0, !dbg !19
+  %28 = bitcast i32 %27 to <2 x bfloat>, !dbg !19
+  %29 = extractvalue { i32, i32 } %26, 1, !dbg !19
+  %30 = bitcast i32 %29 to <2 x bfloat>, !dbg !19
+  %31 = extractelement <2 x bfloat> %28, i64 0, !dbg !19
+  %32 = extractelement <2 x bfloat> %28, i64 1, !dbg !19
+  %33 = extractelement <2 x bfloat> %30, i64 0, !dbg !19
+  %34 = extractelement <2 x bfloat> %30, i64 1, !dbg !19
+  %35 = fpext bfloat %31 to float, !dbg !20
+  %36 = fpext bfloat %32 to float, !dbg !20
+  %37 = fpext bfloat %33 to float, !dbg !20
+  %38 = fpext bfloat %34 to float, !dbg !20
+  %39 = fmul float %35, %35, !dbg !21
+  %40 = fmul float %36, %36, !dbg !21
+  %41 = fmul float %37, %37, !dbg !21
+  %42 = fmul float %38, %38, !dbg !21
+  %43 = fadd float %39, %40, !dbg !22
+  %44 = fadd float %41, %43, !dbg !22
+  %45 = fadd float %42, %44, !dbg !22
+  %46 = bitcast float %45 to i32, !dbg !25
+  %47 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %46, i32 16, i32 31), !dbg !25
+  %48 = bitcast i32 %47 to float, !dbg !25
+  %49 = fadd float %45, %48, !dbg !22
+  %50 = bitcast float %49 to i32, !dbg !25
+  %51 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %50, i32 8, i32 31), !dbg !25
+  %52 = bitcast i32 %51 to float, !dbg !25
+  %53 = fadd float %49, %52, !dbg !22
+  %54 = bitcast float %53 to i32, !dbg !25
+  %55 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %54, i32 4, i32 31), !dbg !25
+  %56 = bitcast i32 %55 to float, !dbg !25
+  %57 = fadd float %53, %56, !dbg !22
+  %58 = bitcast float %57 to i32, !dbg !25
+  %59 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %58, i32 2, i32 31), !dbg !25
+  %60 = bitcast i32 %59 to float, !dbg !25
+  %61 = fadd float %57, %60, !dbg !22
+  %62 = bitcast float %61 to i32, !dbg !25
+  %63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %62, i32 1, i32 31), !dbg !25
+  %64 = bitcast i32 %63 to float, !dbg !25
+  %65 = fadd float %61, %64, !dbg !22
+  %66 = lshr exact i32 %10, 3, !dbg !28
+  %67 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %66, !dbg !28
+  store float %65, ptr addrspace(3) %67, align 4, !dbg !28
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28
+  %68 = shl nuw nsw i32 %12, 2, !dbg !28
+  %69 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %68, !dbg !28
+  %70 = load i32, ptr addrspace(3) %69, align 4, !dbg !28
+  %71 = sext i32 %14 to i64, !dbg !29
+  %72 = getelementptr float, ptr addrspace(1) %1, i64 %71, !dbg !29
+  %73 = and i32 %9, 124, !dbg !30
+  %74 = icmp eq i32 %73, 0, !dbg !30
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %70, ptr addrspace(1) %72, i1 %74) #4, !dbg !30
+  ret void, !dbg !31
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
+
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { convergent nocallback nounwind }
+attributes #4 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm_view_0", linkageName: "triton_red_fused__fused_rms_norm_view_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 23, column: 28, scope: !4)
+!8 = !DILocation(line: 23, column: 33, scope: !4)
+!9 = !DILocation(line: 24, column: 44, scope: !4)
+!10 = !DILocation(line: 24, column: 23, scope: !4)
+!11 = !DILocation(line: 26, column: 37, scope: !4)
+!12 = !DILocation(line: 29, column: 19, scope: !4)
+!13 = !DILocation(line: 28, column: 19, scope: !4)
+!14 = !DILocation(line: 38, column: 45, scope: !4)
+!15 = !DILocation(line: 38, column: 41, scope: !4)
+!16 = !DILocation(line: 38, column: 56, scope: !4)
+!17 = !DILocation(line: 38, column: 50, scope: !4)
+!18 = !DILocation(line: 38, column: 34, scope: !4)
+!19 = !DILocation(line: 38, column: 61, scope: !4)
+!20 = !DILocation(line: 38, column: 115, scope: !4)
+!21 = !DILocation(line: 40, column: 22, scope: !4)
+!22 = !DILocation(line: 263, column: 15, scope: !23, inlinedAt: !25)
+!23 = distinct !DILexicalBlockFile(scope: !4, file: !24, discriminator: 0)
+!24 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!25 = !DILocation(line: 293, column: 36, scope: !23, inlinedAt: !26)
+!26 = !DILocation(line: 44, column: 25, scope: !27)
+!27 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0)
+!28 = !DILocation(line: 44, column: 28, scope: !4)
+!29 = !DILocation(line: 45, column: 25, scope: !4)
+!30 = !DILocation(line: 45, column: 36, scope: !4)
+!31 = !DILocation(line: 45, column: 4, scope: !4)
diff --git a/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.ptx b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..0433910d2804f7bb4ec0f8846c28cdd91cffe67a
--- /dev/null
+++ b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.ptx
@@ -0,0 +1,506 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused__fused_rms_norm_view_0 // -- Begin function triton_red_fused__fused_rms_norm_view_0
+.extern .shared .align 16 .b8 global_smem[];
+                                        // @triton_red_fused__fused_rms_norm_view_0
+.visible .entry triton_red_fused__fused_rms_norm_view_0(
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_1,
+	.param .u32 triton_red_fused__fused_rms_norm_view_0_param_2,
+	.param .u32 triton_red_fused__fused_rms_norm_view_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_4,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_5
+)
+.reqntid 128
+{
+	.reg .pred 	%p<3>;
+	.reg .b16 	%rs<5>;
+	.reg .b32 	%r<48>;
+	.reg .b64 	%rd<6>;
+	.loc	1 18 0                          // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd4, [triton_red_fused__fused_rms_norm_view_0_param_0];
+	ld.param.b64 	%rd5, [triton_red_fused__fused_rms_norm_view_0_param_1];
+$L__tmp0:
+	.loc	1 23 28                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:23:28
+	mov.u32 	%r5, %ctaid.x;
+	.loc	1 23 33                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:23:33
+	shl.b32 	%r6, %r5, 2;
+	.loc	1 24 44                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:44
+	mov.u32 	%r7, %tid.x;
+	and.b32 	%r8, %r7, 96;
+	bfe.u32 	%r9, %r7, 5, 2;
+	and.b32 	%r10, %r7, 3;
+	.loc	1 24 23                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:23
+	or.b32 	%r11, %r9, %r6;
+	or.b32 	%r12, %r6, %r10;
+	.loc	1 26 37                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:26:37
+	shl.b32 	%r13, %r7, 2;
+	and.b32 	%r14, %r13, 124;
+	.loc	1 29 19                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:29:19
+	bfe.s32 	%r15, %r5, 29, 1;
+	shr.u32 	%r16, %r15, 27;
+	add.s32 	%r17, %r11, %r16;
+	shr.u32 	%r18, %r17, 5;
+	.loc	1 28 19                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:28:19
+	and.b32 	%r19, %r17, 33554400;
+	sub.s32 	%r20, %r11, %r19;
+	.loc	1 38 45                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:45
+	shl.b32 	%r21, %r20, 7;
+	.loc	1 38 41                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:41
+	or.b32 	%r22, %r21, %r14;
+	.loc	1 38 50                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:50
+	mad.lo.s32 	%r23, %r18, 12288, %r22;
+	.loc	1 38 34                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:34
+	mad.wide.s32 	%rd1, %r23, 2, %rd4;
+	.loc	1 38 61                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:61
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r3, 0;
+	mov.pred 	%p1, -1;
+	// begin inline asm
+	mov.u32 %r1, %r3;
+	mov.u32 %r2, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	mov.b32 	{%rs1, %rs2}, %r1;
+	mov.b32 	{%rs3, %rs4}, %r2;
+	.loc	1 38 115                        // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115
+	cvt.f32.bf16 	%r24, %rs1;
+	cvt.f32.bf16 	%r25, %rs2;
+	cvt.f32.bf16 	%r26, %rs3;
+	cvt.f32.bf16 	%r27, %rs4;
+	.loc	1 40 22                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:40:22
+	mul.f32 	%r28, %r25, %r25;
+$L__tmp1:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	fma.rn.f32 	%r29, %r24, %r24, %r28;
+	fma.rn.f32 	%r30, %r26, %r26, %r29;
+	fma.rn.f32 	%r31, %r27, %r27, %r30;
+$L__tmp2:
+	.loc	2 293 36                        // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ]
+	shfl.sync.bfly.b32 	%r32, %r31, 16, 31, -1;
+$L__tmp3:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	add.f32 	%r33, %r31, %r32;
+$L__tmp4:
+	.loc	2 293 36                        // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ]
+	shfl.sync.bfly.b32 	%r34, %r33, 8, 31, -1;
+$L__tmp5:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	add.f32 	%r35, %r33, %r34;
+$L__tmp6:
+	.loc	2 293 36                        // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ]
+	shfl.sync.bfly.b32 	%r36, %r35, 4, 31, -1;
+$L__tmp7:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	add.f32 	%r37, %r35, %r36;
+$L__tmp8:
+	.loc	2 293 36                        // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ]
+	shfl.sync.bfly.b32 	%r38, %r37, 2, 31, -1;
+$L__tmp9:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	add.f32 	%r39, %r37, %r38;
+$L__tmp10:
+	.loc	2 293 36                        // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ]
+	shfl.sync.bfly.b32 	%r40, %r39, 1, 31, -1;
+$L__tmp11:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	add.f32 	%r41, %r39, %r40;
+$L__tmp12:
+	.loc	1 44 28                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:28
+	shr.u32 	%r42, %r8, 3;
+	mov.b32 	%r43, global_smem;
+	add.s32 	%r44, %r43, %r42;
+	st.shared.b32 	[%r44], %r41;
+	bar.sync 	0;
+	shl.b32 	%r45, %r10, 2;
+	add.s32 	%r46, %r43, %r45;
+	ld.shared.b32 	%r4, [%r46];
+	.loc	1 45 25                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:25
+	mad.wide.s32 	%rd3, %r12, 4, %rd5;
+	.loc	1 45 36                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:36
+	and.b32 	%r47, %r7, 124;
+	setp.eq.b32 	%p2, %r47, 0;
+	// begin inline asm
+	@%p2 st.global.b32 [ %rd3 + 0 ], { %r4 };
+	// end inline asm
+	.loc	1 45 4                          // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:4
+	ret;
+$L__tmp13:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 5                                   // DW_FORM_data2
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 339                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x14c DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 119
+.b8 118
+.b8 121
+.b8 116
+.b8 52
+.b8 50
+.b8 55
+.b8 51
+.b8 105
+.b8 117
+.b8 51
+.b8 51
+.b8 109
+.b8 112
+.b8 101
+.b8 101
+.b8 55
+.b8 104
+.b8 98
+.b8 101
+.b8 116
+.b8 53
+.b8 106
+.b8 53
+.b8 101
+.b8 113
+.b8 52
+.b8 52
+.b8 100
+.b8 54
+.b8 102
+.b8 115
+.b8 104
+.b8 103
+.b8 119
+.b8 107
+.b8 121
+.b8 120
+.b8 107
+.b8 110
+.b8 53
+.b8 50
+.b8 103
+.b8 103
+.b8 103
+.b8 107
+.b8 105
+.b8 113
+.b8 104
+.b8 106
+.b8 53
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 119
+.b8 118
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x2a DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 114
+.b8 109
+.b8 115
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 118
+.b8 105
+.b8 101
+.b8 119
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x10e:0x48 DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x123:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp12                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 44                                  // DW_AT_call_line
+.b8 25                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x13b:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp12                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.source b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..11bcf95e8dcbe462f3479afecf407c7871e0022c
--- /dev/null
+++ b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.source
@@ -0,0 +1,167 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0)
+#loc35 = loc(unknown)
+#loc38 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0)
+#loc42 = loc("in_ptr0"(#loc))
+#loc43 = loc("out_ptr0"(#loc))
+#loc44 = loc("xnumel"(#loc))
+#loc45 = loc("r0_numel"(#loc))
+#loc74 = loc("input"(#loc33))
+#loc75 = loc("a"(#loc38))
+#loc76 = loc("b"(#loc38))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 8192 : i32 loc(#loc46)
+    %r0_numel_1 = arith.constant 128 : i32 loc(#loc47)
+    %xoffset = tt.get_program_id x : i32 loc(#loc48)
+    %xoffset_2 = arith.constant 4 : i32 loc(#loc49)
+    %xoffset_3 = arith.constant 4 : i32 loc(#loc49)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc49)
+    %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc50)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc51)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<4x1xi32> loc(#loc52)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<4x1xi32> loc(#loc52)
+    %xmask = arith.constant true loc(#loc53)
+    %xmask_8 = arith.constant dense<true> : tensor<4x128xi1> loc(#loc53)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc54)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc55)
+    %x0 = arith.constant 32 : i32 loc(#loc56)
+    %x0_10 = arith.constant 32 : i32 loc(#loc56)
+    %x0_11 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc56)
+    %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<4x1xi32> loc(#loc56)
+    %x1 = arith.constant 32 : i32 loc(#loc57)
+    %x1_13 = arith.constant 32 : i32 loc(#loc57)
+    %x1_14 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc57)
+    %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<4x1xi32> loc(#loc57)
+    %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc58)
+    %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc58)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc14)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc14)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14)
+    %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc14)
+    %3 = ub.poison : i32 loc(#loc14)
+    %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_19 = %_tmp4_16) -> (tensor<4x128xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc60)
+      %r0_index_20 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc60)
+      %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc61)
+      %r0_mask_21 = arith.cmpi slt, %r0_index_20, %r0_mask : tensor<1x128xi32> loc(#loc61)
+      %tmp0 = arith.constant 128 : i32 loc(#loc62)
+      %tmp0_22 = arith.constant 128 : i32 loc(#loc62)
+      %tmp0_23 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc62)
+      %tmp0_24 = arith.muli %tmp0_23, %x0_12 : tensor<4x1xi32> loc(#loc62)
+      %tmp0_25 = tt.broadcast %r0_index_20 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc63)
+      %tmp0_26 = tt.broadcast %tmp0_24 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc63)
+      %tmp0_27 = arith.addi %tmp0_25, %tmp0_26 : tensor<4x128xi32> loc(#loc63)
+      %tmp0_28 = arith.constant 12288 : i32 loc(#loc64)
+      %tmp0_29 = arith.constant 12288 : i32 loc(#loc64)
+      %tmp0_30 = arith.constant dense<12288> : tensor<4x1xi32> loc(#loc64)
+      %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<4x1xi32> loc(#loc64)
+      %tmp0_32 = tt.broadcast %tmp0_31 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc65)
+      %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<4x128xi32> loc(#loc65)
+      %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc66)
+      %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc66)
+      %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc67)
+      %tmp0_37 = tt.broadcast %r0_mask_21 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc67)
+      %tmp0_38 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc67)
+      %tmp0_39 = arith.truncf %tmp0_38 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc67)
+      %tmp0_40 = tt.load %tmp0_35, %tmp0_37, %tmp0_39 evictionPolicy = evict_first : tensor<4x128x!tt.ptr<bf16>> loc(#loc67)
+      %tmp0_41 = arith.extf %tmp0_40 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc68)
+      %tmp2 = arith.mulf %tmp0_41, %tmp0_41 : tensor<4x128xf32> loc(#loc69)
+      %tmp5 = arith.addf %_tmp4_19, %tmp2 : tensor<4x128xf32> loc(#loc70)
+      %_tmp4_42 = tt.broadcast %r0_mask_21 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc71)
+      %_tmp4_43 = arith.select %_tmp4_42, %tmp5, %_tmp4_19 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc71)
+      scf.yield %_tmp4_43 : tensor<4x128xf32> loc(#loc27)
+    } loc(#loc59)
+    %tmp4 = tt.call @"triton.language.standard.sum__fp32S4_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc72)
+    %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc73)
+    %4 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<4x1x!tt.ptr<f32>> loc(#loc30)
+    %5 = tt.addptr %4, %xindex_7 : tensor<4x1x!tt.ptr<f32>>, tensor<4x1xi32> loc(#loc30)
+    tt.store %5, %tmp4_18 : tensor<4x1x!tt.ptr<f32>> loc(#loc31)
+    tt.return loc(#loc32)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.sum__fp32S4_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<4x128xf32> loc("input"(#loc33))) -> tensor<4xf32> attributes {noinline = false} {
+    %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc34)
+      tt.reduce.return %2 : f32 loc(#loc34)
+    }) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc34)
+    tt.return %0 : tensor<4xf32> loc(#loc36)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<4xf32> loc(#loc37)
+    tt.return %1 : tensor<4xf32> loc(#loc37)
+  } loc(#loc33)
+  tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc38)), %b: f32 loc("b"(#loc38))) -> f32 attributes {noinline = false} {
+    %0 = arith.addf %a, %b : f32 loc(#loc39)
+    tt.return %0 : f32 loc(#loc40)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : f32 loc(#loc41)
+    tt.return %1 : f32 loc(#loc41)
+  } loc(#loc38)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":25:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":30:43)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:8)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11)
+#loc37 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4)
+#loc39 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc40 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11)
+#loc41 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4)
+#loc46 = loc("xnumel"(#loc1))
+#loc47 = loc("r0_numel"(#loc2))
+#loc48 = loc("xoffset"(#loc3))
+#loc49 = loc("xoffset"(#loc4))
+#loc50 = loc("xindex"(#loc5))
+#loc51 = loc("xindex"(#loc6))
+#loc52 = loc("xindex"(#loc7))
+#loc53 = loc("xmask"(#loc8))
+#loc54 = loc("r0_base"(#loc9))
+#loc55 = loc("r0_base"(#loc10))
+#loc56 = loc("x0"(#loc11))
+#loc57 = loc("x1"(#loc12))
+#loc58 = loc("_tmp4"(#loc13))
+#loc59 = loc("_tmp4"(#loc14))
+#loc60 = loc("r0_index"(#loc15))
+#loc61 = loc("r0_mask"(#loc16))
+#loc62 = loc("tmp0"(#loc17))
+#loc63 = loc("tmp0"(#loc18))
+#loc64 = loc("tmp0"(#loc19))
+#loc65 = loc("tmp0"(#loc20))
+#loc66 = loc("tmp0"(#loc21))
+#loc67 = loc("tmp0"(#loc22))
+#loc68 = loc("tmp0"(#loc23))
+#loc69 = loc("tmp2"(#loc24))
+#loc70 = loc("tmp5"(#loc25))
+#loc71 = loc("_tmp4"(#loc26))
+#loc72 = loc("tmp4"(#loc28))
+#loc73 = loc("tmp4"(#loc29))
diff --git a/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.ttgir b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..fa653a4402fd3bb1e45f866eca545bb4cba8c417
--- /dev/null
+++ b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.ttgir
@@ -0,0 +1,108 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 4], order = [0, 1]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0)
+#loc1 = loc(unknown)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25)
+#loc27 = loc("in_ptr0"(#loc))
+#loc28 = loc("out_ptr0"(#loc))
+#loc29 = loc("xnumel"(#loc))
+#loc30 = loc("r0_numel"(#loc))
+#loc49 = loc("tmp4"(#loc21))
+#loc52 = loc(callsite(#loc1 at #loc49))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<128> : tensor<4x1xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<12288> : tensor<4x1xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<32> : tensor<4x1xi32, #blocked> loc(#loc1)
+    %c4_i32 = arith.constant 4 : i32 loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<4x128xbf16, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<4x128xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc31)
+    %xoffset_5 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc32)
+    %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc33)
+    %xindex_6 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc33)
+    %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4x1xi32, #blocked> loc(#loc33)
+    %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc33)
+    %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<4x1xi32, #blocked> loc(#loc34)
+    %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<4x1xi32, #blocked1> loc(#loc34)
+    %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<4x1xi32, #blocked> loc(#loc34)
+    %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<4x1xi32, #blocked1> loc(#loc34)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc35)
+    %r0_base_13 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc35)
+    %x0 = arith.remsi %xindex_11, %cst_2 : tensor<4x1xi32, #blocked> loc(#loc36)
+    %x1 = arith.divsi %xindex_11, %cst_2 : tensor<4x1xi32, #blocked> loc(#loc37)
+    %r0_mask = arith.cmpi slt, %r0_base_13, %cst : tensor<1x128xi32, #blocked> loc(#loc38)
+    %tmp0 = arith.muli %x0, %cst_0 : tensor<4x1xi32, #blocked> loc(#loc39)
+    %tmp0_14 = tt.broadcast %r0_base_13 : tensor<1x128xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc40)
+    %tmp0_15 = tt.broadcast %tmp0 : tensor<4x1xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc40)
+    %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<4x128xi32, #blocked> loc(#loc40)
+    %tmp0_17 = arith.muli %x1, %cst_1 : tensor<4x1xi32, #blocked> loc(#loc41)
+    %tmp0_18 = tt.broadcast %tmp0_17 : tensor<4x1xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc42)
+    %tmp0_19 = arith.addi %tmp0_16, %tmp0_18 : tensor<4x128xi32, #blocked> loc(#loc42)
+    %tmp0_20 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>, #blocked> loc(#loc43)
+    %tmp0_21 = tt.addptr %tmp0_20, %tmp0_19 : tensor<4x128x!tt.ptr<bf16>, #blocked>, tensor<4x128xi32, #blocked> loc(#loc43)
+    %tmp0_22 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked> -> tensor<4x128xi1, #blocked> loc(#loc44)
+    %tmp0_23 = tt.load %tmp0_21, %tmp0_22, %cst_3 evictionPolicy = evict_first : tensor<4x128x!tt.ptr<bf16>, #blocked> loc(#loc44)
+    %tmp0_24 = arith.extf %tmp0_23 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc45)
+    %tmp2 = arith.mulf %tmp0_24, %tmp0_24 : tensor<4x128xf32, #blocked> loc(#loc46)
+    %tmp5 = arith.addf %tmp2, %cst_4 : tensor<4x128xf32, #blocked> loc(#loc47)
+    %_tmp4 = arith.select %tmp0_22, %tmp5, %cst_4 : tensor<4x128xi1, #blocked>, tensor<4x128xf32, #blocked> loc(#loc48)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_27: f32 loc(callsite(#loc1 at #loc49)), %tmp4_28: f32 loc(callsite(#loc1 at #loc49))):
+      %tmp4_29 = arith.addf %tmp4_27, %tmp4_28 : f32 loc(#loc53)
+      tt.reduce.return %tmp4_29 : f32 loc(#loc51)
+    }) : (tensor<4x128xf32, #blocked>) -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc51)
+    %tmp4_25 = ttg.convert_layout %tmp4 : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc50)
+    %tmp4_26 = tt.expand_dims %tmp4_25 {axis = 1 : i32} : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xf32, #blocked1> loc(#loc50)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<4x1x!tt.ptr<f32>, #blocked1> loc(#loc24)
+    %1 = tt.addptr %0, %xindex_12 : tensor<4x1x!tt.ptr<f32>, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc24)
+    tt.store %1, %tmp4_26 : tensor<4x1x!tt.ptr<f32>, #blocked1> loc(#loc25)
+    tt.return loc(#loc26)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4)
+#loc31 = loc("xoffset"(#loc2))
+#loc32 = loc("xoffset"(#loc3))
+#loc33 = loc("xindex"(#loc4))
+#loc34 = loc("xindex"(#loc5))
+#loc35 = loc("r0_base"(#loc6))
+#loc36 = loc("x0"(#loc7))
+#loc37 = loc("x1"(#loc8))
+#loc38 = loc("r0_mask"(#loc9))
+#loc39 = loc("tmp0"(#loc10))
+#loc40 = loc("tmp0"(#loc11))
+#loc41 = loc("tmp0"(#loc12))
+#loc42 = loc("tmp0"(#loc13))
+#loc43 = loc("tmp0"(#loc14))
+#loc44 = loc("tmp0"(#loc15))
+#loc45 = loc("tmp0"(#loc16))
+#loc46 = loc("tmp2"(#loc17))
+#loc47 = loc("tmp5"(#loc18))
+#loc48 = loc("_tmp4"(#loc19))
+#loc50 = loc("tmp4"(#loc23))
+#loc51 = loc(callsite(#loc20 at #loc49))
+#loc53 = loc(callsite(#loc22 at #loc51))
diff --git a/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.ttir b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..08d5b5e90a3b6a75505d5fc4ca7ec67469f9f082
--- /dev/null
+++ b/triton/3QDGC5QSUWQPN5RXKLXXN7PUSYQW3O2VHNHT4G2SVMKW6BPNAZVQ/triton_red_fused__fused_rms_norm_view_0.ttir
@@ -0,0 +1,105 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0)
+#loc2 = loc(unknown)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25)
+#loc29 = loc("in_ptr0"(#loc))
+#loc30 = loc("out_ptr0"(#loc))
+#loc31 = loc("xnumel"(#loc))
+#loc32 = loc("r0_numel"(#loc))
+#loc53 = loc("tmp4"(#loc23))
+#loc56 = loc(callsite(#loc2 at #loc53))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %tmp0 = arith.constant dense<0.000000e+00> : tensor<4x128xbf16> loc(#loc33)
+    %cst = arith.constant dense<12288> : tensor<4x1xi32> loc(#loc2)
+    %cst_0 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc2)
+    %cst_1 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc2)
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc2)
+    %cst_3 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc2)
+    %c4_i32 = arith.constant 4 : i32 loc(#loc2)
+    %xoffset = tt.get_program_id x : i32 loc(#loc34)
+    %xoffset_4 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc35)
+    %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc36)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc37)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<4x1xi32> loc(#loc38)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<4x1xi32> loc(#loc38)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc39)
+    %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc40)
+    %x0 = arith.remsi %xindex_7, %cst_3 : tensor<4x1xi32> loc(#loc41)
+    %x1 = arith.divsi %xindex_7, %cst_3 : tensor<4x1xi32> loc(#loc42)
+    %r0_mask = arith.cmpi slt, %r0_base_8, %cst_1 : tensor<1x128xi32> loc(#loc43)
+    %tmp0_9 = arith.muli %x0, %cst_0 : tensor<4x1xi32> loc(#loc44)
+    %tmp0_10 = tt.broadcast %r0_base_8 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc45)
+    %tmp0_11 = tt.broadcast %tmp0_9 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc45)
+    %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<4x128xi32> loc(#loc45)
+    %tmp0_13 = arith.muli %x1, %cst : tensor<4x1xi32> loc(#loc46)
+    %tmp0_14 = tt.broadcast %tmp0_13 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc47)
+    %tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<4x128xi32> loc(#loc47)
+    %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc48)
+    %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc48)
+    %tmp0_18 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc33)
+    %tmp0_19 = tt.load %tmp0_17, %tmp0_18, %tmp0 evictionPolicy = evict_first : tensor<4x128x!tt.ptr<bf16>> loc(#loc33)
+    %tmp0_20 = arith.extf %tmp0_19 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc49)
+    %tmp2 = arith.mulf %tmp0_20, %tmp0_20 : tensor<4x128xf32> loc(#loc50)
+    %tmp5 = arith.addf %tmp2, %cst_2 : tensor<4x128xf32> loc(#loc51)
+    %_tmp4 = arith.select %tmp0_18, %tmp5, %cst_2 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc52)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_22: f32 loc(callsite(#loc2 at #loc53)), %tmp4_23: f32 loc(callsite(#loc2 at #loc53))):
+      %tmp4_24 = arith.addf %tmp4_22, %tmp4_23 : f32 loc(#loc57)
+      tt.reduce.return %tmp4_24 : f32 loc(#loc55)
+    }) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc55)
+    %tmp4_21 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc54)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<4x1x!tt.ptr<f32>> loc(#loc26)
+    %1 = tt.addptr %0, %xindex_7 : tensor<4x1x!tt.ptr<f32>>, tensor<4x1xi32> loc(#loc26)
+    tt.store %1, %tmp4_21 : tensor<4x1x!tt.ptr<f32>> loc(#loc27)
+    tt.return loc(#loc28)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:27)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4)
+#loc33 = loc("tmp0"(#loc1))
+#loc34 = loc("xoffset"(#loc3))
+#loc35 = loc("xoffset"(#loc4))
+#loc36 = loc("xindex"(#loc5))
+#loc37 = loc("xindex"(#loc6))
+#loc38 = loc("xindex"(#loc7))
+#loc39 = loc("r0_base"(#loc8))
+#loc40 = loc("r0_base"(#loc9))
+#loc41 = loc("x0"(#loc10))
+#loc42 = loc("x1"(#loc11))
+#loc43 = loc("r0_mask"(#loc12))
+#loc44 = loc("tmp0"(#loc13))
+#loc45 = loc("tmp0"(#loc14))
+#loc46 = loc("tmp0"(#loc15))
+#loc47 = loc("tmp0"(#loc16))
+#loc48 = loc("tmp0"(#loc17))
+#loc49 = loc("tmp0"(#loc18))
+#loc50 = loc("tmp2"(#loc19))
+#loc51 = loc("tmp5"(#loc20))
+#loc52 = loc("_tmp4"(#loc21))
+#loc54 = loc("tmp4"(#loc25))
+#loc55 = loc(callsite(#loc22 at #loc53))
+#loc57 = loc(callsite(#loc24 at #loc55))
diff --git a/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..25318e6400bd1c4b0e1aaf27589233ff37ad1a92
--- /dev/null
+++ b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json"}}
\ No newline at end of file
diff --git a/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..2b27b3f9e66b184f1be571893b39afec38640c1e
Binary files /dev/null and b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin differ
diff --git a/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..90135d9a1efcdd6bf9b0b02cbdf870abfafe1b07
--- /dev/null
+++ b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
@@ -0,0 +1 @@
+{"hash": "dc21aaa9e2fe6753d9008bbdbf3b92b903834509ab1bb88d548bc60a117f9b01", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 1024, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0"}
\ No newline at end of file
diff --git a/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..16ab18a45fdcb649f4934668d4f5300ccfc3a884
--- /dev/null
+++ b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir
@@ -0,0 +1,464 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 {
+  %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8
+  %13 = shl i32 %12, 6, !dbg !9
+  %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %15 = and i32 %14, 252, !dbg !10
+  %16 = lshr exact i32 %15, 2, !dbg !10
+  %17 = or disjoint i32 %16, %13, !dbg !11
+  %18 = and i32 %14, 3, !dbg !12
+  %19 = sdiv i32 %17, 32, !dbg !13
+  %20 = shl i32 %17, 7
+  %21 = shl i32 %19, 15
+  %22 = add i32 %21, %20
+  %23 = add i32 %22, 4096
+  %24 = zext nneg i32 %18 to i64, !dbg !14
+  br label %25, !dbg !14
+
+25:                                               ; preds = %11, %25
+  %indvars.iv = phi i64 [ 0, %11 ], [ %indvars.iv.next, %25 ]
+  %26 = phi float [ 0.000000e+00, %11 ], [ %47, %25 ]
+  %27 = phi float [ 0.000000e+00, %11 ], [ %45, %25 ]
+  %28 = trunc nuw nsw i64 %indvars.iv to i32, !dbg !15
+  %29 = or disjoint i32 %18, %28, !dbg !15
+  %30 = add i32 %23, %29, !dbg !15
+  %31 = sext i32 %30 to i64, !dbg !16
+  %32 = getelementptr bfloat, ptr addrspace(1) %2, i64 %31, !dbg !16
+  %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17
+  %34 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %32, i64 %33, i1 true) #6, !dbg !17
+  %35 = bitcast i16 %34 to bfloat, !dbg !17
+  %36 = fpext bfloat %35 to float, !dbg !18
+  %37 = add i32 %22, %29, !dbg !19
+  %38 = sext i32 %37 to i64, !dbg !20
+  %39 = getelementptr bfloat, ptr addrspace(1) %2, i64 %38, !dbg !20
+  %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !21
+  %41 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %39, i64 %40, i1 true) #6, !dbg !21
+  %42 = bitcast i16 %41 to bfloat, !dbg !21
+  %43 = fpext bfloat %42 to float, !dbg !22
+  %44 = fmul float %36, %36, !dbg !23
+  %45 = fadd float %27, %44, !dbg !24
+  %46 = fmul float %43, %43, !dbg !25
+  %47 = fadd float %26, %46, !dbg !26
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4, !dbg !14
+  %48 = icmp samesign ult i64 %indvars.iv, 124, !dbg !14
+  br i1 %48, label %25, label %49, !dbg !14
+
+49:                                               ; preds = %25
+  %50 = and i32 %14, 63, !dbg !10
+  %51 = or disjoint i32 %13, %50, !dbg !11
+  %52 = and i32 %14, 192, !dbg !12
+  %53 = lshr exact i32 %52, 6, !dbg !12
+  %54 = sdiv i32 %51, 32, !dbg !13
+  %55 = bitcast float %45 to i32, !dbg !27
+  %56 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %55, i32 2, i32 31), !dbg !27
+  %57 = bitcast i32 %56 to float, !dbg !27
+  %58 = fadd float %45, %57, !dbg !32
+  %59 = bitcast float %58 to i32, !dbg !27
+  %60 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %59, i32 1, i32 31), !dbg !27
+  %61 = bitcast i32 %60 to float, !dbg !27
+  %62 = fadd float %58, %61, !dbg !32
+  %63 = bitcast float %47 to i32, !dbg !33
+  %64 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 2, i32 31), !dbg !33
+  %65 = bitcast i32 %64 to float, !dbg !33
+  %66 = fadd float %47, %65, !dbg !35
+  %67 = bitcast float %66 to i32, !dbg !33
+  %68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 1, i32 31), !dbg !33
+  %69 = bitcast i32 %68 to float, !dbg !33
+  %70 = fadd float %66, %69, !dbg !35
+  %71 = shl i32 %19, 7, !dbg !36
+  %72 = tail call float @llvm.nvvm.div.full(float %70, float 1.280000e+02), !dbg !37
+  %73 = fadd float %72, 0x3EB0C6F7A0000000, !dbg !38
+  %74 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !39
+  %.not.i = icmp eq i32 %74, 0, !dbg !39
+  br i1 %.not.i, label %77, label %75, !dbg !39
+
+75:                                               ; preds = %49
+  %76 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %73), !dbg !39
+  br label %__nv_rsqrtf.exit, !dbg !39
+
+77:                                               ; preds = %49
+  %78 = tail call float @llvm.nvvm.rsqrt.approx.f(float %73), !dbg !39
+  br label %__nv_rsqrtf.exit, !dbg !39
+
+__nv_rsqrtf.exit:                                 ; preds = %75, %77
+  %.0.i = phi float [ %76, %75 ], [ %78, %77 ], !dbg !39
+  %79 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %15, !dbg !40
+  store float %.0.i, ptr addrspace(3) %79, align 4, !dbg !40
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !40
+  %80 = shl nuw nsw i32 %50, 2, !dbg !40
+  %81 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %80, !dbg !40
+  %82 = load float, ptr addrspace(3) %81, align 4, !dbg !40
+  %83 = tail call float @llvm.nvvm.div.full(float %62, float 1.280000e+02), !dbg !41
+  %84 = fadd float %83, 0x3EB0C6F7A0000000, !dbg !42
+  %85 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !43
+  %.not.i2 = icmp eq i32 %85, 0, !dbg !43
+  br i1 %.not.i2, label %88, label %86, !dbg !43
+
+86:                                               ; preds = %__nv_rsqrtf.exit
+  %87 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %84), !dbg !43
+  br label %__nv_rsqrtf.exit4, !dbg !43
+
+88:                                               ; preds = %__nv_rsqrtf.exit
+  %89 = tail call float @llvm.nvvm.rsqrt.approx.f(float %84), !dbg !43
+  br label %__nv_rsqrtf.exit4, !dbg !43
+
+__nv_rsqrtf.exit4:                                ; preds = %86, %88
+  %.0.i3 = phi float [ %87, %86 ], [ %89, %88 ], !dbg !43
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !44
+  store float %.0.i3, ptr addrspace(3) %79, align 4, !dbg !44
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !44
+  %90 = load float, ptr addrspace(3) %81, align 4, !dbg !44
+  %91 = shl i32 %17, 7, !dbg !45
+  %92 = and i32 %53, 1
+  %.masked = and i32 %53, 2
+  %93 = shl nuw nsw i32 %18, 5
+  %94 = and i32 %14, 96
+  %95 = shl nuw nsw i32 %94, 3
+  %96 = or disjoint i32 %93, %95
+  %97 = xor i32 %96, %15
+  %98 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %97
+  %99 = and i32 %14, 24
+  %100 = shl nuw nsw i32 %99, 5
+  %101 = lshr exact i32 %52, 1
+  %102 = or disjoint i32 %100, %80
+  %103 = xor i32 %102, %101
+  %104 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %103
+  %105 = icmp eq i32 %92, 0
+  %106 = shl i32 %51, 7
+  %107 = shl i32 %54, 15
+  %108 = add i32 %107, %106
+  %109 = icmp ne i32 %92, 0
+  %110 = add i32 %108, 4097
+  %111 = add i32 %108, 4096
+  %112 = shl nuw nsw i32 %99, 4
+  %113 = shl nuw nsw i32 %14, 2
+  %114 = and i32 %113, 124
+  %115 = lshr i32 %14, 4
+  %116 = and i32 %115, 2
+  %117 = or disjoint i32 %112, %114
+  %118 = xor i32 %117, %101
+  %119 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %116
+  %120 = getelementptr inbounds nuw i8, ptr addrspace(3) %119, i32 %118
+  %121 = shl nuw nsw i32 %94, 2
+  %122 = and i32 %14, 124
+  %123 = lshr i32 %14, 6
+  %124 = and i32 %123, 2
+  %125 = or disjoint i32 %93, %121
+  %126 = xor i32 %125, %122
+  %127 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %124
+  %128 = getelementptr inbounds nuw i8, ptr addrspace(3) %127, i32 %126
+  %129 = zext nneg i32 %.masked to i64, !dbg !46
+  %130 = sext i32 %71 to i64, !dbg !46
+  %131 = sext i32 %91 to i64, !dbg !46
+  br label %132, !dbg !46
+
+132:                                              ; preds = %__nv_rsqrtf.exit4, %132
+  %indvars.iv7 = phi i64 [ 0, %__nv_rsqrtf.exit4 ], [ %indvars.iv.next8, %132 ]
+  %133 = or disjoint i64 %indvars.iv7, %24, !dbg !47
+  %134 = or disjoint i64 %indvars.iv7, %129, !dbg !48
+  %135 = trunc nuw nsw i64 %133 to i32, !dbg !49
+  %136 = add i32 %22, %135, !dbg !49
+  %137 = sext i32 %136 to i64, !dbg !50
+  %138 = getelementptr bfloat, ptr addrspace(1) %2, i64 %137, !dbg !50
+  %139 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !51
+  %140 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %138, i64 %139, i1 true) #6, !dbg !51
+  %141 = bitcast i16 %140 to bfloat, !dbg !51
+  %142 = fpext bfloat %141 to float, !dbg !52
+  %143 = getelementptr bfloat, ptr addrspace(1) %3, i64 %133, !dbg !53
+  %144 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !54
+  %145 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %143, i64 %144, i1 true) #6, !dbg !54
+  %146 = bitcast i16 %145 to bfloat, !dbg !54
+  %147 = fpext bfloat %146 to float, !dbg !55
+  %148 = add nuw nsw i64 %133, %130, !dbg !56
+  %149 = getelementptr float, ptr addrspace(1) %4, i64 %148, !dbg !57
+  %150 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58
+  %151 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %149, i64 %150, i1 true) #6, !dbg !58
+  %152 = bitcast i32 %151 to float, !dbg !58
+  %153 = getelementptr float, ptr addrspace(1) %5, i64 %148, !dbg !59
+  %154 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !60
+  %155 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %153, i64 %154, i1 true) #6, !dbg !60
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !60
+  %156 = insertelement <1 x i32> poison, i32 %155, i64 0, !dbg !60
+  store <1 x i32> %156, ptr addrspace(3) %98, align 4, !dbg !60
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !60
+  %157 = load float, ptr addrspace(3) %104, align 4, !dbg !60
+  %158 = add i32 %23, %135, !dbg !61
+  %159 = sext i32 %158 to i64, !dbg !62
+  %160 = getelementptr bfloat, ptr addrspace(1) %2, i64 %159, !dbg !62
+  %161 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !63
+  %162 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %160, i64 %161, i1 true) #6, !dbg !63
+  %163 = bitcast i16 %162 to bfloat, !dbg !63
+  %164 = fpext bfloat %163 to float, !dbg !64
+  %165 = getelementptr bfloat, ptr addrspace(1) %6, i64 %133, !dbg !65
+  %166 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !66
+  %167 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %165, i64 %166, i1 true) #6, !dbg !66
+  %168 = bitcast i16 %167 to bfloat, !dbg !66
+  %169 = fpext bfloat %168 to float, !dbg !67
+  %170 = or disjoint i64 %134, 1, !dbg !68
+  %171 = trunc nuw nsw i64 %170 to i32, !dbg !69
+  %172 = add i32 %108, %171, !dbg !69
+  %173 = sext i32 %172 to i64, !dbg !70
+  %174 = getelementptr bfloat, ptr addrspace(1) %2, i64 %173, !dbg !70
+  %175 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !71
+  %176 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %174, i64 %175, i1 %105) #6, !dbg !71
+  %177 = bitcast i16 %176 to bfloat, !dbg !71
+  %178 = fpext bfloat %177 to float, !dbg !72
+  %179 = fmul float %82, %178, !dbg !40
+  %180 = getelementptr bfloat, ptr addrspace(1) %3, i64 %170, !dbg !73
+  %181 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !74
+  %182 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %180, i64 %181, i1 %105) #6, !dbg !74
+  %183 = bitcast i16 %182 to bfloat, !dbg !74
+  %184 = fpext bfloat %183 to float, !dbg !75
+  %185 = fmul float %179, %184, !dbg !76
+  %186 = fsub float 0.000000e+00, %185, !dbg !77
+  %187 = trunc nuw nsw i64 %134 to i32, !dbg !78
+  %188 = add i32 %108, %187, !dbg !78
+  %189 = sext i32 %188 to i64, !dbg !79
+  %190 = getelementptr bfloat, ptr addrspace(1) %2, i64 %189, !dbg !79
+  %191 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !80
+  %192 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %190, i64 %191, i1 %109) #6, !dbg !80
+  %193 = bitcast i16 %192 to bfloat, !dbg !80
+  %194 = fpext bfloat %193 to float, !dbg !81
+  %195 = fmul float %82, %194, !dbg !82
+  %196 = getelementptr bfloat, ptr addrspace(1) %3, i64 %134, !dbg !83
+  %197 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !84
+  %198 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %196, i64 %197, i1 %109) #6, !dbg !84
+  %199 = bitcast i16 %198 to bfloat, !dbg !84
+  %200 = fpext bfloat %199 to float, !dbg !85
+  %201 = fmul float %195, %200, !dbg !86
+  %202 = select i1 %105, float %186, float %201, !dbg !87
+  %203 = fmul float %.0.i, %142, !dbg !88
+  %204 = fmul float %203, %147, !dbg !89
+  %205 = fmul float %204, %152, !dbg !90
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !90
+  store float %205, ptr addrspace(3) %98, align 4, !dbg !90
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !90
+  %206 = load float, ptr addrspace(3) %104, align 4, !dbg !90
+  %207 = fmul float %157, %202, !dbg !91
+  %208 = fadd float %206, %207, !dbg !92
+  %209 = add i32 %110, %187, !dbg !93
+  %210 = sext i32 %209 to i64, !dbg !94
+  %211 = getelementptr bfloat, ptr addrspace(1) %2, i64 %210, !dbg !94
+  %212 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !95
+  %213 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %211, i64 %212, i1 %105) #6, !dbg !95
+  %214 = bitcast i16 %213 to bfloat, !dbg !95
+  %215 = fpext bfloat %214 to float, !dbg !96
+  %216 = fmul float %90, %215, !dbg !44
+  %217 = getelementptr bfloat, ptr addrspace(1) %6, i64 %170, !dbg !97
+  %218 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !98
+  %219 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %217, i64 %218, i1 %105) #6, !dbg !98
+  %220 = bitcast i16 %219 to bfloat, !dbg !98
+  %221 = fpext bfloat %220 to float, !dbg !99
+  %222 = fmul float %216, %221, !dbg !100
+  %223 = fsub float 0.000000e+00, %222, !dbg !101
+  %224 = add i32 %111, %187, !dbg !102
+  %225 = sext i32 %224 to i64, !dbg !103
+  %226 = getelementptr bfloat, ptr addrspace(1) %2, i64 %225, !dbg !103
+  %227 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !104
+  %228 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %226, i64 %227, i1 %109) #6, !dbg !104
+  %229 = bitcast i16 %228 to bfloat, !dbg !104
+  %230 = fpext bfloat %229 to float, !dbg !105
+  %231 = fmul float %90, %230, !dbg !106
+  %232 = getelementptr bfloat, ptr addrspace(1) %6, i64 %134, !dbg !107
+  %233 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !108
+  %234 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %232, i64 %233, i1 %109) #6, !dbg !108
+  %235 = bitcast i16 %234 to bfloat, !dbg !108
+  %236 = fpext bfloat %235 to float, !dbg !109
+  %237 = fmul float %231, %236, !dbg !110
+  %238 = select i1 %105, float %223, float %237, !dbg !87
+  %239 = fmul float %.0.i3, %164, !dbg !111
+  %240 = fmul float %239, %169, !dbg !112
+  %241 = fmul float %240, %152, !dbg !113
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !113
+  store float %241, ptr addrspace(3) %98, align 4, !dbg !113
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !113
+  %242 = load float, ptr addrspace(3) %104, align 4, !dbg !113
+  %243 = fmul float %157, %238, !dbg !114
+  %244 = fadd float %242, %243, !dbg !115
+  %245 = add nuw nsw i64 %133, %131, !dbg !116
+  %246 = getelementptr bfloat, ptr addrspace(1) %0, i64 %245, !dbg !117
+  %247 = fptrunc float %208 to bfloat, !dbg !118
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !118
+  store bfloat %247, ptr addrspace(3) %120, align 2, !dbg !118
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !118
+  %248 = load i16, ptr addrspace(3) %128, align 2, !dbg !118
+  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %248, ptr addrspace(1) %246, i1 true) #6, !dbg !118
+  %249 = getelementptr bfloat, ptr addrspace(1) %1, i64 %245, !dbg !119
+  %250 = fptrunc float %244 to bfloat, !dbg !120
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !120
+  store bfloat %250, ptr addrspace(3) %120, align 2, !dbg !120
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !120
+  %251 = load i16, ptr addrspace(3) %128, align 2, !dbg !120
+  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %251, ptr addrspace(1) %249, i1 true) #6, !dbg !120
+  %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 4, !dbg !46
+  %252 = icmp samesign ult i64 %indvars.iv7, 124, !dbg !46
+  br i1 %252, label %132, label %253, !dbg !46
+
+253:                                              ; preds = %132
+  ret void, !dbg !121
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #3
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #3
+
+attributes #0 = { nounwind "nvvm.reqntid"="256" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #4 = { convergent nocallback nounwind }
+attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!5 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", linkageName: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 23, column: 28, scope: !5)
+!9 = !DILocation(line: 23, column: 33, scope: !5)
+!10 = !DILocation(line: 24, column: 44, scope: !5)
+!11 = !DILocation(line: 24, column: 23, scope: !5)
+!12 = !DILocation(line: 26, column: 37, scope: !5)
+!13 = !DILocation(line: 29, column: 19, scope: !5)
+!14 = !DILocation(line: 33, column: 43, scope: !5)
+!15 = !DILocation(line: 39, column: 57, scope: !5)
+!16 = !DILocation(line: 39, column: 34, scope: !5)
+!17 = !DILocation(line: 39, column: 68, scope: !5)
+!18 = !DILocation(line: 39, column: 121, scope: !5)
+!19 = !DILocation(line: 40, column: 50, scope: !5)
+!20 = !DILocation(line: 40, column: 34, scope: !5)
+!21 = !DILocation(line: 40, column: 61, scope: !5)
+!22 = !DILocation(line: 40, column: 114, scope: !5)
+!23 = !DILocation(line: 42, column: 22, scope: !5)
+!24 = !DILocation(line: 44, column: 23, scope: !5)
+!25 = !DILocation(line: 47, column: 22, scope: !5)
+!26 = !DILocation(line: 49, column: 25, scope: !5)
+!27 = !DILocation(line: 293, column: 36, scope: !28, inlinedAt: !30)
+!28 = distinct !DILexicalBlockFile(scope: !5, file: !29, discriminator: 0)
+!29 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!30 = !DILocation(line: 51, column: 25, scope: !31)
+!31 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0)
+!32 = !DILocation(line: 263, column: 15, scope: !28, inlinedAt: !27)
+!33 = !DILocation(line: 293, column: 36, scope: !28, inlinedAt: !34)
+!34 = !DILocation(line: 52, column: 27, scope: !31)
+!35 = !DILocation(line: 263, column: 15, scope: !28, inlinedAt: !33)
+!36 = !DILocation(line: 63, column: 46, scope: !5)
+!37 = !DILocation(line: 75, column: 25, scope: !5)
+!38 = !DILocation(line: 77, column: 24, scope: !5)
+!39 = !DILocation(line: 78, column: 32, scope: !5)
+!40 = !DILocation(line: 79, column: 24, scope: !5)
+!41 = !DILocation(line: 123, column: 24, scope: !5)
+!42 = !DILocation(line: 124, column: 24, scope: !5)
+!43 = !DILocation(line: 125, column: 32, scope: !5)
+!44 = !DILocation(line: 126, column: 24, scope: !5)
+!45 = !DILocation(line: 161, column: 43, scope: !5)
+!46 = !DILocation(line: 53, column: 43, scope: !5)
+!47 = !DILocation(line: 54, column: 31, scope: !5)
+!48 = !DILocation(line: 72, column: 41, scope: !5)
+!49 = !DILocation(line: 61, column: 51, scope: !5)
+!50 = !DILocation(line: 61, column: 35, scope: !5)
+!51 = !DILocation(line: 61, column: 62, scope: !5)
+!52 = !DILocation(line: 61, column: 115, scope: !5)
+!53 = !DILocation(line: 62, column: 35, scope: !5)
+!54 = !DILocation(line: 62, column: 42, scope: !5)
+!55 = !DILocation(line: 62, column: 95, scope: !5)
+!56 = !DILocation(line: 63, column: 42, scope: !5)
+!57 = !DILocation(line: 63, column: 35, scope: !5)
+!58 = !DILocation(line: 63, column: 51, scope: !5)
+!59 = !DILocation(line: 64, column: 35, scope: !5)
+!60 = !DILocation(line: 64, column: 51, scope: !5)
+!61 = !DILocation(line: 65, column: 58, scope: !5)
+!62 = !DILocation(line: 65, column: 35, scope: !5)
+!63 = !DILocation(line: 65, column: 69, scope: !5)
+!64 = !DILocation(line: 65, column: 123, scope: !5)
+!65 = !DILocation(line: 66, column: 36, scope: !5)
+!66 = !DILocation(line: 66, column: 43, scope: !5)
+!67 = !DILocation(line: 66, column: 96, scope: !5)
+!68 = !DILocation(line: 72, column: 39, scope: !5)
+!69 = !DILocation(line: 72, column: 57, scope: !5)
+!70 = !DILocation(line: 72, column: 35, scope: !5)
+!71 = !DILocation(line: 72, column: 68, scope: !5)
+!72 = !DILocation(line: 72, column: 129, scope: !5)
+!73 = !DILocation(line: 80, column: 35, scope: !5)
+!74 = !DILocation(line: 80, column: 85, scope: !5)
+!75 = !DILocation(line: 80, column: 146, scope: !5)
+!76 = !DILocation(line: 82, column: 24, scope: !5)
+!77 = !DILocation(line: 84, column: 17, scope: !5)
+!78 = !DILocation(line: 90, column: 53, scope: !5)
+!79 = !DILocation(line: 90, column: 35, scope: !5)
+!80 = !DILocation(line: 90, column: 64, scope: !5)
+!81 = !DILocation(line: 90, column: 125, scope: !5)
+!82 = !DILocation(line: 97, column: 24, scope: !5)
+!83 = !DILocation(line: 98, column: 35, scope: !5)
+!84 = !DILocation(line: 98, column: 81, scope: !5)
+!85 = !DILocation(line: 98, column: 142, scope: !5)
+!86 = !DILocation(line: 100, column: 24, scope: !5)
+!87 = !DILocation(line: 0, scope: !5)
+!88 = !DILocation(line: 111, column: 24, scope: !5)
+!89 = !DILocation(line: 113, column: 24, scope: !5)
+!90 = !DILocation(line: 116, column: 24, scope: !5)
+!91 = !DILocation(line: 118, column: 24, scope: !5)
+!92 = !DILocation(line: 119, column: 24, scope: !5)
+!93 = !DILocation(line: 121, column: 60, scope: !5)
+!94 = !DILocation(line: 121, column: 35, scope: !5)
+!95 = !DILocation(line: 121, column: 71, scope: !5)
+!96 = !DILocation(line: 121, column: 132, scope: !5)
+!97 = !DILocation(line: 127, column: 35, scope: !5)
+!98 = !DILocation(line: 127, column: 85, scope: !5)
+!99 = !DILocation(line: 127, column: 146, scope: !5)
+!100 = !DILocation(line: 129, column: 24, scope: !5)
+!101 = !DILocation(line: 131, column: 17, scope: !5)
+!102 = !DILocation(line: 134, column: 60, scope: !5)
+!103 = !DILocation(line: 134, column: 35, scope: !5)
+!104 = !DILocation(line: 134, column: 71, scope: !5)
+!105 = !DILocation(line: 134, column: 132, scope: !5)
+!106 = !DILocation(line: 139, column: 24, scope: !5)
+!107 = !DILocation(line: 140, column: 35, scope: !5)
+!108 = !DILocation(line: 140, column: 81, scope: !5)
+!109 = !DILocation(line: 140, column: 142, scope: !5)
+!110 = !DILocation(line: 142, column: 24, scope: !5)
+!111 = !DILocation(line: 151, column: 25, scope: !5)
+!112 = !DILocation(line: 153, column: 26, scope: !5)
+!113 = !DILocation(line: 156, column: 26, scope: !5)
+!114 = !DILocation(line: 158, column: 26, scope: !5)
+!115 = !DILocation(line: 159, column: 26, scope: !5)
+!116 = !DILocation(line: 161, column: 39, scope: !5)
+!117 = !DILocation(line: 161, column: 32, scope: !5)
+!118 = !DILocation(line: 161, column: 55, scope: !5)
+!119 = !DILocation(line: 162, column: 32, scope: !5)
+!120 = !DILocation(line: 162, column: 56, scope: !5)
+!121 = !DILocation(line: 53, column: 4, scope: !5)
diff --git a/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..c99b13cafe49c2d9fe1eb750434114ea01398a2b
--- /dev/null
+++ b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx
@@ -0,0 +1,956 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 // -- Begin function triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0
+.extern .shared .align 16 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
+                                        // @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0
+.visible .entry triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6,
+	.param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_7,
+	.param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_8,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_9,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_10
+)
+.reqntid 256
+{
+	.reg .pred 	%p<7>;
+	.reg .b16 	%rs<21>;
+	.reg .b32 	%r<139>;
+	.reg .b64 	%rd<67>;
+	.loc	1 18 0                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd16, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6];
+	ld.param.b64 	%rd15, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5];
+	ld.param.b64 	%rd14, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4];
+	ld.param.b64 	%rd13, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3];
+	ld.param.b64 	%rd12, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2];
+	ld.param.b64 	%rd11, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1];
+	ld.param.b64 	%rd10, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0];
+$L__tmp0:
+	.loc	1 23 28                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:28
+	mov.u32 	%r16, %ctaid.x;
+	.loc	1 23 33                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:33
+	shl.b32 	%r1, %r16, 6;
+	.loc	1 24 44                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44
+	mov.u32 	%r2, %tid.x;
+	and.b32 	%r3, %r2, 252;
+	bfe.u32 	%r17, %r2, 2, 6;
+	.loc	1 24 23                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23
+	or.b32 	%r18, %r17, %r1;
+	.loc	1 26 37                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37
+	and.b32 	%r19, %r2, 3;
+	.loc	1 29 19                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19
+	bfe.s32 	%r20, %r16, 25, 1;
+	shr.u32 	%r21, %r20, 27;
+	add.s32 	%r22, %r18, %r21;
+	shr.s32 	%r4, %r22, 5;
+	shl.b32 	%r23, %r4, 15;
+	.loc	1 33 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:33:43
+	cvt.u64.u32 	%rd1, %r19;
+	shl.b32 	%r5, %r16, 13;
+	add.s32 	%r24, %r23, %r5;
+	shl.b32 	%r6, %r17, 7;
+	or.b32 	%r25, %r24, %r6;
+	or.b32 	%r26, %r25, %r19;
+	cvt.u64.u32 	%rd2, %r26;
+	mov.b32 	%r137, 0f00000000;
+	mov.b64 	%rd62, -4;
+	mov.b32 	%r138, %r137;
+$L__BB0_1:                              // =>This Inner Loop Header: Depth=1
+	.loc	1 39 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34
+	add.s64 	%rd21, %rd2, %rd62;
+	cvt.u32.u64 	%r27, %rd21;
+	add.s32 	%r28, %r27, 4100;
+	mad.wide.s32 	%rd18, %r28, 2, %rd12;
+	.loc	1 39 68                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68
+	// begin inline asm
+	mov.u64 %rd17, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0;
+	// end inline asm
+	mov.b16 	%rs2, 0;
+	mov.pred 	%p1, -1;
+	// begin inline asm
+	mov.u16 %rs1, %rs2;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs1 }, [ %rd18 + 0 ], %rd17;
+	// end inline asm
+	.loc	1 39 121                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:121
+	cvt.f32.bf16 	%r29, %rs1;
+	add.s32 	%r30, %r27, 4;
+	.loc	1 40 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34
+	mad.wide.s32 	%rd20, %r30, 2, %rd12;
+	.loc	1 40 61                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61
+	// begin inline asm
+	mov.u64 %rd19, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd19, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs3, %rs2;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs3 }, [ %rd20 + 0 ], %rd19;
+	// end inline asm
+	.loc	1 40 114                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114
+	cvt.f32.bf16 	%r31, %rs3;
+	.loc	1 44 23                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:44:23
+	fma.rn.f32 	%r138, %r29, %r29, %r138;
+	.loc	1 49 25                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:49:25
+	fma.rn.f32 	%r137, %r31, %r31, %r137;
+	.loc	1 33 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:33:43
+	add.s64 	%rd62, %rd62, 4;
+	setp.lt.u64 	%p2, %rd62, 124;
+	@%p2 bra 	$L__BB0_1;
+// %bb.2:                               // %__nv_rsqrtf.exit
+	.loc	1 0 43                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0:43
+	cvt.u32.u64 	%r32, %rd1;
+	.loc	1 24 44                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44
+	and.b32 	%r33, %r2, 63;
+	.loc	1 24 23                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23
+	or.b32 	%r34, %r1, %r33;
+	.loc	1 26 37                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37
+	and.b32 	%r35, %r2, 192;
+	.loc	1 29 19                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19
+	shr.s32 	%r36, %r1, 31;
+	shr.u32 	%r37, %r36, 27;
+	add.s32 	%r38, %r34, %r37;
+$L__tmp1:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r39, %r138, 2, 31, -1;
+$L__tmp2:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r40, %r138, %r39;
+$L__tmp3:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r41, %r40, 1, 31, -1;
+$L__tmp4:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r42, %r40, %r41;
+$L__tmp5:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r43, %r137, 2, 31, -1;
+$L__tmp6:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r44, %r137, %r43;
+$L__tmp7:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r45, %r44, 1, 31, -1;
+$L__tmp8:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r46, %r44, %r45;
+$L__tmp9:
+	.loc	1 63 46                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:46
+	shl.b32 	%r47, %r4, 7;
+	mov.b32 	%r48, 0f43000000;
+	.loc	1 75 25                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:75:25
+	div.full.f32 	%r49, %r46, %r48;
+	.loc	1 77 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:77:24
+	add.f32 	%r50, %r49, 0f358637BD;
+	.loc	1 78 32                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:78:32
+	rsqrt.approx.ftz.f32 	%r7, %r50;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	mov.b32 	%r51, global_smem;
+	add.s32 	%r52, %r51, %r3;
+	st.shared.b32 	[%r52], %r7;
+	bar.sync 	0;
+	shl.b32 	%r53, %r33, 2;
+	add.s32 	%r54, %r51, %r53;
+	ld.shared.b32 	%r8, [%r54];
+	.loc	1 123 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:123:24
+	div.full.f32 	%r55, %r42, %r48;
+	.loc	1 124 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:124:24
+	add.f32 	%r56, %r55, 0f358637BD;
+	.loc	1 125 32                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:125:32
+	rsqrt.approx.ftz.f32 	%r9, %r56;
+	.loc	1 126 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24
+	bar.sync 	0;
+	st.shared.b32 	[%r52], %r9;
+	bar.sync 	0;
+	ld.shared.b32 	%r10, [%r54];
+	bfe.u32 	%r11, %r35, 6, 1;
+	shl.b32 	%r57, %r32, 5;
+	and.b32 	%r58, %r2, 96;
+	shl.b32 	%r59, %r58, 3;
+	or.b32 	%r60, %r57, %r59;
+	xor.b32 	%r61, %r60, %r3;
+	add.s32 	%r12, %r51, %r61;
+	and.b32 	%r62, %r2, 24;
+	shl.b32 	%r63, %r62, 5;
+	shr.u32 	%r64, %r35, 1;
+	or.b32 	%r65, %r63, %r53;
+	xor.b32 	%r66, %r65, %r64;
+	add.s32 	%r13, %r51, %r66;
+	shl.b32 	%r67, %r38, 10;
+	and.b32 	%r68, %r67, -32768;
+	shl.b32 	%r69, %r62, 4;
+	shl.b32 	%r70, %r2, 2;
+	and.b32 	%r71, %r70, 124;
+	shr.u32 	%r72, %r2, 4;
+	and.b32 	%r73, %r72, 2;
+	or.b32 	%r74, %r69, %r71;
+	xor.b32 	%r75, %r74, %r64;
+	add.s32 	%r76, %r51, %r73;
+	add.s32 	%r14, %r76, %r75;
+	shl.b32 	%r77, %r58, 2;
+	and.b32 	%r78, %r2, 124;
+	shr.u32 	%r79, %r2, 6;
+	and.b32 	%r80, %r79, 2;
+	or.b32 	%r81, %r57, %r77;
+	xor.b32 	%r82, %r81, %r78;
+	add.s32 	%r83, %r51, %r80;
+	add.s32 	%r15, %r83, %r82;
+	.loc	1 53 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43
+	cvt.s64.s32 	%rd22, %r47;
+	cvt.u64.u32 	%rd23, %r79;
+	bfe.u64 	%rd24, %rd23, 1, 1;
+	shl.b64 	%rd25, %rd24, 2;
+	add.s64 	%rd3, %rd16, %rd25;
+	add.s64 	%rd4, %rd13, %rd25;
+	add.s32 	%r84, %r68, %r5;
+	shl.b32 	%r85, %r33, 7;
+	add.s32 	%r86, %r84, %r85;
+	cvt.u32.u64 	%r87, %rd24;
+	shl.b32 	%r88, %r87, 1;
+	add.s32 	%r89, %r86, %r88;
+	cvt.u64.u32 	%rd5, %r89;
+	add.s32 	%r90, %r5, %r6;
+	cvt.s64.s32 	%rd26, %r90;
+	add.s64 	%rd27, %rd26, %rd1;
+	shl.b64 	%rd28, %rd27, 1;
+	add.s64 	%rd6, %rd11, %rd28;
+	add.s64 	%rd7, %rd10, %rd28;
+	shl.b64 	%rd29, %rd1, 1;
+	add.s64 	%rd8, %rd16, %rd29;
+	or.b64 	%rd30, %rd22, %rd1;
+	shl.b64 	%rd31, %rd30, 2;
+	add.s64 	%rd64, %rd15, %rd31;
+	add.s64 	%rd63, %rd14, %rd31;
+	add.s64 	%rd9, %rd13, %rd29;
+	mov.b64 	%rd66, -4;
+	mov.b64 	%rd65, 0;
+	setp.ne.b32 	%p5, %r11, 0;
+	setp.eq.b32 	%p4, %r11, 0;
+$L__BB0_3:                              // =>This Inner Loop Header: Depth=1
+	.loc	1 61 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:35
+	add.s64 	%rd60, %rd2, %rd66;
+	cvt.u32.u64 	%r94, %rd60;
+	add.s32 	%r95, %r94, 4;
+	mad.wide.s32 	%rd33, %r95, 2, %rd12;
+	.loc	1 61 62                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:62
+	// begin inline asm
+	mov.u64 %rd32, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd32, 1.0;
+	// end inline asm
+	mov.b16 	%rs5, 0;
+	mov.pred 	%p3, -1;
+	// begin inline asm
+	mov.u16 %rs4, %rs5;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs4 }, [ %rd33 + 0 ], %rd32;
+	// end inline asm
+	.loc	1 61 115                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:115
+	cvt.f32.bf16 	%r96, %rs4;
+	.loc	1 62 42                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:42
+	add.s64 	%rd35, %rd9, %rd65;
+	// begin inline asm
+	mov.u64 %rd34, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd34, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs6, %rs5;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs6 }, [ %rd35 + 0 ], %rd34;
+	// end inline asm
+	.loc	1 62 95                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:95
+	cvt.f32.bf16 	%r97, %rs6;
+	.loc	1 63 51                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:51
+	// begin inline asm
+	mov.u64 %rd36, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd36, 1.0;
+	// end inline asm
+	mov.b32 	%r92, 0;
+	// begin inline asm
+	mov.u32 %r91, %r92;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b32 { %r91 }, [ %rd63 + 0 ], %rd36;
+	// end inline asm
+	.loc	1 64 51                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:51
+	// begin inline asm
+	mov.u64 %rd37, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd37, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r93, %r92;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b32 { %r93 }, [ %rd64 + 0 ], %rd37;
+	// end inline asm
+	bar.sync 	0;
+	st.shared.b32 	[%r12], %r93;
+	bar.sync 	0;
+	ld.shared.b32 	%r98, [%r13];
+	add.s32 	%r99, %r94, 4100;
+	.loc	1 65 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:35
+	mad.wide.s32 	%rd39, %r99, 2, %rd12;
+	.loc	1 65 69                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69
+	// begin inline asm
+	mov.u64 %rd38, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd38, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs7, %rs5;
+	@%p3 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs7 }, [ %rd39 + 0 ], %rd38;
+	// end inline asm
+	.loc	1 65 123                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:123
+	cvt.f32.bf16 	%r100, %rs7;
+	.loc	1 66 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:43
+	add.s64 	%rd41, %rd8, %rd65;
+	// begin inline asm
+	mov.u64 %rd40, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd40, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs8, %rs5;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs8 }, [ %rd41 + 0 ], %rd40;
+	// end inline asm
+	.loc	1 66 96                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:96
+	cvt.f32.bf16 	%r101, %rs8;
+	.loc	1 72 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35
+	add.s64 	%rd61, %rd5, %rd66;
+	cvt.u32.u64 	%r102, %rd61;
+	add.s32 	%r103, %r102, 5;
+	mad.wide.s32 	%rd43, %r103, 2, %rd12;
+	.loc	1 72 68                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:68
+	// begin inline asm
+	mov.u64 %rd42, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd42, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs9, %rs5;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd43 + 0 ], %rd42;
+	// end inline asm
+	.loc	1 72 129                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129
+	cvt.f32.bf16 	%r104, %rs9;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	mul.f32 	%r105, %r8, %r104;
+	.loc	1 80 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:35
+	add.s64 	%rd49, %rd4, %rd65;
+	.loc	1 80 85                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:85
+	add.s64 	%rd45, %rd49, 2;
+	// begin inline asm
+	mov.u64 %rd44, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd44, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs10, %rs5;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs10 }, [ %rd45 + 0 ], %rd44;
+	// end inline asm
+	.loc	1 80 146                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146
+	cvt.f32.bf16 	%r106, %rs10;
+	.loc	1 84 17                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17
+	neg.f32 	%r107, %r105;
+	fma.rn.f32 	%r108, %r107, %r106, 0f00000000;
+	add.s32 	%r109, %r102, 4;
+	.loc	1 90 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:35
+	mad.wide.s32 	%rd47, %r109, 2, %rd12;
+	.loc	1 90 64                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:64
+	// begin inline asm
+	mov.u64 %rd46, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd46, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs11, %rs5;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd47 + 0 ], %rd46;
+	// end inline asm
+	.loc	1 90 125                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125
+	cvt.f32.bf16 	%r110, %rs11;
+	.loc	1 97 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24
+	mul.f32 	%r111, %r8, %r110;
+	.loc	1 98 81                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:81
+	// begin inline asm
+	mov.u64 %rd48, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd48, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs12, %rs5;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd49 + 0 ], %rd48;
+	// end inline asm
+	.loc	1 98 142                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142
+	cvt.f32.bf16 	%r112, %rs12;
+	.loc	1 100 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24
+	mul.f32 	%r113, %r111, %r112;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r114, %r108, %r113, %p4;
+	.loc	1 111 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:111:24
+	mul.f32 	%r115, %r7, %r96;
+	.loc	1 113 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:113:24
+	mul.f32 	%r116, %r115, %r97;
+	.loc	1 116 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:116:24
+	mul.f32 	%r117, %r116, %r91;
+	bar.sync 	0;
+	st.shared.b32 	[%r12], %r117;
+	bar.sync 	0;
+	ld.shared.b32 	%r118, [%r13];
+	.loc	1 119 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24
+	fma.rn.f32 	%r119, %r98, %r114, %r118;
+	add.s32 	%r120, %r102, 4101;
+	.loc	1 121 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:35
+	mad.wide.s32 	%rd51, %r120, 2, %rd12;
+	.loc	1 121 71                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:71
+	// begin inline asm
+	mov.u64 %rd50, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd50, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs13, %rs5;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd51 + 0 ], %rd50;
+	// end inline asm
+	.loc	1 121 132                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:132
+	cvt.f32.bf16 	%r121, %rs13;
+	.loc	1 126 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24
+	mul.f32 	%r122, %r10, %r121;
+	.loc	1 127 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:35
+	add.s64 	%rd57, %rd3, %rd65;
+	.loc	1 127 85                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:85
+	add.s64 	%rd53, %rd57, 2;
+	// begin inline asm
+	mov.u64 %rd52, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd52, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs14, %rs5;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs14 }, [ %rd53 + 0 ], %rd52;
+	// end inline asm
+	.loc	1 127 146                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:146
+	cvt.f32.bf16 	%r123, %rs14;
+	.loc	1 131 17                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:131:17
+	neg.f32 	%r124, %r122;
+	fma.rn.f32 	%r125, %r124, %r123, 0f00000000;
+	add.s32 	%r126, %r102, 4100;
+	.loc	1 134 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:35
+	mad.wide.s32 	%rd55, %r126, 2, %rd12;
+	.loc	1 134 71                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:71
+	// begin inline asm
+	mov.u64 %rd54, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd54, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs15, %rs5;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd55 + 0 ], %rd54;
+	// end inline asm
+	.loc	1 134 132                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:132
+	cvt.f32.bf16 	%r127, %rs15;
+	.loc	1 139 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:139:24
+	mul.f32 	%r128, %r10, %r127;
+	.loc	1 140 81                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:81
+	// begin inline asm
+	mov.u64 %rd56, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd56, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs16, %rs5;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs16 }, [ %rd57 + 0 ], %rd56;
+	// end inline asm
+	.loc	1 140 142                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:142
+	cvt.f32.bf16 	%r129, %rs16;
+	.loc	1 142 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:142:24
+	mul.f32 	%r130, %r128, %r129;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r131, %r125, %r130, %p4;
+	.loc	1 151 25                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:151:25
+	mul.f32 	%r132, %r9, %r100;
+	.loc	1 153 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:153:26
+	mul.f32 	%r133, %r132, %r101;
+	.loc	1 156 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26
+	mul.f32 	%r134, %r133, %r91;
+	bar.sync 	0;
+	st.shared.b32 	[%r12], %r134;
+	bar.sync 	0;
+	ld.shared.b32 	%r135, [%r13];
+	.loc	1 159 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:159:26
+	fma.rn.f32 	%r136, %r98, %r131, %r135;
+	.loc	1 161 55                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:55
+	add.s64 	%rd58, %rd7, %rd65;
+	cvt.rn.bf16.f32 	%rs19, %r119;
+	bar.sync 	0;
+	st.shared.b16 	[%r14], %rs19;
+	bar.sync 	0;
+	ld.shared.b16 	%rs17, [%r15];
+	// begin inline asm
+	@%p3 st.global.b16 [ %rd58 + 0 ], { %rs17 };
+	// end inline asm
+	.loc	1 162 56                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:56
+	add.s64 	%rd59, %rd6, %rd65;
+	cvt.rn.bf16.f32 	%rs20, %r136;
+	bar.sync 	0;
+	st.shared.b16 	[%r14], %rs20;
+	bar.sync 	0;
+	ld.shared.b16 	%rs18, [%r15];
+	// begin inline asm
+	@%p3 st.global.b16 [ %rd59 + 0 ], { %rs18 };
+	// end inline asm
+	.loc	1 53 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43
+	add.s64 	%rd66, %rd66, 4;
+	add.s64 	%rd65, %rd65, 8;
+	add.s64 	%rd64, %rd64, 16;
+	add.s64 	%rd63, %rd63, 16;
+	setp.lt.u64 	%p6, %rd66, 124;
+	@%p6 bra 	$L__BB0_3;
+// %bb.4:
+	.loc	1 53 4                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:4
+	ret;
+$L__tmp10:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 5                                   // DW_FORM_data2
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 456                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x1c1 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 98
+.b8 118
+.b8 113
+.b8 104
+.b8 106
+.b8 116
+.b8 121
+.b8 103
+.b8 55
+.b8 102
+.b8 118
+.b8 120
+.b8 122
+.b8 119
+.b8 116
+.b8 98
+.b8 116
+.b8 116
+.b8 52
+.b8 118
+.b8 114
+.b8 100
+.b8 107
+.b8 98
+.b8 110
+.b8 98
+.b8 54
+.b8 110
+.b8 51
+.b8 50
+.b8 102
+.b8 110
+.b8 114
+.b8 105
+.b8 106
+.b8 106
+.b8 112
+.b8 108
+.b8 51
+.b8 118
+.b8 118
+.b8 52
+.b8 99
+.b8 102
+.b8 113
+.b8 100
+.b8 52
+.b8 109
+.b8 122
+.b8 110
+.b8 114
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 98
+.b8 118
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x6d DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 114
+.b8 109
+.b8 115
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 95
+.b8 116
+.b8 111
+.b8 95
+.b8 99
+.b8 111
+.b8 112
+.b8 121
+.b8 95
+.b8 97
+.b8 100
+.b8 100
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 110
+.b8 101
+.b8 103
+.b8 95
+.b8 115
+.b8 112
+.b8 108
+.b8 105
+.b8 116
+.b8 95
+.b8 115
+.b8 112
+.b8 108
+.b8 105
+.b8 116
+.b8 95
+.b8 119
+.b8 105
+.b8 116
+.b8 104
+.b8 95
+.b8 115
+.b8 105
+.b8 122
+.b8 101
+.b8 115
+.b8 95
+.b8 115
+.b8 116
+.b8 97
+.b8 99
+.b8 107
+.b8 95
+.b8 117
+.b8 110
+.b8 98
+.b8 105
+.b8 110
+.b8 100
+.b8 95
+.b8 117
+.b8 110
+.b8 115
+.b8 113
+.b8 117
+.b8 101
+.b8 101
+.b8 122
+.b8 101
+.b8 95
+.b8 118
+.b8 105
+.b8 101
+.b8 119
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x151:0x7a DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x166:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp5                           // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 51                                  // DW_AT_call_line
+.b8 25                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x17e:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp2                           // DW_AT_low_pc
+.b64 $L__tmp5                           // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 4                                   // Abbrev [4] 0x198:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp5                           // DW_AT_low_pc
+.b64 $L__tmp9                           // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 52                                  // DW_AT_call_line
+.b8 27                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x1b0:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp6                           // DW_AT_low_pc
+.b64 $L__tmp9                           // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..37e71ceadfc771e88cdc1d056801274caec02071
--- /dev/null
+++ b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source
@@ -0,0 +1,972 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc213 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0)
+#loc215 = loc(unknown)
+#loc218 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0)
+#loc222 = loc("in_out_ptr0"(#loc))
+#loc223 = loc("in_out_ptr1"(#loc))
+#loc224 = loc("in_ptr0"(#loc))
+#loc225 = loc("in_ptr1"(#loc))
+#loc226 = loc("in_ptr2"(#loc))
+#loc227 = loc("in_ptr3"(#loc))
+#loc228 = loc("in_ptr4"(#loc))
+#loc229 = loc("xnumel"(#loc))
+#loc230 = loc("r0_numel"(#loc))
+#loc432 = loc("input"(#loc213))
+#loc433 = loc("a"(#loc218))
+#loc434 = loc("b"(#loc218))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 73728 : i32 loc(#loc231)
+    %r0_numel_1 = arith.constant 128 : i32 loc(#loc232)
+    %xoffset = tt.get_program_id x : i32 loc(#loc233)
+    %xoffset_2 = arith.constant 64 : i32 loc(#loc234)
+    %xoffset_3 = arith.constant 64 : i32 loc(#loc234)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc234)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc235)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc236)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc237)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc237)
+    %xmask = arith.constant true loc(#loc238)
+    %xmask_8 = arith.constant dense<true> : tensor<64x4xi1> loc(#loc238)
+    %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc239)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc240)
+    %x0 = arith.constant 32 : i32 loc(#loc241)
+    %x0_10 = arith.constant 32 : i32 loc(#loc241)
+    %x0_11 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc241)
+    %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc241)
+    %x1 = arith.constant 32 : i32 loc(#loc242)
+    %x1_13 = arith.constant 32 : i32 loc(#loc242)
+    %x1_14 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc242)
+    %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc242)
+    %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc243)
+    %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc243)
+    %_tmp10 = arith.constant 0.000000e+00 : f32 loc(#loc244)
+    %_tmp10_17 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc244)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc15)
+    %c4_i32 = arith.constant 4 : i32 loc(#loc15)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15)
+    %2 = arith.bitcast %c4_i32 : i32 to i32 loc(#loc15)
+    %3 = ub.poison : i32 loc(#loc15)
+    %_tmp10_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_16, %_tmp10_24 = %_tmp10_17) -> (tensor<64x4xf32>, tensor<64x4xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc246)
+      %r0_index_25 = arith.addi %r0_index, %r0_base_9 : tensor<1x4xi32> loc(#loc246)
+      %r0_mask = arith.constant dense<128> : tensor<1x4xi32> loc(#loc247)
+      %r0_mask_26 = arith.cmpi slt, %r0_index_25, %r0_mask : tensor<1x4xi32> loc(#loc247)
+      %tmp0 = arith.constant 4096 : i32 loc(#loc248)
+      %tmp0_27 = arith.constant 4096 : i32 loc(#loc248)
+      %tmp0_28 = arith.constant dense<4096> : tensor<1x4xi32> loc(#loc248)
+      %tmp0_29 = arith.addi %tmp0_28, %r0_index_25 : tensor<1x4xi32> loc(#loc248)
+      %tmp0_30 = arith.constant 128 : i32 loc(#loc249)
+      %tmp0_31 = arith.constant 128 : i32 loc(#loc249)
+      %tmp0_32 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc249)
+      %tmp0_33 = arith.muli %tmp0_32, %x0_12 : tensor<64x1xi32> loc(#loc249)
+      %tmp0_34 = tt.broadcast %tmp0_29 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc250)
+      %tmp0_35 = tt.broadcast %tmp0_33 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc250)
+      %tmp0_36 = arith.addi %tmp0_34, %tmp0_35 : tensor<64x4xi32> loc(#loc250)
+      %tmp0_37 = arith.constant 36864 : i32 loc(#loc251)
+      %tmp0_38 = arith.constant 36864 : i32 loc(#loc251)
+      %tmp0_39 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc251)
+      %tmp0_40 = arith.muli %tmp0_39, %x1_15 : tensor<64x1xi32> loc(#loc251)
+      %tmp0_41 = tt.broadcast %tmp0_40 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc252)
+      %tmp0_42 = arith.addi %tmp0_36, %tmp0_41 : tensor<64x4xi32> loc(#loc252)
+      %tmp0_43 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc253)
+      %tmp0_44 = tt.addptr %tmp0_43, %tmp0_42 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc253)
+      %tmp0_45 = arith.constant 0.000000e+00 : f32 loc(#loc254)
+      %tmp0_46 = tt.broadcast %r0_mask_26 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc254)
+      %tmp0_47 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc254)
+      %tmp0_48 = arith.truncf %tmp0_47 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc254)
+      %tmp0_49 = tt.load %tmp0_44, %tmp0_46, %tmp0_48 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>> loc(#loc254)
+      %tmp0_50 = arith.extf %tmp0_49 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc255)
+      %tmp6 = arith.constant 128 : i32 loc(#loc256)
+      %tmp6_51 = arith.constant 128 : i32 loc(#loc256)
+      %tmp6_52 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc256)
+      %tmp6_53 = arith.muli %tmp6_52, %x0_12 : tensor<64x1xi32> loc(#loc256)
+      %tmp6_54 = tt.broadcast %r0_index_25 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc257)
+      %tmp6_55 = tt.broadcast %tmp6_53 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc257)
+      %tmp6_56 = arith.addi %tmp6_54, %tmp6_55 : tensor<64x4xi32> loc(#loc257)
+      %tmp6_57 = arith.constant 36864 : i32 loc(#loc258)
+      %tmp6_58 = arith.constant 36864 : i32 loc(#loc258)
+      %tmp6_59 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc258)
+      %tmp6_60 = arith.muli %tmp6_59, %x1_15 : tensor<64x1xi32> loc(#loc258)
+      %tmp6_61 = tt.broadcast %tmp6_60 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc259)
+      %tmp6_62 = arith.addi %tmp6_56, %tmp6_61 : tensor<64x4xi32> loc(#loc259)
+      %tmp6_63 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc260)
+      %tmp6_64 = tt.addptr %tmp6_63, %tmp6_62 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc260)
+      %tmp6_65 = arith.constant 0.000000e+00 : f32 loc(#loc261)
+      %tmp6_66 = tt.broadcast %r0_mask_26 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc261)
+      %tmp6_67 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc261)
+      %tmp6_68 = arith.truncf %tmp6_67 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc261)
+      %tmp6_69 = tt.load %tmp6_64, %tmp6_66, %tmp6_68 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>> loc(#loc261)
+      %tmp6_70 = arith.extf %tmp6_69 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc262)
+      %tmp2 = arith.mulf %tmp0_50, %tmp0_50 : tensor<64x4xf32> loc(#loc263)
+      %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<64x4xf32> loc(#loc264)
+      %_tmp4_71 = tt.broadcast %r0_mask_26 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc265)
+      %_tmp4_72 = arith.select %_tmp4_71, %tmp5, %_tmp4_23 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc265)
+      %tmp8 = arith.mulf %tmp6_70, %tmp6_70 : tensor<64x4xf32> loc(#loc266)
+      %tmp11 = arith.addf %_tmp10_24, %tmp8 : tensor<64x4xf32> loc(#loc267)
+      %_tmp10_73 = tt.broadcast %r0_mask_26 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc268)
+      %_tmp10_74 = arith.select %_tmp10_73, %tmp11, %_tmp10_24 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc268)
+      scf.yield %_tmp4_72, %_tmp10_74 : tensor<64x4xf32>, tensor<64x4xf32> loc(#loc39)
+    } loc(#loc435)
+    %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#0) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc269)
+    %tmp4_19 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc270)
+    %tmp10 = tt.call @"triton.language.standard.sum__fp32S64_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#1) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc271)
+    %tmp10_20 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc272)
+    %c0_i32_21 = arith.constant 0 : i32 loc(#loc44)
+    %c4_i32_22 = arith.constant 4 : i32 loc(#loc44)
+    %4 = arith.bitcast %c0_i32_21 : i32 to i32 loc(#loc44)
+    %5 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc44)
+    %6 = arith.bitcast %c4_i32_22 : i32 to i32 loc(#loc44)
+    %7 = ub.poison : i32 loc(#loc44)
+    scf.for %r0_offset = %4 to %5 step %6  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc273)
+      %r0_index_23 = arith.addi %r0_index, %r0_base_9 : tensor<1x4xi32> loc(#loc273)
+      %r0_mask = arith.constant dense<128> : tensor<1x4xi32> loc(#loc274)
+      %r0_mask_24 = arith.cmpi slt, %r0_index_23, %r0_mask : tensor<1x4xi32> loc(#loc274)
+      %r0_3 = arith.constant 2 : i32 loc(#loc275)
+      %r0_3_25 = arith.constant 2 : i32 loc(#loc275)
+      %r0_3_26 = arith.constant dense<2> : tensor<1x4xi32> loc(#loc275)
+      %r0_3_27 = arith.remsi %r0_index_23, %r0_3_26 : tensor<1x4xi32> loc(#loc275)
+      %r0_4 = arith.constant 2 : i32 loc(#loc276)
+      %r0_4_28 = arith.constant 2 : i32 loc(#loc276)
+      %r0_4_29 = arith.constant dense<2> : tensor<1x4xi32> loc(#loc276)
+      %r0_4_30 = arith.divsi %r0_index_23, %r0_4_29 : tensor<1x4xi32> loc(#loc276)
+      %tmp50 = arith.constant 128 : i32 loc(#loc277)
+      %tmp50_31 = arith.constant 128 : i32 loc(#loc277)
+      %tmp50_32 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc277)
+      %tmp50_33 = arith.muli %tmp50_32, %x0_12 : tensor<64x1xi32> loc(#loc277)
+      %tmp50_34 = tt.broadcast %r0_index_23 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc278)
+      %tmp50_35 = tt.broadcast %tmp50_33 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc278)
+      %tmp50_36 = arith.addi %tmp50_34, %tmp50_35 : tensor<64x4xi32> loc(#loc278)
+      %tmp50_37 = arith.constant 36864 : i32 loc(#loc279)
+      %tmp50_38 = arith.constant 36864 : i32 loc(#loc279)
+      %tmp50_39 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc279)
+      %tmp50_40 = arith.muli %tmp50_39, %x1_15 : tensor<64x1xi32> loc(#loc279)
+      %tmp50_41 = tt.broadcast %tmp50_40 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc280)
+      %tmp50_42 = arith.addi %tmp50_36, %tmp50_41 : tensor<64x4xi32> loc(#loc280)
+      %tmp50_43 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc281)
+      %tmp50_44 = tt.addptr %tmp50_43, %tmp50_42 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc281)
+      %tmp50_45 = arith.constant 0.000000e+00 : f32 loc(#loc282)
+      %tmp50_46 = tt.broadcast %r0_mask_24 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc282)
+      %tmp50_47 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc282)
+      %tmp50_48 = arith.truncf %tmp50_47 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc282)
+      %tmp50_49 = tt.load %tmp50_44, %tmp50_46, %tmp50_48 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>> loc(#loc282)
+      %tmp50_50 = arith.extf %tmp50_49 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc283)
+      %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x4x!tt.ptr<bf16>> loc(#loc284)
+      %tmp58_51 = tt.addptr %tmp58, %r0_index_23 : tensor<1x4x!tt.ptr<bf16>>, tensor<1x4xi32> loc(#loc284)
+      %tmp58_52 = arith.constant 0.000000e+00 : f32 loc(#loc285)
+      %tmp58_53 = arith.constant dense<0.000000e+00> : tensor<1x4xf32> loc(#loc285)
+      %tmp58_54 = arith.truncf %tmp58_53 : tensor<1x4xf32> to tensor<1x4xbf16> loc(#loc285)
+      %tmp58_55 = tt.load %tmp58_51, %r0_mask_24, %tmp58_54 evictionPolicy = evict_last : tensor<1x4x!tt.ptr<bf16>> loc(#loc285)
+      %tmp58_56 = arith.extf %tmp58_55 : tensor<1x4xbf16> to tensor<1x4xf32> loc(#loc286)
+      %tmp63 = arith.constant 128 : i32 loc(#loc287)
+      %tmp63_57 = arith.constant 128 : i32 loc(#loc287)
+      %tmp63_58 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc287)
+      %tmp63_59 = arith.muli %tmp63_58, %x1_15 : tensor<64x1xi32> loc(#loc287)
+      %tmp63_60 = tt.broadcast %r0_index_23 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc288)
+      %tmp63_61 = tt.broadcast %tmp63_59 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc288)
+      %tmp63_62 = arith.addi %tmp63_60, %tmp63_61 : tensor<64x4xi32> loc(#loc288)
+      %tmp63_63 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<64x4x!tt.ptr<f32>> loc(#loc289)
+      %tmp63_64 = tt.addptr %tmp63_63, %tmp63_62 : tensor<64x4x!tt.ptr<f32>>, tensor<64x4xi32> loc(#loc289)
+      %tmp63_65 = arith.constant 0.000000e+00 : f32 loc(#loc290)
+      %tmp63_66 = tt.broadcast %r0_mask_24 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc290)
+      %tmp63_67 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc290)
+      %tmp63_68 = tt.load %tmp63_64, %tmp63_66, %tmp63_67 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<f32>> loc(#loc290)
+      %tmp66 = arith.constant 128 : i32 loc(#loc291)
+      %tmp66_69 = arith.constant 128 : i32 loc(#loc291)
+      %tmp66_70 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc291)
+      %tmp66_71 = arith.muli %tmp66_70, %x1_15 : tensor<64x1xi32> loc(#loc291)
+      %tmp66_72 = tt.broadcast %r0_index_23 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc292)
+      %tmp66_73 = tt.broadcast %tmp66_71 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc292)
+      %tmp66_74 = arith.addi %tmp66_72, %tmp66_73 : tensor<64x4xi32> loc(#loc292)
+      %tmp66_75 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<64x4x!tt.ptr<f32>> loc(#loc293)
+      %tmp66_76 = tt.addptr %tmp66_75, %tmp66_74 : tensor<64x4x!tt.ptr<f32>>, tensor<64x4xi32> loc(#loc293)
+      %tmp66_77 = arith.constant 0.000000e+00 : f32 loc(#loc294)
+      %tmp66_78 = tt.broadcast %r0_mask_24 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc294)
+      %tmp66_79 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc294)
+      %tmp66_80 = tt.load %tmp66_76, %tmp66_78, %tmp66_79 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<f32>> loc(#loc294)
+      %tmp96 = arith.constant 4096 : i32 loc(#loc295)
+      %tmp96_81 = arith.constant 4096 : i32 loc(#loc295)
+      %tmp96_82 = arith.constant dense<4096> : tensor<1x4xi32> loc(#loc295)
+      %tmp96_83 = arith.addi %tmp96_82, %r0_index_23 : tensor<1x4xi32> loc(#loc295)
+      %tmp96_84 = arith.constant 128 : i32 loc(#loc296)
+      %tmp96_85 = arith.constant 128 : i32 loc(#loc296)
+      %tmp96_86 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc296)
+      %tmp96_87 = arith.muli %tmp96_86, %x0_12 : tensor<64x1xi32> loc(#loc296)
+      %tmp96_88 = tt.broadcast %tmp96_83 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc297)
+      %tmp96_89 = tt.broadcast %tmp96_87 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc297)
+      %tmp96_90 = arith.addi %tmp96_88, %tmp96_89 : tensor<64x4xi32> loc(#loc297)
+      %tmp96_91 = arith.constant 36864 : i32 loc(#loc298)
+      %tmp96_92 = arith.constant 36864 : i32 loc(#loc298)
+      %tmp96_93 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc298)
+      %tmp96_94 = arith.muli %tmp96_93, %x1_15 : tensor<64x1xi32> loc(#loc298)
+      %tmp96_95 = tt.broadcast %tmp96_94 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc299)
+      %tmp96_96 = arith.addi %tmp96_90, %tmp96_95 : tensor<64x4xi32> loc(#loc299)
+      %tmp96_97 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc300)
+      %tmp96_98 = tt.addptr %tmp96_97, %tmp96_96 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc300)
+      %tmp96_99 = arith.constant 0.000000e+00 : f32 loc(#loc301)
+      %tmp96_100 = tt.broadcast %r0_mask_24 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc301)
+      %tmp96_101 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc301)
+      %tmp96_102 = arith.truncf %tmp96_101 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc301)
+      %tmp96_103 = tt.load %tmp96_98, %tmp96_100, %tmp96_102 evictionPolicy = evict_first : tensor<64x4x!tt.ptr<bf16>> loc(#loc301)
+      %tmp96_104 = arith.extf %tmp96_103 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc302)
+      %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x4x!tt.ptr<bf16>> loc(#loc303)
+      %tmp102_105 = tt.addptr %tmp102, %r0_index_23 : tensor<1x4x!tt.ptr<bf16>>, tensor<1x4xi32> loc(#loc303)
+      %tmp102_106 = arith.constant 0.000000e+00 : f32 loc(#loc304)
+      %tmp102_107 = arith.constant dense<0.000000e+00> : tensor<1x4xf32> loc(#loc304)
+      %tmp102_108 = arith.truncf %tmp102_107 : tensor<1x4xf32> to tensor<1x4xbf16> loc(#loc304)
+      %tmp102_109 = tt.load %tmp102_105, %r0_mask_24, %tmp102_108 evictionPolicy = evict_last : tensor<1x4x!tt.ptr<bf16>> loc(#loc304)
+      %tmp102_110 = arith.extf %tmp102_109 : tensor<1x4xbf16> to tensor<1x4xf32> loc(#loc305)
+      %tmp13 = arith.constant 0 : i64 loc(#loc306)
+      %tmp13_111 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc306)
+      %tmp14 = arith.extsi %r0_3_27 : tensor<1x4xi32> to tensor<1x4xi64> loc(#loc307)
+      %tmp14_112 = arith.constant dense<0> : tensor<1x4xi64> loc(#loc307)
+      %tmp14_113 = arith.cmpi sge, %tmp14, %tmp14_112 : tensor<1x4xi64> loc(#loc307)
+      %tmp15 = arith.constant 1 : i64 loc(#loc308)
+      %tmp15_114 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc308)
+      %tmp16 = arith.extsi %r0_3_27 : tensor<1x4xi32> to tensor<1x4xi64> loc(#loc309)
+      %tmp16_115 = arith.constant dense<1> : tensor<1x4xi64> loc(#loc309)
+      %tmp16_116 = arith.cmpi slt, %tmp16, %tmp16_115 : tensor<1x4xi64> loc(#loc309)
+      %tmp17 = arith.constant 2 : i32 loc(#loc310)
+      %tmp17_117 = arith.constant 2 : i32 loc(#loc310)
+      %tmp17_118 = arith.constant dense<2> : tensor<1x4xi32> loc(#loc310)
+      %tmp17_119 = arith.muli %tmp17_118, %r0_4_30 : tensor<1x4xi32> loc(#loc310)
+      %tmp17_120 = arith.constant 1 : i32 loc(#loc311)
+      %tmp17_121 = arith.constant 1 : i32 loc(#loc311)
+      %tmp17_122 = arith.constant dense<1> : tensor<1x4xi32> loc(#loc311)
+      %tmp17_123 = arith.addi %tmp17_122, %tmp17_119 : tensor<1x4xi32> loc(#loc311)
+      %tmp17_124 = arith.constant 128 : i32 loc(#loc312)
+      %tmp17_125 = arith.constant 128 : i32 loc(#loc312)
+      %tmp17_126 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc312)
+      %tmp17_127 = arith.muli %tmp17_126, %x0_12 : tensor<64x1xi32> loc(#loc312)
+      %tmp17_128 = tt.broadcast %tmp17_123 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc313)
+      %tmp17_129 = tt.broadcast %tmp17_127 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc313)
+      %tmp17_130 = arith.addi %tmp17_128, %tmp17_129 : tensor<64x4xi32> loc(#loc313)
+      %tmp17_131 = arith.constant 36864 : i32 loc(#loc314)
+      %tmp17_132 = arith.constant 36864 : i32 loc(#loc314)
+      %tmp17_133 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc314)
+      %tmp17_134 = arith.muli %tmp17_133, %x1_15 : tensor<64x1xi32> loc(#loc314)
+      %tmp17_135 = tt.broadcast %tmp17_134 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc315)
+      %tmp17_136 = arith.addi %tmp17_130, %tmp17_135 : tensor<64x4xi32> loc(#loc315)
+      %tmp17_137 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc316)
+      %tmp17_138 = tt.addptr %tmp17_137, %tmp17_136 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc316)
+      %tmp17_139 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x4xi1> loc(#loc317)
+      %tmp17_140 = arith.constant 0.000000e+00 : f32 loc(#loc318)
+      %tmp17_141 = tt.broadcast %tmp17_139 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc318)
+      %tmp17_142 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc318)
+      %tmp17_143 = arith.truncf %tmp17_142 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc318)
+      %tmp17_144 = tt.load %tmp17_138, %tmp17_141, %tmp17_143 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>> loc(#loc318)
+      %tmp17_145 = arith.extf %tmp17_144 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc319)
+      %tmp19 = arith.constant 1.280000e+02 : f32 loc(#loc320)
+      %tmp20 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc321)
+      %tmp20_146 = arith.divf %tmp10_20, %tmp20 : tensor<64x1xf32> loc(#loc321)
+      %tmp21 = arith.constant 9.99999997E-7 : f32 loc(#loc322)
+      %tmp22 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc323)
+      %tmp22_147 = arith.addf %tmp20_146, %tmp22 : tensor<64x1xf32> loc(#loc323)
+      %tmp23 = tt.extern_elementwise %tmp22_147 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc324)
+      %tmp24 = tt.broadcast %tmp23 : tensor<64x1xf32> -> tensor<64x4xf32> loc(#loc325)
+      %tmp24_148 = arith.mulf %tmp17_145, %tmp24 : tensor<64x4xf32> loc(#loc325)
+      %tmp25 = arith.constant 2 : i32 loc(#loc326)
+      %tmp25_149 = arith.constant 2 : i32 loc(#loc326)
+      %tmp25_150 = arith.constant dense<2> : tensor<1x4xi32> loc(#loc326)
+      %tmp25_151 = arith.muli %tmp25_150, %r0_4_30 : tensor<1x4xi32> loc(#loc326)
+      %tmp25_152 = arith.constant 1 : i32 loc(#loc327)
+      %tmp25_153 = arith.constant 1 : i32 loc(#loc327)
+      %tmp25_154 = arith.constant dense<1> : tensor<1x4xi32> loc(#loc327)
+      %tmp25_155 = arith.addi %tmp25_154, %tmp25_151 : tensor<1x4xi32> loc(#loc327)
+      %tmp25_156 = tt.broadcast %tmp25_155 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc328)
+      %tmp25_157 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc329)
+      %tmp25_158 = tt.addptr %tmp25_157, %tmp25_156 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc329)
+      %tmp25_159 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x4xi1> loc(#loc330)
+      %tmp25_160 = arith.constant 0.000000e+00 : f32 loc(#loc331)
+      %tmp25_161 = tt.broadcast %tmp25_159 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc331)
+      %tmp25_162 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc331)
+      %tmp25_163 = arith.truncf %tmp25_162 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc331)
+      %tmp25_164 = tt.load %tmp25_158, %tmp25_161, %tmp25_163 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>> loc(#loc331)
+      %tmp25_165 = arith.extf %tmp25_164 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc332)
+      %tmp27 = arith.mulf %tmp24_148, %tmp25_165 : tensor<64x4xf32> loc(#loc333)
+      %tmp29 = arith.constant 0.000000e+00 : f32 loc(#loc334)
+      %tmp29_166 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc334)
+      %tmp29_167 = arith.subf %tmp29_166, %tmp27 : tensor<64x4xf32> loc(#loc334)
+      %tmp30 = arith.constant 0.000000e+00 : f32 loc(#loc335)
+      %tmp30_168 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc335)
+      %tmp31 = tt.broadcast %tmp16_116 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc336)
+      %tmp31_169 = arith.select %tmp31, %tmp29_167, %tmp30_168 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc336)
+      %tmp32 = arith.extsi %r0_3_27 : tensor<1x4xi32> to tensor<1x4xi64> loc(#loc337)
+      %tmp32_170 = arith.constant dense<1> : tensor<1x4xi64> loc(#loc337)
+      %tmp32_171 = arith.cmpi sge, %tmp32, %tmp32_170 : tensor<1x4xi64> loc(#loc337)
+      %tmp33 = arith.constant 2 : i64 loc(#loc338)
+      %tmp33_172 = arith.constant dense<2> : tensor<1x1xi64> loc(#loc338)
+      %tmp34 = arith.extsi %r0_3_27 : tensor<1x4xi32> to tensor<1x4xi64> loc(#loc339)
+      %tmp34_173 = arith.constant dense<2> : tensor<1x4xi64> loc(#loc339)
+      %tmp34_174 = arith.cmpi slt, %tmp34, %tmp34_173 : tensor<1x4xi64> loc(#loc339)
+      %tmp35 = arith.constant 2 : i32 loc(#loc340)
+      %tmp35_175 = arith.constant 2 : i32 loc(#loc340)
+      %tmp35_176 = arith.constant dense<2> : tensor<1x4xi32> loc(#loc340)
+      %tmp35_177 = arith.muli %tmp35_176, %r0_4_30 : tensor<1x4xi32> loc(#loc340)
+      %tmp35_178 = arith.constant 128 : i32 loc(#loc341)
+      %tmp35_179 = arith.constant 128 : i32 loc(#loc341)
+      %tmp35_180 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc341)
+      %tmp35_181 = arith.muli %tmp35_180, %x0_12 : tensor<64x1xi32> loc(#loc341)
+      %tmp35_182 = tt.broadcast %tmp35_177 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc342)
+      %tmp35_183 = tt.broadcast %tmp35_181 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc342)
+      %tmp35_184 = arith.addi %tmp35_182, %tmp35_183 : tensor<64x4xi32> loc(#loc342)
+      %tmp35_185 = arith.constant 36864 : i32 loc(#loc343)
+      %tmp35_186 = arith.constant 36864 : i32 loc(#loc343)
+      %tmp35_187 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc343)
+      %tmp35_188 = arith.muli %tmp35_187, %x1_15 : tensor<64x1xi32> loc(#loc343)
+      %tmp35_189 = tt.broadcast %tmp35_188 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc344)
+      %tmp35_190 = arith.addi %tmp35_184, %tmp35_189 : tensor<64x4xi32> loc(#loc344)
+      %tmp35_191 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc345)
+      %tmp35_192 = tt.addptr %tmp35_191, %tmp35_190 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc345)
+      %tmp35_193 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x4xi1> loc(#loc346)
+      %tmp35_194 = arith.constant 0.000000e+00 : f32 loc(#loc347)
+      %tmp35_195 = tt.broadcast %tmp35_193 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc347)
+      %tmp35_196 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc347)
+      %tmp35_197 = arith.truncf %tmp35_196 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc347)
+      %tmp35_198 = tt.load %tmp35_192, %tmp35_195, %tmp35_197 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>> loc(#loc347)
+      %tmp35_199 = arith.extf %tmp35_198 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc348)
+      %tmp37 = arith.constant 1.280000e+02 : f32 loc(#loc349)
+      %tmp38 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc350)
+      %tmp38_200 = arith.divf %tmp10_20, %tmp38 : tensor<64x1xf32> loc(#loc350)
+      %tmp39 = arith.constant 9.99999997E-7 : f32 loc(#loc351)
+      %tmp40 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc352)
+      %tmp40_201 = arith.addf %tmp38_200, %tmp40 : tensor<64x1xf32> loc(#loc352)
+      %tmp41 = tt.extern_elementwise %tmp40_201 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc353)
+      %tmp42 = tt.broadcast %tmp41 : tensor<64x1xf32> -> tensor<64x4xf32> loc(#loc354)
+      %tmp42_202 = arith.mulf %tmp35_199, %tmp42 : tensor<64x4xf32> loc(#loc354)
+      %tmp43 = arith.constant 2 : i32 loc(#loc355)
+      %tmp43_203 = arith.constant 2 : i32 loc(#loc355)
+      %tmp43_204 = arith.constant dense<2> : tensor<1x4xi32> loc(#loc355)
+      %tmp43_205 = arith.muli %tmp43_204, %r0_4_30 : tensor<1x4xi32> loc(#loc355)
+      %tmp43_206 = tt.broadcast %tmp43_205 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc356)
+      %tmp43_207 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc357)
+      %tmp43_208 = tt.addptr %tmp43_207, %tmp43_206 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc357)
+      %tmp43_209 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x4xi1> loc(#loc358)
+      %tmp43_210 = arith.constant 0.000000e+00 : f32 loc(#loc359)
+      %tmp43_211 = tt.broadcast %tmp43_209 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc359)
+      %tmp43_212 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc359)
+      %tmp43_213 = arith.truncf %tmp43_212 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc359)
+      %tmp43_214 = tt.load %tmp43_208, %tmp43_211, %tmp43_213 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>> loc(#loc359)
+      %tmp43_215 = arith.extf %tmp43_214 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc360)
+      %tmp45 = arith.mulf %tmp42_202, %tmp43_215 : tensor<64x4xf32> loc(#loc361)
+      %tmp47 = arith.constant 0.000000e+00 : f32 loc(#loc362)
+      %tmp47_216 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc362)
+      %tmp48 = tt.broadcast %tmp32_171 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc363)
+      %tmp48_217 = arith.select %tmp48, %tmp45, %tmp47_216 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc363)
+      %tmp49 = tt.broadcast %tmp16_116 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc364)
+      %tmp49_218 = arith.select %tmp49, %tmp31_169, %tmp48_217 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc364)
+      %tmp52 = arith.constant 1.280000e+02 : f32 loc(#loc365)
+      %tmp53 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc366)
+      %tmp53_219 = arith.divf %tmp10_20, %tmp53 : tensor<64x1xf32> loc(#loc366)
+      %tmp54 = arith.constant 9.99999997E-7 : f32 loc(#loc367)
+      %tmp55 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc368)
+      %tmp55_220 = arith.addf %tmp53_219, %tmp55 : tensor<64x1xf32> loc(#loc368)
+      %tmp56 = tt.extern_elementwise %tmp55_220 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc369)
+      %tmp57 = tt.broadcast %tmp56 : tensor<64x1xf32> -> tensor<64x4xf32> loc(#loc370)
+      %tmp57_221 = arith.mulf %tmp50_50, %tmp57 : tensor<64x4xf32> loc(#loc370)
+      %tmp60 = tt.broadcast %tmp58_56 : tensor<1x4xf32> -> tensor<64x4xf32> loc(#loc371)
+      %tmp60_222 = arith.mulf %tmp57_221, %tmp60 : tensor<64x4xf32> loc(#loc371)
+      %tmp64 = arith.mulf %tmp60_222, %tmp63_68 : tensor<64x4xf32> loc(#loc372)
+      %tmp67 = arith.mulf %tmp49_218, %tmp66_80 : tensor<64x4xf32> loc(#loc373)
+      %tmp68 = arith.addf %tmp64, %tmp67 : tensor<64x4xf32> loc(#loc374)
+      %tmp70 = arith.constant 2 : i32 loc(#loc375)
+      %tmp70_223 = arith.constant 2 : i32 loc(#loc375)
+      %tmp70_224 = arith.constant dense<2> : tensor<1x4xi32> loc(#loc375)
+      %tmp70_225 = arith.muli %tmp70_224, %r0_4_30 : tensor<1x4xi32> loc(#loc375)
+      %tmp70_226 = arith.constant 4097 : i32 loc(#loc376)
+      %tmp70_227 = arith.constant 4097 : i32 loc(#loc376)
+      %tmp70_228 = arith.constant dense<4097> : tensor<1x4xi32> loc(#loc376)
+      %tmp70_229 = arith.addi %tmp70_228, %tmp70_225 : tensor<1x4xi32> loc(#loc376)
+      %tmp70_230 = arith.constant 128 : i32 loc(#loc377)
+      %tmp70_231 = arith.constant 128 : i32 loc(#loc377)
+      %tmp70_232 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc377)
+      %tmp70_233 = arith.muli %tmp70_232, %x0_12 : tensor<64x1xi32> loc(#loc377)
+      %tmp70_234 = tt.broadcast %tmp70_229 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc378)
+      %tmp70_235 = tt.broadcast %tmp70_233 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc378)
+      %tmp70_236 = arith.addi %tmp70_234, %tmp70_235 : tensor<64x4xi32> loc(#loc378)
+      %tmp70_237 = arith.constant 36864 : i32 loc(#loc379)
+      %tmp70_238 = arith.constant 36864 : i32 loc(#loc379)
+      %tmp70_239 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc379)
+      %tmp70_240 = arith.muli %tmp70_239, %x1_15 : tensor<64x1xi32> loc(#loc379)
+      %tmp70_241 = tt.broadcast %tmp70_240 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc380)
+      %tmp70_242 = arith.addi %tmp70_236, %tmp70_241 : tensor<64x4xi32> loc(#loc380)
+      %tmp70_243 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc381)
+      %tmp70_244 = tt.addptr %tmp70_243, %tmp70_242 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc381)
+      %tmp70_245 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x4xi1> loc(#loc382)
+      %tmp70_246 = arith.constant 0.000000e+00 : f32 loc(#loc383)
+      %tmp70_247 = tt.broadcast %tmp70_245 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc383)
+      %tmp70_248 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc383)
+      %tmp70_249 = arith.truncf %tmp70_248 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc383)
+      %tmp70_250 = tt.load %tmp70_244, %tmp70_247, %tmp70_249 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>> loc(#loc383)
+      %tmp70_251 = arith.extf %tmp70_250 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc384)
+      %tmp72 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc385)
+      %tmp72_252 = arith.divf %tmp4_19, %tmp72 : tensor<64x1xf32> loc(#loc385)
+      %tmp73 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc386)
+      %tmp73_253 = arith.addf %tmp72_252, %tmp73 : tensor<64x1xf32> loc(#loc386)
+      %tmp74 = tt.extern_elementwise %tmp73_253 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc387)
+      %tmp75 = tt.broadcast %tmp74 : tensor<64x1xf32> -> tensor<64x4xf32> loc(#loc388)
+      %tmp75_254 = arith.mulf %tmp70_251, %tmp75 : tensor<64x4xf32> loc(#loc388)
+      %tmp76 = arith.constant 2 : i32 loc(#loc389)
+      %tmp76_255 = arith.constant 2 : i32 loc(#loc389)
+      %tmp76_256 = arith.constant dense<2> : tensor<1x4xi32> loc(#loc389)
+      %tmp76_257 = arith.muli %tmp76_256, %r0_4_30 : tensor<1x4xi32> loc(#loc389)
+      %tmp76_258 = arith.constant 1 : i32 loc(#loc390)
+      %tmp76_259 = arith.constant 1 : i32 loc(#loc390)
+      %tmp76_260 = arith.constant dense<1> : tensor<1x4xi32> loc(#loc390)
+      %tmp76_261 = arith.addi %tmp76_260, %tmp76_257 : tensor<1x4xi32> loc(#loc390)
+      %tmp76_262 = tt.broadcast %tmp76_261 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc391)
+      %tmp76_263 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc392)
+      %tmp76_264 = tt.addptr %tmp76_263, %tmp76_262 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc392)
+      %tmp76_265 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x4xi1> loc(#loc393)
+      %tmp76_266 = arith.constant 0.000000e+00 : f32 loc(#loc394)
+      %tmp76_267 = tt.broadcast %tmp76_265 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc394)
+      %tmp76_268 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc394)
+      %tmp76_269 = arith.truncf %tmp76_268 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc394)
+      %tmp76_270 = tt.load %tmp76_264, %tmp76_267, %tmp76_269 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>> loc(#loc394)
+      %tmp76_271 = arith.extf %tmp76_270 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc395)
+      %tmp78 = arith.mulf %tmp75_254, %tmp76_271 : tensor<64x4xf32> loc(#loc396)
+      %tmp80 = arith.constant 0.000000e+00 : f32 loc(#loc397)
+      %tmp80_272 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc397)
+      %tmp80_273 = arith.subf %tmp80_272, %tmp78 : tensor<64x4xf32> loc(#loc397)
+      %tmp81 = arith.constant 0.000000e+00 : f32 loc(#loc398)
+      %tmp81_274 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc398)
+      %tmp82 = tt.broadcast %tmp16_116 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc399)
+      %tmp82_275 = arith.select %tmp82, %tmp80_273, %tmp81_274 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc399)
+      %tmp83 = arith.constant 2 : i32 loc(#loc400)
+      %tmp83_276 = arith.constant 2 : i32 loc(#loc400)
+      %tmp83_277 = arith.constant dense<2> : tensor<1x4xi32> loc(#loc400)
+      %tmp83_278 = arith.muli %tmp83_277, %r0_4_30 : tensor<1x4xi32> loc(#loc400)
+      %tmp83_279 = arith.constant 4096 : i32 loc(#loc401)
+      %tmp83_280 = arith.constant 4096 : i32 loc(#loc401)
+      %tmp83_281 = arith.constant dense<4096> : tensor<1x4xi32> loc(#loc401)
+      %tmp83_282 = arith.addi %tmp83_281, %tmp83_278 : tensor<1x4xi32> loc(#loc401)
+      %tmp83_283 = arith.constant 128 : i32 loc(#loc402)
+      %tmp83_284 = arith.constant 128 : i32 loc(#loc402)
+      %tmp83_285 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc402)
+      %tmp83_286 = arith.muli %tmp83_285, %x0_12 : tensor<64x1xi32> loc(#loc402)
+      %tmp83_287 = tt.broadcast %tmp83_282 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc403)
+      %tmp83_288 = tt.broadcast %tmp83_286 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc403)
+      %tmp83_289 = arith.addi %tmp83_287, %tmp83_288 : tensor<64x4xi32> loc(#loc403)
+      %tmp83_290 = arith.constant 36864 : i32 loc(#loc404)
+      %tmp83_291 = arith.constant 36864 : i32 loc(#loc404)
+      %tmp83_292 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc404)
+      %tmp83_293 = arith.muli %tmp83_292, %x1_15 : tensor<64x1xi32> loc(#loc404)
+      %tmp83_294 = tt.broadcast %tmp83_293 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc405)
+      %tmp83_295 = arith.addi %tmp83_289, %tmp83_294 : tensor<64x4xi32> loc(#loc405)
+      %tmp83_296 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc406)
+      %tmp83_297 = tt.addptr %tmp83_296, %tmp83_295 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc406)
+      %tmp83_298 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x4xi1> loc(#loc407)
+      %tmp83_299 = arith.constant 0.000000e+00 : f32 loc(#loc408)
+      %tmp83_300 = tt.broadcast %tmp83_298 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc408)
+      %tmp83_301 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc408)
+      %tmp83_302 = arith.truncf %tmp83_301 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc408)
+      %tmp83_303 = tt.load %tmp83_297, %tmp83_300, %tmp83_302 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>> loc(#loc408)
+      %tmp83_304 = arith.extf %tmp83_303 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc409)
+      %tmp85 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc410)
+      %tmp85_305 = arith.divf %tmp4_19, %tmp85 : tensor<64x1xf32> loc(#loc410)
+      %tmp86 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc411)
+      %tmp86_306 = arith.addf %tmp85_305, %tmp86 : tensor<64x1xf32> loc(#loc411)
+      %tmp87 = tt.extern_elementwise %tmp86_306 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc412)
+      %tmp88 = tt.broadcast %tmp87 : tensor<64x1xf32> -> tensor<64x4xf32> loc(#loc413)
+      %tmp88_307 = arith.mulf %tmp83_304, %tmp88 : tensor<64x4xf32> loc(#loc413)
+      %tmp89 = arith.constant 2 : i32 loc(#loc414)
+      %tmp89_308 = arith.constant 2 : i32 loc(#loc414)
+      %tmp89_309 = arith.constant dense<2> : tensor<1x4xi32> loc(#loc414)
+      %tmp89_310 = arith.muli %tmp89_309, %r0_4_30 : tensor<1x4xi32> loc(#loc414)
+      %tmp89_311 = tt.broadcast %tmp89_310 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc415)
+      %tmp89_312 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc416)
+      %tmp89_313 = tt.addptr %tmp89_312, %tmp89_311 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc416)
+      %tmp89_314 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x4xi1> loc(#loc417)
+      %tmp89_315 = arith.constant 0.000000e+00 : f32 loc(#loc418)
+      %tmp89_316 = tt.broadcast %tmp89_314 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc418)
+      %tmp89_317 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc418)
+      %tmp89_318 = arith.truncf %tmp89_317 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc418)
+      %tmp89_319 = tt.load %tmp89_313, %tmp89_316, %tmp89_318 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>> loc(#loc418)
+      %tmp89_320 = arith.extf %tmp89_319 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc419)
+      %tmp91 = arith.mulf %tmp88_307, %tmp89_320 : tensor<64x4xf32> loc(#loc420)
+      %tmp93 = arith.constant 0.000000e+00 : f32 loc(#loc421)
+      %tmp93_321 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc421)
+      %tmp94 = tt.broadcast %tmp32_171 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc422)
+      %tmp94_322 = arith.select %tmp94, %tmp91, %tmp93_321 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc422)
+      %tmp95 = tt.broadcast %tmp16_116 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc423)
+      %tmp95_323 = arith.select %tmp95, %tmp82_275, %tmp94_322 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc423)
+      %tmp98 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc424)
+      %tmp98_324 = arith.divf %tmp4_19, %tmp98 : tensor<64x1xf32> loc(#loc424)
+      %tmp99 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc425)
+      %tmp99_325 = arith.addf %tmp98_324, %tmp99 : tensor<64x1xf32> loc(#loc425)
+      %tmp100 = tt.extern_elementwise %tmp99_325 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc426)
+      %tmp101 = tt.broadcast %tmp100 : tensor<64x1xf32> -> tensor<64x4xf32> loc(#loc427)
+      %tmp101_326 = arith.mulf %tmp96_104, %tmp101 : tensor<64x4xf32> loc(#loc427)
+      %tmp104 = tt.broadcast %tmp102_110 : tensor<1x4xf32> -> tensor<64x4xf32> loc(#loc428)
+      %tmp104_327 = arith.mulf %tmp101_326, %tmp104 : tensor<64x4xf32> loc(#loc428)
+      %tmp107 = arith.mulf %tmp104_327, %tmp63_68 : tensor<64x4xf32> loc(#loc429)
+      %tmp109 = arith.mulf %tmp95_323, %tmp66_80 : tensor<64x4xf32> loc(#loc430)
+      %tmp110 = arith.addf %tmp107, %tmp109 : tensor<64x4xf32> loc(#loc431)
+      %c128_i32 = arith.constant 128 : i32 loc(#loc204)
+      %c128_i32_328 = arith.constant 128 : i32 loc(#loc204)
+      %cst = arith.constant dense<128> : tensor<64x1xi32> loc(#loc204)
+      %8 = arith.muli %cst, %xindex_7 : tensor<64x1xi32> loc(#loc204)
+      %9 = tt.broadcast %r0_index_23 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc205)
+      %10 = tt.broadcast %8 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc205)
+      %11 = arith.addi %9, %10 : tensor<64x4xi32> loc(#loc205)
+      %12 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc206)
+      %13 = tt.addptr %12, %11 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc206)
+      %14 = tt.broadcast %r0_mask_24 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc207)
+      %15 = arith.truncf %tmp68 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc207)
+      tt.store %13, %15, %14 : tensor<64x4x!tt.ptr<bf16>> loc(#loc207)
+      %c128_i32_329 = arith.constant 128 : i32 loc(#loc208)
+      %c128_i32_330 = arith.constant 128 : i32 loc(#loc208)
+      %cst_331 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc208)
+      %16 = arith.muli %cst_331, %xindex_7 : tensor<64x1xi32> loc(#loc208)
+      %17 = tt.broadcast %r0_index_23 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc209)
+      %18 = tt.broadcast %16 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc209)
+      %19 = arith.addi %17, %18 : tensor<64x4xi32> loc(#loc209)
+      %20 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc210)
+      %21 = tt.addptr %20, %19 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc210)
+      %22 = tt.broadcast %r0_mask_24 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc211)
+      %23 = arith.truncf %tmp110 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc211)
+      tt.store %21, %23, %22 : tensor<64x4x!tt.ptr<bf16>> loc(#loc211)
+    } loc(#loc44)
+    tt.return loc(#loc212)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.sum__fp32S64_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x4xf32> loc("input"(#loc213))) -> tensor<64xf32> attributes {noinline = false} {
+    %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc214)
+      tt.reduce.return %2 : f32 loc(#loc214)
+    }) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc214)
+    tt.return %0 : tensor<64xf32> loc(#loc216)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<64xf32> loc(#loc217)
+    tt.return %1 : tensor<64xf32> loc(#loc217)
+  } loc(#loc213)
+  tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc218)), %b: f32 loc("b"(#loc218))) -> f32 attributes {noinline = false} {
+    %0 = arith.addf %a, %b : f32 loc(#loc219)
+    tt.return %0 : f32 loc(#loc220)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : f32 loc(#loc221)
+    tt.return %1 : f32 loc(#loc221)
+  } loc(#loc218)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":25:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":30:43)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":32:44)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:45)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:56)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:46)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:42)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:53)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:64)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":68:35)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":69:25)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":70:35)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:52)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:63)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":74:16)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":76:16)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:57)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:55)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:63)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:95)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":85:42)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":88:35)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":89:24)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:37)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:48)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:59)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":92:16)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":93:25)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":94:16)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":95:24)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":96:32)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:53)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:59)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:91)
+#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":102:42)
+#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":106:16)
+#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":107:25)
+#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":108:16)
+#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":109:24)
+#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":110:32)
+#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:44)
+#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc149 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:55)
+#loc150 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc151 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:66)
+#loc152 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc153 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc154 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:81)
+#loc155 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc156 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc157 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc158 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc159 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc160 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc161 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:57)
+#loc162 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:55)
+#loc163 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:63)
+#loc164 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc165 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:95)
+#loc166 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc167 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc168 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc169 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc170 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":132:42)
+#loc171 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc172 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:44)
+#loc173 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc174 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:55)
+#loc175 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc176 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:66)
+#loc177 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc178 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc179 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:81)
+#loc180 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc181 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc182 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":136:24)
+#loc183 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":137:24)
+#loc184 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":138:32)
+#loc185 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc186 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:53)
+#loc187 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:59)
+#loc188 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc189 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:91)
+#loc190 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc191 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc192 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc193 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":144:42)
+#loc194 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc195 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc196 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":148:24)
+#loc197 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":149:24)
+#loc198 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":150:33)
+#loc199 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc200 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc201 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc202 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc203 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc204 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc205 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc206 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc207 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc208 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:43)
+#loc209 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:39)
+#loc210 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc211 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc212 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc214 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc216 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11)
+#loc217 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4)
+#loc219 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc220 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11)
+#loc221 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4)
+#loc231 = loc("xnumel"(#loc1))
+#loc232 = loc("r0_numel"(#loc2))
+#loc233 = loc("xoffset"(#loc3))
+#loc234 = loc("xoffset"(#loc4))
+#loc235 = loc("xindex"(#loc5))
+#loc236 = loc("xindex"(#loc6))
+#loc237 = loc("xindex"(#loc7))
+#loc238 = loc("xmask"(#loc8))
+#loc239 = loc("r0_base"(#loc9))
+#loc240 = loc("r0_base"(#loc10))
+#loc241 = loc("x0"(#loc11))
+#loc242 = loc("x1"(#loc12))
+#loc243 = loc("_tmp4"(#loc13))
+#loc244 = loc("_tmp10"(#loc14))
+#loc245 = loc("_tmp4"(#loc15))
+#loc246 = loc("r0_index"(#loc16))
+#loc247 = loc("r0_mask"(#loc17))
+#loc248 = loc("tmp0"(#loc18))
+#loc249 = loc("tmp0"(#loc19))
+#loc250 = loc("tmp0"(#loc20))
+#loc251 = loc("tmp0"(#loc21))
+#loc252 = loc("tmp0"(#loc22))
+#loc253 = loc("tmp0"(#loc23))
+#loc254 = loc("tmp0"(#loc24))
+#loc255 = loc("tmp0"(#loc25))
+#loc256 = loc("tmp6"(#loc26))
+#loc257 = loc("tmp6"(#loc27))
+#loc258 = loc("tmp6"(#loc28))
+#loc259 = loc("tmp6"(#loc29))
+#loc260 = loc("tmp6"(#loc30))
+#loc261 = loc("tmp6"(#loc31))
+#loc262 = loc("tmp6"(#loc32))
+#loc263 = loc("tmp2"(#loc33))
+#loc264 = loc("tmp5"(#loc34))
+#loc265 = loc("_tmp4"(#loc35))
+#loc266 = loc("tmp8"(#loc36))
+#loc267 = loc("tmp11"(#loc37))
+#loc268 = loc("_tmp10"(#loc38))
+#loc269 = loc("tmp4"(#loc40))
+#loc270 = loc("tmp4"(#loc41))
+#loc271 = loc("tmp10"(#loc42))
+#loc272 = loc("tmp10"(#loc43))
+#loc273 = loc("r0_index"(#loc45))
+#loc274 = loc("r0_mask"(#loc46))
+#loc275 = loc("r0_3"(#loc47))
+#loc276 = loc("r0_4"(#loc48))
+#loc277 = loc("tmp50"(#loc49))
+#loc278 = loc("tmp50"(#loc50))
+#loc279 = loc("tmp50"(#loc51))
+#loc280 = loc("tmp50"(#loc52))
+#loc281 = loc("tmp50"(#loc53))
+#loc282 = loc("tmp50"(#loc54))
+#loc283 = loc("tmp50"(#loc55))
+#loc284 = loc("tmp58"(#loc56))
+#loc285 = loc("tmp58"(#loc57))
+#loc286 = loc("tmp58"(#loc58))
+#loc287 = loc("tmp63"(#loc59))
+#loc288 = loc("tmp63"(#loc60))
+#loc289 = loc("tmp63"(#loc61))
+#loc290 = loc("tmp63"(#loc62))
+#loc291 = loc("tmp66"(#loc63))
+#loc292 = loc("tmp66"(#loc64))
+#loc293 = loc("tmp66"(#loc65))
+#loc294 = loc("tmp66"(#loc66))
+#loc295 = loc("tmp96"(#loc67))
+#loc296 = loc("tmp96"(#loc68))
+#loc297 = loc("tmp96"(#loc69))
+#loc298 = loc("tmp96"(#loc70))
+#loc299 = loc("tmp96"(#loc71))
+#loc300 = loc("tmp96"(#loc72))
+#loc301 = loc("tmp96"(#loc73))
+#loc302 = loc("tmp96"(#loc74))
+#loc303 = loc("tmp102"(#loc75))
+#loc304 = loc("tmp102"(#loc76))
+#loc305 = loc("tmp102"(#loc77))
+#loc306 = loc("tmp13"(#loc78))
+#loc307 = loc("tmp14"(#loc79))
+#loc308 = loc("tmp15"(#loc80))
+#loc309 = loc("tmp16"(#loc81))
+#loc310 = loc("tmp17"(#loc82))
+#loc311 = loc("tmp17"(#loc83))
+#loc312 = loc("tmp17"(#loc84))
+#loc313 = loc("tmp17"(#loc85))
+#loc314 = loc("tmp17"(#loc86))
+#loc315 = loc("tmp17"(#loc87))
+#loc316 = loc("tmp17"(#loc88))
+#loc317 = loc("tmp17"(#loc89))
+#loc318 = loc("tmp17"(#loc90))
+#loc319 = loc("tmp17"(#loc91))
+#loc320 = loc("tmp19"(#loc92))
+#loc321 = loc("tmp20"(#loc93))
+#loc322 = loc("tmp21"(#loc94))
+#loc323 = loc("tmp22"(#loc95))
+#loc324 = loc("tmp23"(#loc96))
+#loc325 = loc("tmp24"(#loc97))
+#loc326 = loc("tmp25"(#loc98))
+#loc327 = loc("tmp25"(#loc99))
+#loc328 = loc("tmp25"(#loc100))
+#loc329 = loc("tmp25"(#loc101))
+#loc330 = loc("tmp25"(#loc102))
+#loc331 = loc("tmp25"(#loc103))
+#loc332 = loc("tmp25"(#loc104))
+#loc333 = loc("tmp27"(#loc105))
+#loc334 = loc("tmp29"(#loc106))
+#loc335 = loc("tmp30"(#loc107))
+#loc336 = loc("tmp31"(#loc108))
+#loc337 = loc("tmp32"(#loc109))
+#loc338 = loc("tmp33"(#loc110))
+#loc339 = loc("tmp34"(#loc111))
+#loc340 = loc("tmp35"(#loc112))
+#loc341 = loc("tmp35"(#loc113))
+#loc342 = loc("tmp35"(#loc114))
+#loc343 = loc("tmp35"(#loc115))
+#loc344 = loc("tmp35"(#loc116))
+#loc345 = loc("tmp35"(#loc117))
+#loc346 = loc("tmp35"(#loc118))
+#loc347 = loc("tmp35"(#loc119))
+#loc348 = loc("tmp35"(#loc120))
+#loc349 = loc("tmp37"(#loc121))
+#loc350 = loc("tmp38"(#loc122))
+#loc351 = loc("tmp39"(#loc123))
+#loc352 = loc("tmp40"(#loc124))
+#loc353 = loc("tmp41"(#loc125))
+#loc354 = loc("tmp42"(#loc126))
+#loc355 = loc("tmp43"(#loc127))
+#loc356 = loc("tmp43"(#loc128))
+#loc357 = loc("tmp43"(#loc129))
+#loc358 = loc("tmp43"(#loc130))
+#loc359 = loc("tmp43"(#loc131))
+#loc360 = loc("tmp43"(#loc132))
+#loc361 = loc("tmp45"(#loc133))
+#loc362 = loc("tmp47"(#loc134))
+#loc363 = loc("tmp48"(#loc135))
+#loc364 = loc("tmp49"(#loc136))
+#loc365 = loc("tmp52"(#loc137))
+#loc366 = loc("tmp53"(#loc138))
+#loc367 = loc("tmp54"(#loc139))
+#loc368 = loc("tmp55"(#loc140))
+#loc369 = loc("tmp56"(#loc141))
+#loc370 = loc("tmp57"(#loc142))
+#loc371 = loc("tmp60"(#loc143))
+#loc372 = loc("tmp64"(#loc144))
+#loc373 = loc("tmp67"(#loc145))
+#loc374 = loc("tmp68"(#loc146))
+#loc375 = loc("tmp70"(#loc147))
+#loc376 = loc("tmp70"(#loc148))
+#loc377 = loc("tmp70"(#loc149))
+#loc378 = loc("tmp70"(#loc150))
+#loc379 = loc("tmp70"(#loc151))
+#loc380 = loc("tmp70"(#loc152))
+#loc381 = loc("tmp70"(#loc153))
+#loc382 = loc("tmp70"(#loc154))
+#loc383 = loc("tmp70"(#loc155))
+#loc384 = loc("tmp70"(#loc156))
+#loc385 = loc("tmp72"(#loc157))
+#loc386 = loc("tmp73"(#loc158))
+#loc387 = loc("tmp74"(#loc159))
+#loc388 = loc("tmp75"(#loc160))
+#loc389 = loc("tmp76"(#loc161))
+#loc390 = loc("tmp76"(#loc162))
+#loc391 = loc("tmp76"(#loc163))
+#loc392 = loc("tmp76"(#loc164))
+#loc393 = loc("tmp76"(#loc165))
+#loc394 = loc("tmp76"(#loc166))
+#loc395 = loc("tmp76"(#loc167))
+#loc396 = loc("tmp78"(#loc168))
+#loc397 = loc("tmp80"(#loc169))
+#loc398 = loc("tmp81"(#loc170))
+#loc399 = loc("tmp82"(#loc171))
+#loc400 = loc("tmp83"(#loc172))
+#loc401 = loc("tmp83"(#loc173))
+#loc402 = loc("tmp83"(#loc174))
+#loc403 = loc("tmp83"(#loc175))
+#loc404 = loc("tmp83"(#loc176))
+#loc405 = loc("tmp83"(#loc177))
+#loc406 = loc("tmp83"(#loc178))
+#loc407 = loc("tmp83"(#loc179))
+#loc408 = loc("tmp83"(#loc180))
+#loc409 = loc("tmp83"(#loc181))
+#loc410 = loc("tmp85"(#loc182))
+#loc411 = loc("tmp86"(#loc183))
+#loc412 = loc("tmp87"(#loc184))
+#loc413 = loc("tmp88"(#loc185))
+#loc414 = loc("tmp89"(#loc186))
+#loc415 = loc("tmp89"(#loc187))
+#loc416 = loc("tmp89"(#loc188))
+#loc417 = loc("tmp89"(#loc189))
+#loc418 = loc("tmp89"(#loc190))
+#loc419 = loc("tmp89"(#loc191))
+#loc420 = loc("tmp91"(#loc192))
+#loc421 = loc("tmp93"(#loc193))
+#loc422 = loc("tmp94"(#loc194))
+#loc423 = loc("tmp95"(#loc195))
+#loc424 = loc("tmp98"(#loc196))
+#loc425 = loc("tmp99"(#loc197))
+#loc426 = loc("tmp100"(#loc198))
+#loc427 = loc("tmp101"(#loc199))
+#loc428 = loc("tmp104"(#loc200))
+#loc429 = loc("tmp107"(#loc201))
+#loc430 = loc("tmp109"(#loc202))
+#loc431 = loc("tmp110"(#loc203))
+#loc435 = loc("_tmp10"(#loc245))
diff --git a/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..f19f7ff2cbbf93184ad759700860ccc97d900c65
--- /dev/null
+++ b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir
@@ -0,0 +1,547 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 4], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc1 = loc(unknown)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc147 = loc("in_out_ptr0"(#loc))
+#loc148 = loc("in_out_ptr1"(#loc))
+#loc149 = loc("in_ptr0"(#loc))
+#loc150 = loc("in_ptr1"(#loc))
+#loc151 = loc("in_ptr2"(#loc))
+#loc152 = loc("in_ptr3"(#loc))
+#loc153 = loc("in_ptr4"(#loc))
+#loc154 = loc("xnumel"(#loc))
+#loc155 = loc("r0_numel"(#loc))
+#loc185 = loc("tmp4"(#loc33))
+#loc187 = loc("tmp10"(#loc36))
+#loc292 = loc(callsite(#loc1 at #loc185))
+#loc294 = loc(callsite(#loc1 at #loc187))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4097> : tensor<1x4xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<1> : tensor<1x4xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<1> : tensor<1x4xi64, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<2> : tensor<1x4xi32, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<36864> : tensor<64x1xi32, #blocked1> loc(#loc1)
+    %cst_4 = arith.constant dense<36864> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %cst_5 = arith.constant dense<128> : tensor<64x1xi32, #blocked1> loc(#loc1)
+    %cst_6 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %cst_7 = arith.constant dense<4096> : tensor<1x4xi32, #blocked> loc(#loc1)
+    %cst_8 = arith.constant dense<128> : tensor<1x4xi32, #blocked> loc(#loc1)
+    %cst_9 = arith.constant dense<32> : tensor<64x1xi32, #blocked1> loc(#loc1)
+    %cst_10 = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %cst_11 = arith.constant dense<0.000000e+00> : tensor<64x4xbf16, #blocked> loc(#loc1)
+    %cst_12 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked1> loc(#loc1)
+    %cst_13 = arith.constant dense<128> : tensor<1x4xi32, #blocked1> loc(#loc1)
+    %cst_14 = arith.constant dense<4096> : tensor<1x4xi32, #blocked1> loc(#loc1)
+    %cst_15 = arith.constant dense<0.000000e+00> : tensor<1x4xbf16, #blocked1> loc(#loc1)
+    %cst_16 = arith.constant dense<0.000000e+00> : tensor<64x4xbf16, #blocked1> loc(#loc1)
+    %c4_i32 = arith.constant 4 : i32 loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %cst_17 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32, #blocked1> loc(#loc1)
+    %cst_18 = arith.constant dense<1.280000e+02> : tensor<64x1xf32, #blocked1> loc(#loc1)
+    %cst_19 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc156)
+    %xoffset_20 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc157)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc158)
+    %xindex_21 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc158)
+    %xindex_22 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc158)
+    %xindex_23 = tt.expand_dims %xindex_21 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc158)
+    %xindex_24 = tt.splat %xoffset_20 : i32 -> tensor<64x1xi32, #blocked> loc(#loc159)
+    %xindex_25 = tt.splat %xoffset_20 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc159)
+    %xindex_26 = arith.addi %xindex_24, %xindex_22 : tensor<64x1xi32, #blocked> loc(#loc159)
+    %xindex_27 = arith.addi %xindex_25, %xindex_23 : tensor<64x1xi32, #blocked1> loc(#loc159)
+    %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc160)
+    %r0_base_28 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc160)
+    %r0_base_29 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4xi32, #blocked> loc(#loc160)
+    %r0_base_30 = tt.expand_dims %r0_base_28 {axis = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x4xi32, #blocked1> loc(#loc160)
+    %x0 = arith.remsi %xindex_26, %cst_10 : tensor<64x1xi32, #blocked> loc(#loc161)
+    %x0_31 = arith.remsi %xindex_27, %cst_9 : tensor<64x1xi32, #blocked1> loc(#loc161)
+    %x1 = arith.divsi %xindex_26, %cst_10 : tensor<64x1xi32, #blocked> loc(#loc162)
+    %x1_32 = arith.divsi %xindex_27, %cst_9 : tensor<64x1xi32, #blocked1> loc(#loc162)
+    %tmp0 = arith.muli %x0_31, %cst_5 : tensor<64x1xi32, #blocked1> loc(#loc163)
+    %tmp0_33 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked1> -> tensor<64x4xi32, #blocked1> loc(#loc164)
+    %tmp0_34 = arith.muli %x1_32, %cst_3 : tensor<64x1xi32, #blocked1> loc(#loc165)
+    %tmp0_35 = tt.broadcast %tmp0_34 : tensor<64x1xi32, #blocked1> -> tensor<64x4xi32, #blocked1> loc(#loc166)
+    %tmp0_36 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>, #blocked1> loc(#loc167)
+    %_tmp10:2 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32 iter_args(%_tmp4 = %cst_12, %_tmp10_51 = %cst_12) -> (tensor<64x4xf32, #blocked1>, tensor<64x4xf32, #blocked1>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32, #blocked1> loc(#loc169)
+      %r0_index_52 = arith.addi %r0_index, %r0_base_30 : tensor<1x4xi32, #blocked1> loc(#loc169)
+      %r0_mask = arith.cmpi slt, %r0_index_52, %cst_13 : tensor<1x4xi32, #blocked1> loc(#loc170)
+      %tmp0_53 = arith.addi %r0_index_52, %cst_14 : tensor<1x4xi32, #blocked1> loc(#loc171)
+      %tmp0_54 = tt.broadcast %tmp0_53 : tensor<1x4xi32, #blocked1> -> tensor<64x4xi32, #blocked1> loc(#loc164)
+      %tmp0_55 = arith.addi %tmp0_54, %tmp0_33 : tensor<64x4xi32, #blocked1> loc(#loc164)
+      %tmp0_56 = arith.addi %tmp0_55, %tmp0_35 : tensor<64x4xi32, #blocked1> loc(#loc166)
+      %tmp0_57 = tt.addptr %tmp0_36, %tmp0_56 : tensor<64x4x!tt.ptr<bf16>, #blocked1>, tensor<64x4xi32, #blocked1> loc(#loc167)
+      %tmp0_58 = tt.broadcast %r0_mask : tensor<1x4xi1, #blocked1> -> tensor<64x4xi1, #blocked1> loc(#loc172)
+      %tmp0_59 = tt.load %tmp0_57, %tmp0_58, %cst_16 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>, #blocked1> loc(#loc172)
+      %tmp0_60 = arith.extf %tmp0_59 : tensor<64x4xbf16, #blocked1> to tensor<64x4xf32, #blocked1> loc(#loc173)
+      %tmp6 = tt.broadcast %r0_index_52 : tensor<1x4xi32, #blocked1> -> tensor<64x4xi32, #blocked1> loc(#loc174)
+      %tmp6_61 = arith.addi %tmp6, %tmp0_33 : tensor<64x4xi32, #blocked1> loc(#loc174)
+      %tmp6_62 = arith.addi %tmp6_61, %tmp0_35 : tensor<64x4xi32, #blocked1> loc(#loc175)
+      %tmp6_63 = tt.addptr %tmp0_36, %tmp6_62 : tensor<64x4x!tt.ptr<bf16>, #blocked1>, tensor<64x4xi32, #blocked1> loc(#loc176)
+      %tmp6_64 = tt.load %tmp6_63, %tmp0_58, %cst_16 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>, #blocked1> loc(#loc177)
+      %tmp6_65 = arith.extf %tmp6_64 : tensor<64x4xbf16, #blocked1> to tensor<64x4xf32, #blocked1> loc(#loc178)
+      %tmp2 = arith.mulf %tmp0_60, %tmp0_60 : tensor<64x4xf32, #blocked1> loc(#loc179)
+      %tmp5 = arith.addf %_tmp4, %tmp2 : tensor<64x4xf32, #blocked1> loc(#loc180)
+      %_tmp4_66 = arith.select %tmp0_58, %tmp5, %_tmp4 : tensor<64x4xi1, #blocked1>, tensor<64x4xf32, #blocked1> loc(#loc181)
+      %tmp8 = arith.mulf %tmp6_65, %tmp6_65 : tensor<64x4xf32, #blocked1> loc(#loc182)
+      %tmp11 = arith.addf %_tmp10_51, %tmp8 : tensor<64x4xf32, #blocked1> loc(#loc183)
+      %_tmp10_67 = arith.select %tmp0_58, %tmp11, %_tmp10_51 : tensor<64x4xi1, #blocked1>, tensor<64x4xf32, #blocked1> loc(#loc184)
+      scf.yield %_tmp4_66, %_tmp10_67 : tensor<64x4xf32, #blocked1>, tensor<64x4xf32, #blocked1> loc(#loc31)
+    } loc(#loc290)
+    %tmp4 = "tt.reduce"(%_tmp10#0) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_51: f32 loc(callsite(#loc1 at #loc185)), %tmp4_52: f32 loc(callsite(#loc1 at #loc185))):
+      %tmp4_53 = arith.addf %tmp4_51, %tmp4_52 : f32 loc(#loc297)
+      tt.reduce.return %tmp4_53 : f32 loc(#loc291)
+    }) : (tensor<64x4xf32, #blocked1>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc291)
+    %tmp4_37 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc186)
+    %tmp10 = "tt.reduce"(%_tmp10#1) <{axis = 1 : i32}> ({
+    ^bb0(%tmp10_51: f32 loc(callsite(#loc1 at #loc187)), %tmp10_52: f32 loc(callsite(#loc1 at #loc187))):
+      %tmp10_53 = arith.addf %tmp10_51, %tmp10_52 : f32 loc(#loc298)
+      tt.reduce.return %tmp10_53 : f32 loc(#loc293)
+    }) : (tensor<64x4xf32, #blocked1>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc293)
+    %tmp10_38 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc188)
+    %tmp50 = arith.muli %x0, %cst_6 : tensor<64x1xi32, #blocked> loc(#loc189)
+    %tmp50_39 = tt.broadcast %tmp50 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc190)
+    %tmp50_40 = arith.muli %x1, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc191)
+    %tmp50_41 = tt.broadcast %tmp50_40 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc192)
+    %tmp50_42 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>, #blocked> loc(#loc193)
+    %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x4x!tt.ptr<bf16>, #blocked> loc(#loc194)
+    %tmp58_43 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x4x!tt.ptr<bf16>, #blocked1> loc(#loc194)
+    %tmp63 = arith.muli %x1_32, %cst_5 : tensor<64x1xi32, #blocked1> loc(#loc195)
+    %tmp63_44 = tt.broadcast %tmp63 : tensor<64x1xi32, #blocked1> -> tensor<64x4xi32, #blocked1> loc(#loc196)
+    %tmp63_45 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<64x4x!tt.ptr<f32>, #blocked1> loc(#loc197)
+    %tmp66 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<64x4x!tt.ptr<f32>, #blocked1> loc(#loc198)
+    %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x4x!tt.ptr<bf16>, #blocked> loc(#loc199)
+    %tmp102_46 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x4x!tt.ptr<bf16>, #blocked1> loc(#loc199)
+    %tmp20 = arith.divf %tmp10_38, %cst_18 : tensor<64x1xf32, #blocked1> loc(#loc200)
+    %tmp22 = arith.addf %tmp20, %cst_17 : tensor<64x1xf32, #blocked1> loc(#loc201)
+    %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked1>) -> tensor<64x1xf32, #blocked1> loc(#loc202)
+    %tmp24 = ttg.convert_layout %tmp23 : tensor<64x1xf32, #blocked1> -> tensor<64x1xf32, #blocked> loc(#loc203)
+    %tmp24_47 = tt.broadcast %tmp24 : tensor<64x1xf32, #blocked> -> tensor<64x4xf32, #blocked> loc(#loc203)
+    %tmp24_48 = tt.broadcast %tmp23 : tensor<64x1xf32, #blocked1> -> tensor<64x4xf32, #blocked1> loc(#loc203)
+    %tmp72 = arith.divf %tmp4_37, %cst_18 : tensor<64x1xf32, #blocked1> loc(#loc204)
+    %tmp73 = arith.addf %tmp72, %cst_17 : tensor<64x1xf32, #blocked1> loc(#loc205)
+    %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked1>) -> tensor<64x1xf32, #blocked1> loc(#loc206)
+    %tmp75 = ttg.convert_layout %tmp74 : tensor<64x1xf32, #blocked1> -> tensor<64x1xf32, #blocked> loc(#loc207)
+    %tmp75_49 = tt.broadcast %tmp75 : tensor<64x1xf32, #blocked> -> tensor<64x4xf32, #blocked> loc(#loc207)
+    %tmp75_50 = tt.broadcast %tmp74 : tensor<64x1xf32, #blocked1> -> tensor<64x4xf32, #blocked1> loc(#loc207)
+    %0 = arith.muli %xindex_27, %cst_5 : tensor<64x1xi32, #blocked1> loc(#loc57)
+    %1 = tt.broadcast %0 : tensor<64x1xi32, #blocked1> -> tensor<64x4xi32, #blocked1> loc(#loc58)
+    %2 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>, #blocked1> loc(#loc59)
+    %3 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>, #blocked1> loc(#loc60)
+    scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32, #blocked> loc(#loc208)
+      %r0_index_51 = tt.splat %r0_offset : i32 -> tensor<1x4xi32, #blocked1> loc(#loc208)
+      %r0_index_52 = arith.addi %r0_index, %r0_base_29 : tensor<1x4xi32, #blocked> loc(#loc208)
+      %r0_index_53 = arith.addi %r0_index_51, %r0_base_30 : tensor<1x4xi32, #blocked1> loc(#loc208)
+      %r0_mask = arith.cmpi slt, %r0_index_52, %cst_8 : tensor<1x4xi32, #blocked> loc(#loc209)
+      %r0_mask_54 = arith.cmpi slt, %r0_index_53, %cst_13 : tensor<1x4xi32, #blocked1> loc(#loc209)
+      %r0_3 = arith.remsi %r0_index_52, %cst_2 : tensor<1x4xi32, #blocked> loc(#loc210)
+      %r0_4 = arith.divsi %r0_index_52, %cst_2 : tensor<1x4xi32, #blocked> loc(#loc211)
+      %tmp50_55 = tt.broadcast %r0_index_53 : tensor<1x4xi32, #blocked1> -> tensor<64x4xi32, #blocked1> loc(#loc190)
+      %tmp50_56 = arith.addi %tmp50_55, %tmp0_33 : tensor<64x4xi32, #blocked1> loc(#loc190)
+      %tmp50_57 = arith.addi %tmp50_56, %tmp0_35 : tensor<64x4xi32, #blocked1> loc(#loc192)
+      %tmp50_58 = tt.addptr %tmp0_36, %tmp50_57 : tensor<64x4x!tt.ptr<bf16>, #blocked1>, tensor<64x4xi32, #blocked1> loc(#loc193)
+      %tmp50_59 = tt.broadcast %r0_mask_54 : tensor<1x4xi1, #blocked1> -> tensor<64x4xi1, #blocked1> loc(#loc212)
+      %tmp50_60 = tt.load %tmp50_58, %tmp50_59, %cst_16 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>, #blocked1> loc(#loc212)
+      %tmp50_61 = arith.extf %tmp50_60 : tensor<64x4xbf16, #blocked1> to tensor<64x4xf32, #blocked1> loc(#loc213)
+      %tmp58_62 = tt.addptr %tmp58_43, %r0_index_53 : tensor<1x4x!tt.ptr<bf16>, #blocked1>, tensor<1x4xi32, #blocked1> loc(#loc194)
+      %tmp58_63 = tt.load %tmp58_62, %r0_mask_54, %cst_15 evictionPolicy = evict_last : tensor<1x4x!tt.ptr<bf16>, #blocked1> loc(#loc214)
+      %tmp58_64 = arith.extf %tmp58_63 : tensor<1x4xbf16, #blocked1> to tensor<1x4xf32, #blocked1> loc(#loc215)
+      %tmp63_65 = arith.addi %tmp50_55, %tmp63_44 : tensor<64x4xi32, #blocked1> loc(#loc196)
+      %tmp63_66 = tt.addptr %tmp63_45, %tmp63_65 : tensor<64x4x!tt.ptr<f32>, #blocked1>, tensor<64x4xi32, #blocked1> loc(#loc197)
+      %tmp63_67 = tt.load %tmp63_66, %tmp50_59, %cst_12 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<f32>, #blocked1> loc(#loc216)
+      %tmp66_68 = tt.addptr %tmp66, %tmp63_65 : tensor<64x4x!tt.ptr<f32>, #blocked1>, tensor<64x4xi32, #blocked1> loc(#loc198)
+      %tmp66_69 = tt.load %tmp66_68, %tmp50_59, %cst_12 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<f32>, #blocked1> loc(#loc217)
+      %tmp66_70 = ttg.convert_layout %tmp66_69 : tensor<64x4xf32, #blocked1> -> tensor<64x4xf32, #blocked> loc(#loc217)
+      %tmp96 = arith.addi %r0_index_53, %cst_14 : tensor<1x4xi32, #blocked1> loc(#loc218)
+      %tmp96_71 = tt.broadcast %tmp96 : tensor<1x4xi32, #blocked1> -> tensor<64x4xi32, #blocked1> loc(#loc219)
+      %tmp96_72 = arith.addi %tmp96_71, %tmp0_33 : tensor<64x4xi32, #blocked1> loc(#loc219)
+      %tmp96_73 = arith.addi %tmp96_72, %tmp0_35 : tensor<64x4xi32, #blocked1> loc(#loc220)
+      %tmp96_74 = tt.addptr %tmp0_36, %tmp96_73 : tensor<64x4x!tt.ptr<bf16>, #blocked1>, tensor<64x4xi32, #blocked1> loc(#loc221)
+      %tmp96_75 = tt.load %tmp96_74, %tmp50_59, %cst_16 evictionPolicy = evict_first : tensor<64x4x!tt.ptr<bf16>, #blocked1> loc(#loc222)
+      %tmp96_76 = arith.extf %tmp96_75 : tensor<64x4xbf16, #blocked1> to tensor<64x4xf32, #blocked1> loc(#loc223)
+      %tmp102_77 = tt.addptr %tmp102_46, %r0_index_53 : tensor<1x4x!tt.ptr<bf16>, #blocked1>, tensor<1x4xi32, #blocked1> loc(#loc199)
+      %tmp102_78 = tt.load %tmp102_77, %r0_mask_54, %cst_15 evictionPolicy = evict_last : tensor<1x4x!tt.ptr<bf16>, #blocked1> loc(#loc224)
+      %tmp102_79 = arith.extf %tmp102_78 : tensor<1x4xbf16, #blocked1> to tensor<1x4xf32, #blocked1> loc(#loc225)
+      %tmp16 = arith.extsi %r0_3 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked> loc(#loc226)
+      %tmp16_80 = arith.cmpi slt, %tmp16, %cst_1 : tensor<1x4xi64, #blocked> loc(#loc226)
+      %tmp17 = arith.muli %r0_4, %cst_2 : tensor<1x4xi32, #blocked> loc(#loc227)
+      %tmp17_81 = arith.addi %tmp17, %cst_0 : tensor<1x4xi32, #blocked> loc(#loc228)
+      %tmp17_82 = tt.broadcast %tmp17_81 : tensor<1x4xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc229)
+      %tmp17_83 = arith.addi %tmp17_82, %tmp50_39 : tensor<64x4xi32, #blocked> loc(#loc229)
+      %tmp17_84 = arith.addi %tmp17_83, %tmp50_41 : tensor<64x4xi32, #blocked> loc(#loc230)
+      %tmp17_85 = tt.addptr %tmp50_42, %tmp17_84 : tensor<64x4x!tt.ptr<bf16>, #blocked>, tensor<64x4xi32, #blocked> loc(#loc231)
+      %tmp17_86 = arith.andi %r0_mask, %tmp16_80 : tensor<1x4xi1, #blocked> loc(#loc232)
+      %tmp17_87 = tt.broadcast %tmp17_86 : tensor<1x4xi1, #blocked> -> tensor<64x4xi1, #blocked> loc(#loc233)
+      %tmp17_88 = tt.load %tmp17_85, %tmp17_87, %cst_11 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>, #blocked> loc(#loc233)
+      %tmp17_89 = arith.extf %tmp17_88 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc234)
+      %tmp24_90 = arith.mulf %tmp17_89, %tmp24_47 : tensor<64x4xf32, #blocked> loc(#loc203)
+      %tmp25 = tt.addptr %tmp58, %tmp17_81 : tensor<1x4x!tt.ptr<bf16>, #blocked>, tensor<1x4xi32, #blocked> loc(#loc235)
+      %tmp25_91 = tt.broadcast %tmp25 : tensor<1x4x!tt.ptr<bf16>, #blocked> -> tensor<64x4x!tt.ptr<bf16>, #blocked> loc(#loc235)
+      %tmp25_92 = tt.load %tmp25_91, %tmp17_87, %cst_11 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>, #blocked> loc(#loc236)
+      %tmp25_93 = arith.extf %tmp25_92 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc237)
+      %tmp27 = arith.mulf %tmp24_90, %tmp25_93 : tensor<64x4xf32, #blocked> loc(#loc238)
+      %tmp29 = arith.subf %cst_19, %tmp27 : tensor<64x4xf32, #blocked> loc(#loc239)
+      %tmp31 = tt.broadcast %tmp16_80 : tensor<1x4xi1, #blocked> -> tensor<64x4xi1, #blocked> loc(#loc240)
+      %tmp32 = arith.cmpi sge, %tmp16, %cst_1 : tensor<1x4xi64, #blocked> loc(#loc241)
+      %tmp35 = tt.broadcast %tmp17 : tensor<1x4xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc242)
+      %tmp35_94 = arith.addi %tmp35, %tmp50_39 : tensor<64x4xi32, #blocked> loc(#loc242)
+      %tmp35_95 = arith.addi %tmp35_94, %tmp50_41 : tensor<64x4xi32, #blocked> loc(#loc243)
+      %tmp35_96 = tt.addptr %tmp50_42, %tmp35_95 : tensor<64x4x!tt.ptr<bf16>, #blocked>, tensor<64x4xi32, #blocked> loc(#loc244)
+      %tmp35_97 = arith.andi %r0_mask, %tmp32 : tensor<1x4xi1, #blocked> loc(#loc245)
+      %tmp35_98 = tt.broadcast %tmp35_97 : tensor<1x4xi1, #blocked> -> tensor<64x4xi1, #blocked> loc(#loc246)
+      %tmp35_99 = tt.load %tmp35_96, %tmp35_98, %cst_11 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>, #blocked> loc(#loc246)
+      %tmp35_100 = arith.extf %tmp35_99 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc247)
+      %tmp42 = arith.mulf %tmp35_100, %tmp24_47 : tensor<64x4xf32, #blocked> loc(#loc248)
+      %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x4x!tt.ptr<bf16>, #blocked>, tensor<1x4xi32, #blocked> loc(#loc249)
+      %tmp43_101 = tt.broadcast %tmp43 : tensor<1x4x!tt.ptr<bf16>, #blocked> -> tensor<64x4x!tt.ptr<bf16>, #blocked> loc(#loc249)
+      %tmp43_102 = tt.load %tmp43_101, %tmp35_98, %cst_11 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>, #blocked> loc(#loc250)
+      %tmp43_103 = arith.extf %tmp43_102 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc251)
+      %tmp45 = arith.mulf %tmp42, %tmp43_103 : tensor<64x4xf32, #blocked> loc(#loc252)
+      %tmp48 = tt.broadcast %tmp32 : tensor<1x4xi1, #blocked> -> tensor<64x4xi1, #blocked> loc(#loc253)
+      %tmp48_104 = arith.select %tmp48, %tmp45, %cst_19 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> loc(#loc253)
+      %tmp49 = arith.select %tmp31, %tmp29, %tmp48_104 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> loc(#loc295)
+      %tmp57 = arith.mulf %tmp50_61, %tmp24_48 : tensor<64x4xf32, #blocked1> loc(#loc255)
+      %tmp60 = tt.broadcast %tmp58_64 : tensor<1x4xf32, #blocked1> -> tensor<64x4xf32, #blocked1> loc(#loc256)
+      %tmp60_105 = arith.mulf %tmp57, %tmp60 : tensor<64x4xf32, #blocked1> loc(#loc256)
+      %tmp64 = arith.mulf %tmp60_105, %tmp63_67 : tensor<64x4xf32, #blocked1> loc(#loc257)
+      %tmp64_106 = ttg.convert_layout %tmp64 : tensor<64x4xf32, #blocked1> -> tensor<64x4xf32, #blocked> loc(#loc257)
+      %tmp67 = arith.mulf %tmp49, %tmp66_70 : tensor<64x4xf32, #blocked> loc(#loc258)
+      %tmp68 = arith.addf %tmp64_106, %tmp67 : tensor<64x4xf32, #blocked> loc(#loc259)
+      %tmp70 = arith.addi %tmp17, %cst : tensor<1x4xi32, #blocked> loc(#loc260)
+      %tmp70_107 = tt.broadcast %tmp70 : tensor<1x4xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc261)
+      %tmp70_108 = arith.addi %tmp70_107, %tmp50_39 : tensor<64x4xi32, #blocked> loc(#loc261)
+      %tmp70_109 = arith.addi %tmp70_108, %tmp50_41 : tensor<64x4xi32, #blocked> loc(#loc262)
+      %tmp70_110 = tt.addptr %tmp50_42, %tmp70_109 : tensor<64x4x!tt.ptr<bf16>, #blocked>, tensor<64x4xi32, #blocked> loc(#loc263)
+      %tmp70_111 = tt.load %tmp70_110, %tmp17_87, %cst_11 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>, #blocked> loc(#loc264)
+      %tmp70_112 = arith.extf %tmp70_111 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc265)
+      %tmp75_113 = arith.mulf %tmp70_112, %tmp75_49 : tensor<64x4xf32, #blocked> loc(#loc207)
+      %tmp76 = tt.addptr %tmp102, %tmp17_81 : tensor<1x4x!tt.ptr<bf16>, #blocked>, tensor<1x4xi32, #blocked> loc(#loc266)
+      %tmp76_114 = tt.broadcast %tmp76 : tensor<1x4x!tt.ptr<bf16>, #blocked> -> tensor<64x4x!tt.ptr<bf16>, #blocked> loc(#loc266)
+      %tmp76_115 = tt.load %tmp76_114, %tmp17_87, %cst_11 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>, #blocked> loc(#loc267)
+      %tmp76_116 = arith.extf %tmp76_115 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc268)
+      %tmp78 = arith.mulf %tmp75_113, %tmp76_116 : tensor<64x4xf32, #blocked> loc(#loc269)
+      %tmp80 = arith.subf %cst_19, %tmp78 : tensor<64x4xf32, #blocked> loc(#loc270)
+      %tmp83 = arith.addi %tmp17, %cst_7 : tensor<1x4xi32, #blocked> loc(#loc271)
+      %tmp83_117 = tt.broadcast %tmp83 : tensor<1x4xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc272)
+      %tmp83_118 = arith.addi %tmp83_117, %tmp50_39 : tensor<64x4xi32, #blocked> loc(#loc272)
+      %tmp83_119 = arith.addi %tmp83_118, %tmp50_41 : tensor<64x4xi32, #blocked> loc(#loc273)
+      %tmp83_120 = tt.addptr %tmp50_42, %tmp83_119 : tensor<64x4x!tt.ptr<bf16>, #blocked>, tensor<64x4xi32, #blocked> loc(#loc274)
+      %tmp83_121 = tt.load %tmp83_120, %tmp35_98, %cst_11 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>, #blocked> loc(#loc275)
+      %tmp83_122 = arith.extf %tmp83_121 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc276)
+      %tmp88 = arith.mulf %tmp83_122, %tmp75_49 : tensor<64x4xf32, #blocked> loc(#loc277)
+      %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x4x!tt.ptr<bf16>, #blocked>, tensor<1x4xi32, #blocked> loc(#loc278)
+      %tmp89_123 = tt.broadcast %tmp89 : tensor<1x4x!tt.ptr<bf16>, #blocked> -> tensor<64x4x!tt.ptr<bf16>, #blocked> loc(#loc278)
+      %tmp89_124 = tt.load %tmp89_123, %tmp35_98, %cst_11 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>, #blocked> loc(#loc279)
+      %tmp89_125 = arith.extf %tmp89_124 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc280)
+      %tmp91 = arith.mulf %tmp88, %tmp89_125 : tensor<64x4xf32, #blocked> loc(#loc281)
+      %tmp94 = arith.select %tmp48, %tmp91, %cst_19 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> loc(#loc282)
+      %tmp95 = arith.select %tmp31, %tmp80, %tmp94 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> loc(#loc296)
+      %tmp101 = arith.mulf %tmp96_76, %tmp75_50 : tensor<64x4xf32, #blocked1> loc(#loc285)
+      %tmp104 = tt.broadcast %tmp102_79 : tensor<1x4xf32, #blocked1> -> tensor<64x4xf32, #blocked1> loc(#loc286)
+      %tmp104_126 = arith.mulf %tmp101, %tmp104 : tensor<64x4xf32, #blocked1> loc(#loc286)
+      %tmp107 = arith.mulf %tmp104_126, %tmp63_67 : tensor<64x4xf32, #blocked1> loc(#loc287)
+      %tmp107_127 = ttg.convert_layout %tmp107 : tensor<64x4xf32, #blocked1> -> tensor<64x4xf32, #blocked> loc(#loc287)
+      %tmp109 = arith.mulf %tmp95, %tmp66_70 : tensor<64x4xf32, #blocked> loc(#loc288)
+      %tmp110 = arith.addf %tmp107_127, %tmp109 : tensor<64x4xf32, #blocked> loc(#loc289)
+      %4 = arith.addi %tmp50_55, %1 : tensor<64x4xi32, #blocked1> loc(#loc58)
+      %5 = tt.addptr %2, %4 : tensor<64x4x!tt.ptr<bf16>, #blocked1>, tensor<64x4xi32, #blocked1> loc(#loc59)
+      %6 = arith.truncf %tmp68 : tensor<64x4xf32, #blocked> to tensor<64x4xbf16, #blocked> loc(#loc144)
+      %7 = ttg.convert_layout %6 : tensor<64x4xbf16, #blocked> -> tensor<64x4xbf16, #blocked1> loc(#loc144)
+      tt.store %5, %7, %tmp50_59 : tensor<64x4x!tt.ptr<bf16>, #blocked1> loc(#loc144)
+      %8 = tt.addptr %3, %4 : tensor<64x4x!tt.ptr<bf16>, #blocked1>, tensor<64x4xi32, #blocked1> loc(#loc60)
+      %9 = arith.truncf %tmp110 : tensor<64x4xf32, #blocked> to tensor<64x4xbf16, #blocked> loc(#loc145)
+      %10 = ttg.convert_layout %9 : tensor<64x4xbf16, #blocked> -> tensor<64x4xbf16, #blocked1> loc(#loc145)
+      tt.store %8, %10, %tmp50_59 : tensor<64x4x!tt.ptr<bf16>, #blocked1> loc(#loc145)
+    } loc(#loc61)
+    tt.return loc(#loc146)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8)
+#loc32 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc156 = loc("xoffset"(#loc2))
+#loc157 = loc("xoffset"(#loc3))
+#loc158 = loc("xindex"(#loc4))
+#loc159 = loc("xindex"(#loc5))
+#loc160 = loc("r0_base"(#loc6))
+#loc161 = loc("x0"(#loc7))
+#loc162 = loc("x1"(#loc8))
+#loc163 = loc("tmp0"(#loc9))
+#loc164 = loc("tmp0"(#loc10))
+#loc165 = loc("tmp0"(#loc11))
+#loc166 = loc("tmp0"(#loc12))
+#loc167 = loc("tmp0"(#loc13))
+#loc168 = loc("_tmp4"(#loc14))
+#loc169 = loc("r0_index"(#loc15))
+#loc170 = loc("r0_mask"(#loc16))
+#loc171 = loc("tmp0"(#loc17))
+#loc172 = loc("tmp0"(#loc18))
+#loc173 = loc("tmp0"(#loc19))
+#loc174 = loc("tmp6"(#loc20))
+#loc175 = loc("tmp6"(#loc21))
+#loc176 = loc("tmp6"(#loc22))
+#loc177 = loc("tmp6"(#loc23))
+#loc178 = loc("tmp6"(#loc24))
+#loc179 = loc("tmp2"(#loc25))
+#loc180 = loc("tmp5"(#loc26))
+#loc181 = loc("_tmp4"(#loc27))
+#loc182 = loc("tmp8"(#loc28))
+#loc183 = loc("tmp11"(#loc29))
+#loc184 = loc("_tmp10"(#loc30))
+#loc186 = loc("tmp4"(#loc35))
+#loc188 = loc("tmp10"(#loc37))
+#loc189 = loc("tmp50"(#loc38))
+#loc190 = loc("tmp50"(#loc39))
+#loc191 = loc("tmp50"(#loc40))
+#loc192 = loc("tmp50"(#loc41))
+#loc193 = loc("tmp50"(#loc42))
+#loc194 = loc("tmp58"(#loc43))
+#loc195 = loc("tmp63"(#loc44))
+#loc196 = loc("tmp63"(#loc45))
+#loc197 = loc("tmp63"(#loc46))
+#loc198 = loc("tmp66"(#loc47))
+#loc199 = loc("tmp102"(#loc48))
+#loc200 = loc("tmp20"(#loc49))
+#loc201 = loc("tmp22"(#loc50))
+#loc202 = loc("tmp23"(#loc51))
+#loc203 = loc("tmp24"(#loc52))
+#loc204 = loc("tmp72"(#loc53))
+#loc205 = loc("tmp73"(#loc54))
+#loc206 = loc("tmp74"(#loc55))
+#loc207 = loc("tmp75"(#loc56))
+#loc208 = loc("r0_index"(#loc62))
+#loc209 = loc("r0_mask"(#loc63))
+#loc210 = loc("r0_3"(#loc64))
+#loc211 = loc("r0_4"(#loc65))
+#loc212 = loc("tmp50"(#loc66))
+#loc213 = loc("tmp50"(#loc67))
+#loc214 = loc("tmp58"(#loc68))
+#loc215 = loc("tmp58"(#loc69))
+#loc216 = loc("tmp63"(#loc70))
+#loc217 = loc("tmp66"(#loc71))
+#loc218 = loc("tmp96"(#loc72))
+#loc219 = loc("tmp96"(#loc73))
+#loc220 = loc("tmp96"(#loc74))
+#loc221 = loc("tmp96"(#loc75))
+#loc222 = loc("tmp96"(#loc76))
+#loc223 = loc("tmp96"(#loc77))
+#loc224 = loc("tmp102"(#loc78))
+#loc225 = loc("tmp102"(#loc79))
+#loc226 = loc("tmp16"(#loc80))
+#loc227 = loc("tmp17"(#loc81))
+#loc228 = loc("tmp17"(#loc82))
+#loc229 = loc("tmp17"(#loc83))
+#loc230 = loc("tmp17"(#loc84))
+#loc231 = loc("tmp17"(#loc85))
+#loc232 = loc("tmp17"(#loc86))
+#loc233 = loc("tmp17"(#loc87))
+#loc234 = loc("tmp17"(#loc88))
+#loc235 = loc("tmp25"(#loc89))
+#loc236 = loc("tmp25"(#loc90))
+#loc237 = loc("tmp25"(#loc91))
+#loc238 = loc("tmp27"(#loc92))
+#loc239 = loc("tmp29"(#loc93))
+#loc240 = loc("tmp31"(#loc94))
+#loc241 = loc("tmp32"(#loc95))
+#loc242 = loc("tmp35"(#loc96))
+#loc243 = loc("tmp35"(#loc97))
+#loc244 = loc("tmp35"(#loc98))
+#loc245 = loc("tmp35"(#loc99))
+#loc246 = loc("tmp35"(#loc100))
+#loc247 = loc("tmp35"(#loc101))
+#loc248 = loc("tmp42"(#loc102))
+#loc249 = loc("tmp43"(#loc103))
+#loc250 = loc("tmp43"(#loc104))
+#loc251 = loc("tmp43"(#loc105))
+#loc252 = loc("tmp45"(#loc106))
+#loc253 = loc("tmp48"(#loc107))
+#loc254 = loc("tmp49"(#loc108))
+#loc255 = loc("tmp57"(#loc109))
+#loc256 = loc("tmp60"(#loc110))
+#loc257 = loc("tmp64"(#loc111))
+#loc258 = loc("tmp67"(#loc112))
+#loc259 = loc("tmp68"(#loc113))
+#loc260 = loc("tmp70"(#loc114))
+#loc261 = loc("tmp70"(#loc115))
+#loc262 = loc("tmp70"(#loc116))
+#loc263 = loc("tmp70"(#loc117))
+#loc264 = loc("tmp70"(#loc118))
+#loc265 = loc("tmp70"(#loc119))
+#loc266 = loc("tmp76"(#loc120))
+#loc267 = loc("tmp76"(#loc121))
+#loc268 = loc("tmp76"(#loc122))
+#loc269 = loc("tmp78"(#loc123))
+#loc270 = loc("tmp80"(#loc124))
+#loc271 = loc("tmp83"(#loc125))
+#loc272 = loc("tmp83"(#loc126))
+#loc273 = loc("tmp83"(#loc127))
+#loc274 = loc("tmp83"(#loc128))
+#loc275 = loc("tmp83"(#loc129))
+#loc276 = loc("tmp83"(#loc130))
+#loc277 = loc("tmp88"(#loc131))
+#loc278 = loc("tmp89"(#loc132))
+#loc279 = loc("tmp89"(#loc133))
+#loc280 = loc("tmp89"(#loc134))
+#loc281 = loc("tmp91"(#loc135))
+#loc282 = loc("tmp94"(#loc136))
+#loc283 = loc("tmp95"(#loc137))
+#loc284 = loc("tmp82"(#loc138))
+#loc285 = loc("tmp101"(#loc139))
+#loc286 = loc("tmp104"(#loc140))
+#loc287 = loc("tmp107"(#loc141))
+#loc288 = loc("tmp109"(#loc142))
+#loc289 = loc("tmp110"(#loc143))
+#loc290 = loc("_tmp10"(#loc168))
+#loc291 = loc(callsite(#loc32 at #loc185))
+#loc293 = loc(callsite(#loc32 at #loc187))
+#loc295 = loc(fused[#loc254, #loc240])
+#loc296 = loc(fused[#loc283, #loc284])
+#loc297 = loc(callsite(#loc34 at #loc291))
+#loc298 = loc(callsite(#loc34 at #loc293))
diff --git a/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..998230c6cf24ff4e180b20c65181e154eb0a780b
--- /dev/null
+++ b/triton/3QQ2VKPC7ZTVHWIARO636O4SXEBYGRIJVMN3RDKURPDAUEL7TMAQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir
@@ -0,0 +1,520 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc1 = loc(unknown)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc149 = loc("in_out_ptr0"(#loc))
+#loc150 = loc("in_out_ptr1"(#loc))
+#loc151 = loc("in_ptr0"(#loc))
+#loc152 = loc("in_ptr1"(#loc))
+#loc153 = loc("in_ptr2"(#loc))
+#loc154 = loc("in_ptr3"(#loc))
+#loc155 = loc("in_ptr4"(#loc))
+#loc156 = loc("xnumel"(#loc))
+#loc157 = loc("r0_numel"(#loc))
+#loc189 = loc("tmp4"(#loc35))
+#loc191 = loc("tmp10"(#loc38))
+#loc296 = loc(callsite(#loc1 at #loc189))
+#loc298 = loc(callsite(#loc1 at #loc191))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<1x4xbf16> loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x4xbf16> loc(#loc1)
+    %c4_i32 = arith.constant 4 : i32 loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %cst_1 = arith.constant dense<4097> : tensor<1x4xi32> loc(#loc1)
+    %cst_2 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc1)
+    %cst_3 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<1> : tensor<1x4xi32> loc(#loc1)
+    %cst_5 = arith.constant dense<1> : tensor<1x4xi64> loc(#loc1)
+    %cst_6 = arith.constant dense<2> : tensor<1x4xi32> loc(#loc1)
+    %cst_7 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc1)
+    %cst_8 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1)
+    %cst_9 = arith.constant dense<4096> : tensor<1x4xi32> loc(#loc1)
+    %cst_10 = arith.constant dense<128> : tensor<1x4xi32> loc(#loc1)
+    %cst_11 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc1)
+    %cst_12 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc158)
+    %xoffset_13 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc159)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc160)
+    %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc161)
+    %xindex_15 = tt.splat %xoffset_13 : i32 -> tensor<64x1xi32> loc(#loc162)
+    %xindex_16 = arith.addi %xindex_15, %xindex_14 : tensor<64x1xi32> loc(#loc162)
+    %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc163)
+    %r0_base_17 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc164)
+    %x0 = arith.remsi %xindex_16, %cst_12 : tensor<64x1xi32> loc(#loc165)
+    %x1 = arith.divsi %xindex_16, %cst_12 : tensor<64x1xi32> loc(#loc166)
+    %_tmp10:2 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32 iter_args(%_tmp4 = %cst_11, %_tmp10_20 = %cst_11) -> (tensor<64x4xf32>, tensor<64x4xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc168)
+      %r0_index_21 = arith.addi %r0_index, %r0_base_17 : tensor<1x4xi32> loc(#loc168)
+      %r0_mask = arith.cmpi slt, %r0_index_21, %cst_10 : tensor<1x4xi32> loc(#loc169)
+      %tmp0 = arith.addi %r0_index_21, %cst_9 : tensor<1x4xi32> loc(#loc170)
+      %tmp0_22 = arith.muli %x0, %cst_8 : tensor<64x1xi32> loc(#loc171)
+      %tmp0_23 = tt.broadcast %tmp0 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc172)
+      %tmp0_24 = tt.broadcast %tmp0_22 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc172)
+      %tmp0_25 = arith.addi %tmp0_23, %tmp0_24 : tensor<64x4xi32> loc(#loc172)
+      %tmp0_26 = arith.muli %x1, %cst_7 : tensor<64x1xi32> loc(#loc173)
+      %tmp0_27 = tt.broadcast %tmp0_26 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc174)
+      %tmp0_28 = arith.addi %tmp0_25, %tmp0_27 : tensor<64x4xi32> loc(#loc174)
+      %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc175)
+      %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc175)
+      %tmp0_31 = tt.broadcast %r0_mask : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc176)
+      %tmp0_32 = tt.load %tmp0_30, %tmp0_31, %cst_0 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>> loc(#loc176)
+      %tmp0_33 = arith.extf %tmp0_32 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc177)
+      %tmp6 = tt.broadcast %r0_index_21 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc178)
+      %tmp6_34 = arith.addi %tmp6, %tmp0_24 : tensor<64x4xi32> loc(#loc178)
+      %tmp6_35 = arith.addi %tmp6_34, %tmp0_27 : tensor<64x4xi32> loc(#loc179)
+      %tmp6_36 = tt.addptr %tmp0_29, %tmp6_35 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc180)
+      %tmp6_37 = tt.load %tmp6_36, %tmp0_31, %cst_0 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>> loc(#loc181)
+      %tmp6_38 = arith.extf %tmp6_37 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc182)
+      %tmp2 = arith.mulf %tmp0_33, %tmp0_33 : tensor<64x4xf32> loc(#loc183)
+      %tmp5 = arith.addf %_tmp4, %tmp2 : tensor<64x4xf32> loc(#loc184)
+      %_tmp4_39 = arith.select %tmp0_31, %tmp5, %_tmp4 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc185)
+      %tmp8 = arith.mulf %tmp6_38, %tmp6_38 : tensor<64x4xf32> loc(#loc186)
+      %tmp11 = arith.addf %_tmp10_20, %tmp8 : tensor<64x4xf32> loc(#loc187)
+      %_tmp10_40 = arith.select %tmp0_31, %tmp11, %_tmp10_20 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc188)
+      scf.yield %_tmp4_39, %_tmp10_40 : tensor<64x4xf32>, tensor<64x4xf32> loc(#loc33)
+    } loc(#loc294)
+    %tmp4 = "tt.reduce"(%_tmp10#0) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_20: f32 loc(callsite(#loc1 at #loc189)), %tmp4_21: f32 loc(callsite(#loc1 at #loc189))):
+      %tmp4_22 = arith.addf %tmp4_20, %tmp4_21 : f32 loc(#loc299)
+      tt.reduce.return %tmp4_22 : f32 loc(#loc295)
+    }) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc295)
+    %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc190)
+    %tmp10 = "tt.reduce"(%_tmp10#1) <{axis = 1 : i32}> ({
+    ^bb0(%tmp10_20: f32 loc(callsite(#loc1 at #loc191)), %tmp10_21: f32 loc(callsite(#loc1 at #loc191))):
+      %tmp10_22 = arith.addf %tmp10_20, %tmp10_21 : f32 loc(#loc300)
+      tt.reduce.return %tmp10_22 : f32 loc(#loc297)
+    }) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc297)
+    %tmp10_19 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc192)
+    scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc193)
+      %r0_index_20 = arith.addi %r0_index, %r0_base_17 : tensor<1x4xi32> loc(#loc193)
+      %r0_mask = arith.cmpi slt, %r0_index_20, %cst_10 : tensor<1x4xi32> loc(#loc194)
+      %r0_3 = arith.remsi %r0_index_20, %cst_6 : tensor<1x4xi32> loc(#loc195)
+      %r0_4 = arith.divsi %r0_index_20, %cst_6 : tensor<1x4xi32> loc(#loc196)
+      %tmp50 = arith.muli %x0, %cst_8 : tensor<64x1xi32> loc(#loc197)
+      %tmp50_21 = tt.broadcast %r0_index_20 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc198)
+      %tmp50_22 = tt.broadcast %tmp50 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc198)
+      %tmp50_23 = arith.addi %tmp50_21, %tmp50_22 : tensor<64x4xi32> loc(#loc198)
+      %tmp50_24 = arith.muli %x1, %cst_7 : tensor<64x1xi32> loc(#loc199)
+      %tmp50_25 = tt.broadcast %tmp50_24 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc200)
+      %tmp50_26 = arith.addi %tmp50_23, %tmp50_25 : tensor<64x4xi32> loc(#loc200)
+      %tmp50_27 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc201)
+      %tmp50_28 = tt.addptr %tmp50_27, %tmp50_26 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc201)
+      %tmp50_29 = tt.broadcast %r0_mask : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc202)
+      %tmp50_30 = tt.load %tmp50_28, %tmp50_29, %cst_0 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>> loc(#loc202)
+      %tmp50_31 = arith.extf %tmp50_30 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc203)
+      %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x4x!tt.ptr<bf16>> loc(#loc204)
+      %tmp58_32 = tt.addptr %tmp58, %r0_index_20 : tensor<1x4x!tt.ptr<bf16>>, tensor<1x4xi32> loc(#loc204)
+      %tmp58_33 = tt.load %tmp58_32, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x4x!tt.ptr<bf16>> loc(#loc205)
+      %tmp58_34 = arith.extf %tmp58_33 : tensor<1x4xbf16> to tensor<1x4xf32> loc(#loc206)
+      %tmp63 = arith.muli %x1, %cst_8 : tensor<64x1xi32> loc(#loc207)
+      %tmp63_35 = tt.broadcast %tmp63 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc208)
+      %tmp63_36 = arith.addi %tmp50_21, %tmp63_35 : tensor<64x4xi32> loc(#loc208)
+      %tmp63_37 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<64x4x!tt.ptr<f32>> loc(#loc209)
+      %tmp63_38 = tt.addptr %tmp63_37, %tmp63_36 : tensor<64x4x!tt.ptr<f32>>, tensor<64x4xi32> loc(#loc209)
+      %tmp63_39 = tt.load %tmp63_38, %tmp50_29, %cst_11 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<f32>> loc(#loc210)
+      %tmp66 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<64x4x!tt.ptr<f32>> loc(#loc211)
+      %tmp66_40 = tt.addptr %tmp66, %tmp63_36 : tensor<64x4x!tt.ptr<f32>>, tensor<64x4xi32> loc(#loc211)
+      %tmp66_41 = tt.load %tmp66_40, %tmp50_29, %cst_11 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<f32>> loc(#loc212)
+      %tmp96 = arith.addi %r0_index_20, %cst_9 : tensor<1x4xi32> loc(#loc213)
+      %tmp96_42 = tt.broadcast %tmp96 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc214)
+      %tmp96_43 = arith.addi %tmp96_42, %tmp50_22 : tensor<64x4xi32> loc(#loc214)
+      %tmp96_44 = arith.addi %tmp96_43, %tmp50_25 : tensor<64x4xi32> loc(#loc215)
+      %tmp96_45 = tt.addptr %tmp50_27, %tmp96_44 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc216)
+      %tmp96_46 = tt.load %tmp96_45, %tmp50_29, %cst_0 evictionPolicy = evict_first : tensor<64x4x!tt.ptr<bf16>> loc(#loc217)
+      %tmp96_47 = arith.extf %tmp96_46 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc218)
+      %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x4x!tt.ptr<bf16>> loc(#loc219)
+      %tmp102_48 = tt.addptr %tmp102, %r0_index_20 : tensor<1x4x!tt.ptr<bf16>>, tensor<1x4xi32> loc(#loc219)
+      %tmp102_49 = tt.load %tmp102_48, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x4x!tt.ptr<bf16>> loc(#loc220)
+      %tmp102_50 = arith.extf %tmp102_49 : tensor<1x4xbf16> to tensor<1x4xf32> loc(#loc221)
+      %tmp16 = arith.extsi %r0_3 : tensor<1x4xi32> to tensor<1x4xi64> loc(#loc222)
+      %tmp16_51 = arith.cmpi slt, %tmp16, %cst_5 : tensor<1x4xi64> loc(#loc222)
+      %tmp17 = arith.muli %r0_4, %cst_6 : tensor<1x4xi32> loc(#loc223)
+      %tmp17_52 = arith.addi %tmp17, %cst_4 : tensor<1x4xi32> loc(#loc224)
+      %tmp17_53 = tt.broadcast %tmp17_52 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc225)
+      %tmp17_54 = arith.addi %tmp17_53, %tmp50_22 : tensor<64x4xi32> loc(#loc225)
+      %tmp17_55 = arith.addi %tmp17_54, %tmp50_25 : tensor<64x4xi32> loc(#loc226)
+      %tmp17_56 = tt.addptr %tmp50_27, %tmp17_55 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc227)
+      %tmp17_57 = arith.andi %r0_mask, %tmp16_51 : tensor<1x4xi1> loc(#loc228)
+      %tmp17_58 = tt.broadcast %tmp17_57 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc229)
+      %tmp17_59 = tt.load %tmp17_56, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>> loc(#loc229)
+      %tmp17_60 = arith.extf %tmp17_59 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc230)
+      %tmp20 = arith.divf %tmp10_19, %cst_3 : tensor<64x1xf32> loc(#loc231)
+      %tmp22 = arith.addf %tmp20, %cst_2 : tensor<64x1xf32> loc(#loc232)
+      %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc233)
+      %tmp24 = tt.broadcast %tmp23 : tensor<64x1xf32> -> tensor<64x4xf32> loc(#loc234)
+      %tmp24_61 = arith.mulf %tmp17_60, %tmp24 : tensor<64x4xf32> loc(#loc234)
+      %tmp25 = tt.addptr %tmp58, %tmp17_52 : tensor<1x4x!tt.ptr<bf16>>, tensor<1x4xi32> loc(#loc235)
+      %tmp25_62 = tt.broadcast %tmp25 : tensor<1x4x!tt.ptr<bf16>> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc235)
+      %tmp25_63 = tt.load %tmp25_62, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>> loc(#loc236)
+      %tmp25_64 = arith.extf %tmp25_63 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc237)
+      %tmp27 = arith.mulf %tmp24_61, %tmp25_64 : tensor<64x4xf32> loc(#loc238)
+      %tmp29 = arith.subf %cst_11, %tmp27 : tensor<64x4xf32> loc(#loc239)
+      %tmp31 = tt.broadcast %tmp16_51 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc240)
+      %tmp31_65 = arith.select %tmp31, %tmp29, %cst_11 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc240)
+      %tmp32 = arith.cmpi sge, %tmp16, %cst_5 : tensor<1x4xi64> loc(#loc241)
+      %tmp35 = tt.broadcast %tmp17 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc242)
+      %tmp35_66 = arith.addi %tmp35, %tmp50_22 : tensor<64x4xi32> loc(#loc242)
+      %tmp35_67 = arith.addi %tmp35_66, %tmp50_25 : tensor<64x4xi32> loc(#loc243)
+      %tmp35_68 = tt.addptr %tmp50_27, %tmp35_67 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc244)
+      %tmp35_69 = arith.andi %r0_mask, %tmp32 : tensor<1x4xi1> loc(#loc245)
+      %tmp35_70 = tt.broadcast %tmp35_69 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc246)
+      %tmp35_71 = tt.load %tmp35_68, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>> loc(#loc246)
+      %tmp35_72 = arith.extf %tmp35_71 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc247)
+      %tmp42 = arith.mulf %tmp35_72, %tmp24 : tensor<64x4xf32> loc(#loc248)
+      %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x4x!tt.ptr<bf16>>, tensor<1x4xi32> loc(#loc249)
+      %tmp43_73 = tt.broadcast %tmp43 : tensor<1x4x!tt.ptr<bf16>> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc249)
+      %tmp43_74 = tt.load %tmp43_73, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>> loc(#loc250)
+      %tmp43_75 = arith.extf %tmp43_74 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc251)
+      %tmp45 = arith.mulf %tmp42, %tmp43_75 : tensor<64x4xf32> loc(#loc252)
+      %tmp48 = tt.broadcast %tmp32 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc253)
+      %tmp48_76 = arith.select %tmp48, %tmp45, %cst_11 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc253)
+      %tmp49 = arith.select %tmp31, %tmp31_65, %tmp48_76 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc254)
+      %tmp57 = arith.mulf %tmp50_31, %tmp24 : tensor<64x4xf32> loc(#loc255)
+      %tmp60 = tt.broadcast %tmp58_34 : tensor<1x4xf32> -> tensor<64x4xf32> loc(#loc256)
+      %tmp60_77 = arith.mulf %tmp57, %tmp60 : tensor<64x4xf32> loc(#loc256)
+      %tmp64 = arith.mulf %tmp60_77, %tmp63_39 : tensor<64x4xf32> loc(#loc257)
+      %tmp67 = arith.mulf %tmp49, %tmp66_41 : tensor<64x4xf32> loc(#loc258)
+      %tmp68 = arith.addf %tmp64, %tmp67 : tensor<64x4xf32> loc(#loc259)
+      %tmp70 = arith.addi %tmp17, %cst_1 : tensor<1x4xi32> loc(#loc260)
+      %tmp70_78 = tt.broadcast %tmp70 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc261)
+      %tmp70_79 = arith.addi %tmp70_78, %tmp50_22 : tensor<64x4xi32> loc(#loc261)
+      %tmp70_80 = arith.addi %tmp70_79, %tmp50_25 : tensor<64x4xi32> loc(#loc262)
+      %tmp70_81 = tt.addptr %tmp50_27, %tmp70_80 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc263)
+      %tmp70_82 = tt.load %tmp70_81, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>> loc(#loc264)
+      %tmp70_83 = arith.extf %tmp70_82 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc265)
+      %tmp72 = arith.divf %tmp4_18, %cst_3 : tensor<64x1xf32> loc(#loc266)
+      %tmp73 = arith.addf %tmp72, %cst_2 : tensor<64x1xf32> loc(#loc267)
+      %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc268)
+      %tmp75 = tt.broadcast %tmp74 : tensor<64x1xf32> -> tensor<64x4xf32> loc(#loc269)
+      %tmp75_84 = arith.mulf %tmp70_83, %tmp75 : tensor<64x4xf32> loc(#loc269)
+      %tmp76 = tt.addptr %tmp102, %tmp17_52 : tensor<1x4x!tt.ptr<bf16>>, tensor<1x4xi32> loc(#loc270)
+      %tmp76_85 = tt.broadcast %tmp76 : tensor<1x4x!tt.ptr<bf16>> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc270)
+      %tmp76_86 = tt.load %tmp76_85, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>> loc(#loc271)
+      %tmp76_87 = arith.extf %tmp76_86 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc272)
+      %tmp78 = arith.mulf %tmp75_84, %tmp76_87 : tensor<64x4xf32> loc(#loc273)
+      %tmp80 = arith.subf %cst_11, %tmp78 : tensor<64x4xf32> loc(#loc274)
+      %tmp82 = arith.select %tmp31, %tmp80, %cst_11 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc275)
+      %tmp83 = arith.addi %tmp17, %cst_9 : tensor<1x4xi32> loc(#loc276)
+      %tmp83_88 = tt.broadcast %tmp83 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc277)
+      %tmp83_89 = arith.addi %tmp83_88, %tmp50_22 : tensor<64x4xi32> loc(#loc277)
+      %tmp83_90 = arith.addi %tmp83_89, %tmp50_25 : tensor<64x4xi32> loc(#loc278)
+      %tmp83_91 = tt.addptr %tmp50_27, %tmp83_90 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc279)
+      %tmp83_92 = tt.load %tmp83_91, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>> loc(#loc280)
+      %tmp83_93 = arith.extf %tmp83_92 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc281)
+      %tmp88 = arith.mulf %tmp83_93, %tmp75 : tensor<64x4xf32> loc(#loc282)
+      %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x4x!tt.ptr<bf16>>, tensor<1x4xi32> loc(#loc283)
+      %tmp89_94 = tt.broadcast %tmp89 : tensor<1x4x!tt.ptr<bf16>> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc283)
+      %tmp89_95 = tt.load %tmp89_94, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<64x4x!tt.ptr<bf16>> loc(#loc284)
+      %tmp89_96 = arith.extf %tmp89_95 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc285)
+      %tmp91 = arith.mulf %tmp88, %tmp89_96 : tensor<64x4xf32> loc(#loc286)
+      %tmp94 = arith.select %tmp48, %tmp91, %cst_11 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc287)
+      %tmp95 = arith.select %tmp31, %tmp82, %tmp94 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc288)
+      %tmp101 = arith.mulf %tmp96_47, %tmp75 : tensor<64x4xf32> loc(#loc289)
+      %tmp104 = tt.broadcast %tmp102_50 : tensor<1x4xf32> -> tensor<64x4xf32> loc(#loc290)
+      %tmp104_97 = arith.mulf %tmp101, %tmp104 : tensor<64x4xf32> loc(#loc290)
+      %tmp107 = arith.mulf %tmp104_97, %tmp63_39 : tensor<64x4xf32> loc(#loc291)
+      %tmp109 = arith.mulf %tmp95, %tmp66_41 : tensor<64x4xf32> loc(#loc292)
+      %tmp110 = arith.addf %tmp107, %tmp109 : tensor<64x4xf32> loc(#loc293)
+      %0 = arith.muli %xindex_16, %cst_8 : tensor<64x1xi32> loc(#loc142)
+      %1 = tt.broadcast %0 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc143)
+      %2 = arith.addi %tmp50_21, %1 : tensor<64x4xi32> loc(#loc143)
+      %3 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc144)
+      %4 = tt.addptr %3, %2 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc144)
+      %5 = arith.truncf %tmp68 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc145)
+      tt.store %4, %5, %tmp50_29 : tensor<64x4x!tt.ptr<bf16>> loc(#loc145)
+      %6 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc146)
+      %7 = tt.addptr %6, %2 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc146)
+      %8 = arith.truncf %tmp110 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc147)
+      tt.store %7, %8, %tmp50_29 : tensor<64x4x!tt.ptr<bf16>> loc(#loc147)
+    } loc(#loc40)
+    tt.return loc(#loc148)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc158 = loc("xoffset"(#loc2))
+#loc159 = loc("xoffset"(#loc3))
+#loc160 = loc("xindex"(#loc4))
+#loc161 = loc("xindex"(#loc5))
+#loc162 = loc("xindex"(#loc6))
+#loc163 = loc("r0_base"(#loc7))
+#loc164 = loc("r0_base"(#loc8))
+#loc165 = loc("x0"(#loc9))
+#loc166 = loc("x1"(#loc10))
+#loc167 = loc("_tmp4"(#loc11))
+#loc168 = loc("r0_index"(#loc12))
+#loc169 = loc("r0_mask"(#loc13))
+#loc170 = loc("tmp0"(#loc14))
+#loc171 = loc("tmp0"(#loc15))
+#loc172 = loc("tmp0"(#loc16))
+#loc173 = loc("tmp0"(#loc17))
+#loc174 = loc("tmp0"(#loc18))
+#loc175 = loc("tmp0"(#loc19))
+#loc176 = loc("tmp0"(#loc20))
+#loc177 = loc("tmp0"(#loc21))
+#loc178 = loc("tmp6"(#loc22))
+#loc179 = loc("tmp6"(#loc23))
+#loc180 = loc("tmp6"(#loc24))
+#loc181 = loc("tmp6"(#loc25))
+#loc182 = loc("tmp6"(#loc26))
+#loc183 = loc("tmp2"(#loc27))
+#loc184 = loc("tmp5"(#loc28))
+#loc185 = loc("_tmp4"(#loc29))
+#loc186 = loc("tmp8"(#loc30))
+#loc187 = loc("tmp11"(#loc31))
+#loc188 = loc("_tmp10"(#loc32))
+#loc190 = loc("tmp4"(#loc37))
+#loc192 = loc("tmp10"(#loc39))
+#loc193 = loc("r0_index"(#loc41))
+#loc194 = loc("r0_mask"(#loc42))
+#loc195 = loc("r0_3"(#loc43))
+#loc196 = loc("r0_4"(#loc44))
+#loc197 = loc("tmp50"(#loc45))
+#loc198 = loc("tmp50"(#loc46))
+#loc199 = loc("tmp50"(#loc47))
+#loc200 = loc("tmp50"(#loc48))
+#loc201 = loc("tmp50"(#loc49))
+#loc202 = loc("tmp50"(#loc50))
+#loc203 = loc("tmp50"(#loc51))
+#loc204 = loc("tmp58"(#loc52))
+#loc205 = loc("tmp58"(#loc53))
+#loc206 = loc("tmp58"(#loc54))
+#loc207 = loc("tmp63"(#loc55))
+#loc208 = loc("tmp63"(#loc56))
+#loc209 = loc("tmp63"(#loc57))
+#loc210 = loc("tmp63"(#loc58))
+#loc211 = loc("tmp66"(#loc59))
+#loc212 = loc("tmp66"(#loc60))
+#loc213 = loc("tmp96"(#loc61))
+#loc214 = loc("tmp96"(#loc62))
+#loc215 = loc("tmp96"(#loc63))
+#loc216 = loc("tmp96"(#loc64))
+#loc217 = loc("tmp96"(#loc65))
+#loc218 = loc("tmp96"(#loc66))
+#loc219 = loc("tmp102"(#loc67))
+#loc220 = loc("tmp102"(#loc68))
+#loc221 = loc("tmp102"(#loc69))
+#loc222 = loc("tmp16"(#loc70))
+#loc223 = loc("tmp17"(#loc71))
+#loc224 = loc("tmp17"(#loc72))
+#loc225 = loc("tmp17"(#loc73))
+#loc226 = loc("tmp17"(#loc74))
+#loc227 = loc("tmp17"(#loc75))
+#loc228 = loc("tmp17"(#loc76))
+#loc229 = loc("tmp17"(#loc77))
+#loc230 = loc("tmp17"(#loc78))
+#loc231 = loc("tmp20"(#loc79))
+#loc232 = loc("tmp22"(#loc80))
+#loc233 = loc("tmp23"(#loc81))
+#loc234 = loc("tmp24"(#loc82))
+#loc235 = loc("tmp25"(#loc83))
+#loc236 = loc("tmp25"(#loc84))
+#loc237 = loc("tmp25"(#loc85))
+#loc238 = loc("tmp27"(#loc86))
+#loc239 = loc("tmp29"(#loc87))
+#loc240 = loc("tmp31"(#loc88))
+#loc241 = loc("tmp32"(#loc89))
+#loc242 = loc("tmp35"(#loc90))
+#loc243 = loc("tmp35"(#loc91))
+#loc244 = loc("tmp35"(#loc92))
+#loc245 = loc("tmp35"(#loc93))
+#loc246 = loc("tmp35"(#loc94))
+#loc247 = loc("tmp35"(#loc95))
+#loc248 = loc("tmp42"(#loc96))
+#loc249 = loc("tmp43"(#loc97))
+#loc250 = loc("tmp43"(#loc98))
+#loc251 = loc("tmp43"(#loc99))
+#loc252 = loc("tmp45"(#loc100))
+#loc253 = loc("tmp48"(#loc101))
+#loc254 = loc("tmp49"(#loc102))
+#loc255 = loc("tmp57"(#loc103))
+#loc256 = loc("tmp60"(#loc104))
+#loc257 = loc("tmp64"(#loc105))
+#loc258 = loc("tmp67"(#loc106))
+#loc259 = loc("tmp68"(#loc107))
+#loc260 = loc("tmp70"(#loc108))
+#loc261 = loc("tmp70"(#loc109))
+#loc262 = loc("tmp70"(#loc110))
+#loc263 = loc("tmp70"(#loc111))
+#loc264 = loc("tmp70"(#loc112))
+#loc265 = loc("tmp70"(#loc113))
+#loc266 = loc("tmp72"(#loc114))
+#loc267 = loc("tmp73"(#loc115))
+#loc268 = loc("tmp74"(#loc116))
+#loc269 = loc("tmp75"(#loc117))
+#loc270 = loc("tmp76"(#loc118))
+#loc271 = loc("tmp76"(#loc119))
+#loc272 = loc("tmp76"(#loc120))
+#loc273 = loc("tmp78"(#loc121))
+#loc274 = loc("tmp80"(#loc122))
+#loc275 = loc("tmp82"(#loc123))
+#loc276 = loc("tmp83"(#loc124))
+#loc277 = loc("tmp83"(#loc125))
+#loc278 = loc("tmp83"(#loc126))
+#loc279 = loc("tmp83"(#loc127))
+#loc280 = loc("tmp83"(#loc128))
+#loc281 = loc("tmp83"(#loc129))
+#loc282 = loc("tmp88"(#loc130))
+#loc283 = loc("tmp89"(#loc131))
+#loc284 = loc("tmp89"(#loc132))
+#loc285 = loc("tmp89"(#loc133))
+#loc286 = loc("tmp91"(#loc134))
+#loc287 = loc("tmp94"(#loc135))
+#loc288 = loc("tmp95"(#loc136))
+#loc289 = loc("tmp101"(#loc137))
+#loc290 = loc("tmp104"(#loc138))
+#loc291 = loc("tmp107"(#loc139))
+#loc292 = loc("tmp109"(#loc140))
+#loc293 = loc("tmp110"(#loc141))
+#loc294 = loc("_tmp10"(#loc167))
+#loc295 = loc(callsite(#loc34 at #loc189))
+#loc297 = loc(callsite(#loc34 at #loc191))
+#loc299 = loc(callsite(#loc36 at #loc295))
+#loc300 = loc(callsite(#loc36 at #loc297))
diff --git a/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..f79f09dd17622a1967646779fa828a249f89f662
--- /dev/null
+++ b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json"}}
\ No newline at end of file
diff --git a/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..3a9918e0230ef35f33f532cb18b983cff0d23ee8
Binary files /dev/null and b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin differ
diff --git a/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..7025dac46dadd80b0c45f6c7c29d139ce2c87572
--- /dev/null
+++ b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
@@ -0,0 +1 @@
+{"hash": "e08019044dda6d34ba23cdc41eb493aa27e07b83eaccdd40f499225377049aa6", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 512, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0"}
\ No newline at end of file
diff --git a/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..9264a49f978f7b0ec4d99159f44c7c6c55fae03b
--- /dev/null
+++ b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir
@@ -0,0 +1,688 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 {
+  %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8
+  %13 = shl nuw i32 %12, 1, !dbg !9
+  %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %15 = and i32 %14, 32, !dbg !10
+  %.lobit = lshr exact i32 %15, 5, !dbg !10
+  %16 = or disjoint i32 %.lobit, %13, !dbg !11
+  %17 = and i32 %14, 31, !dbg !12
+  %18 = shl nuw nsw i32 %17, 1, !dbg !12
+  %19 = sdiv i32 %16, 32, !dbg !13
+  %20 = shl i32 %16, 7
+  %21 = shl i32 %19, 15
+  %22 = add i32 %21, %20
+  %23 = add i32 %22, 4096
+  %24 = zext nneg i32 %18 to i64, !dbg !14
+  %25 = or disjoint i32 %23, %18, !dbg !15
+  %26 = sext i32 %25 to i64, !dbg !16
+  %27 = getelementptr bfloat, ptr addrspace(1) %2, i64 %26, !dbg !16
+  %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17
+  %29 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %27, i64 %28, i1 true) #6, !dbg !17
+  %30 = bitcast i32 %29 to <2 x bfloat>, !dbg !17
+  %31 = extractelement <2 x bfloat> %30, i64 0, !dbg !17
+  %32 = extractelement <2 x bfloat> %30, i64 1, !dbg !17
+  %33 = fpext bfloat %31 to float, !dbg !18
+  %34 = fpext bfloat %32 to float, !dbg !18
+  %35 = or disjoint i32 %22, %18, !dbg !19
+  %36 = sext i32 %35 to i64, !dbg !20
+  %37 = getelementptr bfloat, ptr addrspace(1) %2, i64 %36, !dbg !20
+  %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !21
+  %39 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %37, i64 %38, i1 true) #6, !dbg !21
+  %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !21
+  %41 = extractelement <2 x bfloat> %40, i64 0, !dbg !21
+  %42 = extractelement <2 x bfloat> %40, i64 1, !dbg !21
+  %43 = fpext bfloat %41 to float, !dbg !22
+  %44 = fpext bfloat %42 to float, !dbg !22
+  %45 = fmul float %33, %33, !dbg !23
+  %46 = fmul float %34, %34, !dbg !23
+  %47 = fmul float %43, %43, !dbg !24
+  %48 = fmul float %44, %44, !dbg !24
+  %49 = or disjoint i32 %18, 64, !dbg !25
+  %50 = or disjoint i32 %23, %49, !dbg !15
+  %51 = sext i32 %50 to i64, !dbg !16
+  %52 = getelementptr bfloat, ptr addrspace(1) %2, i64 %51, !dbg !16
+  %53 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17
+  %54 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %52, i64 %53, i1 true) #6, !dbg !17
+  %55 = bitcast i32 %54 to <2 x bfloat>, !dbg !17
+  %56 = extractelement <2 x bfloat> %55, i64 0, !dbg !17
+  %57 = extractelement <2 x bfloat> %55, i64 1, !dbg !17
+  %58 = fpext bfloat %56 to float, !dbg !18
+  %59 = fpext bfloat %57 to float, !dbg !18
+  %60 = or disjoint i32 %22, %49, !dbg !19
+  %61 = sext i32 %60 to i64, !dbg !20
+  %62 = getelementptr bfloat, ptr addrspace(1) %2, i64 %61, !dbg !20
+  %63 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !21
+  %64 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %62, i64 %63, i1 true) #6, !dbg !21
+  %65 = bitcast i32 %64 to <2 x bfloat>, !dbg !21
+  %66 = extractelement <2 x bfloat> %65, i64 0, !dbg !21
+  %67 = extractelement <2 x bfloat> %65, i64 1, !dbg !21
+  %68 = fpext bfloat %66 to float, !dbg !22
+  %69 = fpext bfloat %67 to float, !dbg !22
+  %70 = fmul float %58, %58, !dbg !23
+  %71 = fmul float %59, %59, !dbg !23
+  %72 = fadd float %45, %70, !dbg !26
+  %73 = fadd float %46, %71, !dbg !26
+  %74 = fmul float %68, %68, !dbg !24
+  %75 = fmul float %69, %69, !dbg !24
+  %76 = fadd float %47, %74, !dbg !27
+  %77 = fadd float %48, %75, !dbg !27
+  %78 = and i32 %14, 63, !dbg !10
+  %.not = icmp eq i32 %15, 0, !dbg !10
+  %79 = and i32 %14, 1, !dbg !10
+  %.not2 = icmp eq i32 %79, 0, !dbg !10
+  %80 = or disjoint i32 %13, %79, !dbg !11
+  %81 = and i32 %14, 62, !dbg !12
+  %82 = lshr exact i32 %81, 1, !dbg !12
+  %83 = sdiv i32 %80, 32, !dbg !13
+  %84 = fadd float %72, %73, !dbg !28
+  %85 = bitcast float %84 to i32, !dbg !31
+  %86 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %85, i32 16, i32 31), !dbg !31
+  %87 = bitcast i32 %86 to float, !dbg !31
+  %88 = fadd float %84, %87, !dbg !28
+  %89 = bitcast float %88 to i32, !dbg !31
+  %90 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %89, i32 8, i32 31), !dbg !31
+  %91 = bitcast i32 %90 to float, !dbg !31
+  %92 = fadd float %88, %91, !dbg !28
+  %93 = bitcast float %92 to i32, !dbg !31
+  %94 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %93, i32 4, i32 31), !dbg !31
+  %95 = bitcast i32 %94 to float, !dbg !31
+  %96 = fadd float %92, %95, !dbg !28
+  %97 = bitcast float %96 to i32, !dbg !31
+  %98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %97, i32 2, i32 31), !dbg !31
+  %99 = bitcast i32 %98 to float, !dbg !31
+  %100 = fadd float %96, %99, !dbg !28
+  %101 = bitcast float %100 to i32, !dbg !31
+  %102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 1, i32 31), !dbg !31
+  %103 = bitcast i32 %102 to float, !dbg !31
+  %104 = fadd float %100, %103, !dbg !28
+  %105 = fadd float %76, %77, !dbg !34
+  %106 = bitcast float %105 to i32, !dbg !35
+  %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 16, i32 31), !dbg !35
+  %108 = bitcast i32 %107 to float, !dbg !35
+  %109 = fadd float %105, %108, !dbg !34
+  %110 = bitcast float %109 to i32, !dbg !35
+  %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 8, i32 31), !dbg !35
+  %112 = bitcast i32 %111 to float, !dbg !35
+  %113 = fadd float %109, %112, !dbg !34
+  %114 = bitcast float %113 to i32, !dbg !35
+  %115 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %114, i32 4, i32 31), !dbg !35
+  %116 = bitcast i32 %115 to float, !dbg !35
+  %117 = fadd float %113, %116, !dbg !34
+  %118 = bitcast float %117 to i32, !dbg !35
+  %119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %118, i32 2, i32 31), !dbg !35
+  %120 = bitcast i32 %119 to float, !dbg !35
+  %121 = fadd float %117, %120, !dbg !34
+  %122 = bitcast float %121 to i32, !dbg !35
+  %123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 1, i32 31), !dbg !35
+  %124 = bitcast i32 %123 to float, !dbg !35
+  %125 = fadd float %121, %124, !dbg !34
+  %126 = shl i32 %19, 7, !dbg !37
+  %127 = tail call float @llvm.nvvm.div.full(float %125, float 1.280000e+02), !dbg !38
+  %128 = fadd float %127, 0x3EB0C6F7A0000000, !dbg !39
+  %129 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40
+  %.not.i = icmp eq i32 %129, 0, !dbg !40
+  br i1 %.not.i, label %132, label %130, !dbg !40
+
+130:                                              ; preds = %11
+  %131 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %128), !dbg !40
+  br label %__nv_rsqrtf.exit, !dbg !40
+
+132:                                              ; preds = %11
+  %133 = tail call float @llvm.nvvm.rsqrt.approx.f(float %128), !dbg !40
+  br label %__nv_rsqrtf.exit, !dbg !40
+
+__nv_rsqrtf.exit:                                 ; preds = %130, %132
+  %.0.i = phi float [ %131, %130 ], [ %133, %132 ], !dbg !40
+  %134 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40
+  %.not.i4 = icmp eq i32 %134, 0, !dbg !40
+  br i1 %.not.i4, label %137, label %135, !dbg !40
+
+135:                                              ; preds = %__nv_rsqrtf.exit
+  %136 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %128), !dbg !40
+  br label %__nv_rsqrtf.exit6, !dbg !40
+
+137:                                              ; preds = %__nv_rsqrtf.exit
+  %138 = tail call float @llvm.nvvm.rsqrt.approx.f(float %128), !dbg !40
+  br label %__nv_rsqrtf.exit6, !dbg !40
+
+__nv_rsqrtf.exit6:                                ; preds = %135, %137
+  %.0.i5 = phi float [ %136, %135 ], [ %138, %137 ], !dbg !40
+  %139 = lshr exact i32 %15, 3, !dbg !41
+  %140 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %139, !dbg !41
+  store float %.0.i, ptr addrspace(3) %140, align 4, !dbg !41
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !41
+  %141 = shl nuw nsw i32 %79, 2, !dbg !41
+  %142 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %141, !dbg !41
+  %143 = load float, ptr addrspace(3) %142, align 4, !dbg !41
+  %144 = tail call float @llvm.nvvm.div.full(float %104, float 1.280000e+02), !dbg !42
+  %145 = fadd float %144, 0x3EB0C6F7A0000000, !dbg !43
+  %146 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
+  %.not.i7 = icmp eq i32 %146, 0, !dbg !44
+  br i1 %.not.i7, label %149, label %147, !dbg !44
+
+147:                                              ; preds = %__nv_rsqrtf.exit6
+  %148 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %145), !dbg !44
+  br label %__nv_rsqrtf.exit9, !dbg !44
+
+149:                                              ; preds = %__nv_rsqrtf.exit6
+  %150 = tail call float @llvm.nvvm.rsqrt.approx.f(float %145), !dbg !44
+  br label %__nv_rsqrtf.exit9, !dbg !44
+
+__nv_rsqrtf.exit9:                                ; preds = %147, %149
+  %.0.i8 = phi float [ %148, %147 ], [ %150, %149 ], !dbg !44
+  %151 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
+  %.not.i10 = icmp eq i32 %151, 0, !dbg !44
+  br i1 %.not.i10, label %154, label %152, !dbg !44
+
+152:                                              ; preds = %__nv_rsqrtf.exit9
+  %153 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %145), !dbg !44
+  br label %__nv_rsqrtf.exit12, !dbg !44
+
+154:                                              ; preds = %__nv_rsqrtf.exit9
+  %155 = tail call float @llvm.nvvm.rsqrt.approx.f(float %145), !dbg !44
+  br label %__nv_rsqrtf.exit12, !dbg !44
+
+__nv_rsqrtf.exit12:                               ; preds = %152, %154
+  %.0.i11 = phi float [ %153, %152 ], [ %155, %154 ], !dbg !44
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45
+  store float %.0.i8, ptr addrspace(3) %140, align 4, !dbg !45
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45
+  %156 = load float, ptr addrspace(3) %142, align 4, !dbg !45
+  %157 = shl i32 %16, 7, !dbg !46
+  %158 = and i32 %82, 1
+  %.masked = and i32 %82, 30
+  %159 = shl nuw nsw i32 %14, 3
+  %160 = and i32 %159, 120
+  %161 = and i32 %14, 16
+  %162 = lshr exact i32 %161, 2
+  %163 = select i1 %.not, i32 0, i32 192
+  %164 = xor i32 %163, %160
+  %165 = or disjoint i32 %164, %162
+  %166 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %165
+  %167 = xor i32 %165, 260
+  %168 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %167
+  %169 = shl nuw nsw i32 %14, 1
+  %170 = and i32 %169, 120
+  %171 = select i1 %.not2, i32 0, i32 192
+  %172 = and i32 %14, 2
+  %173 = icmp eq i32 %172, 0
+  %174 = select i1 %173, i32 0, i32 260
+  %175 = xor i32 %171, %170
+  %176 = or disjoint i32 %175, %174
+  %177 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %176
+  %178 = xor i32 %176, 4
+  %179 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %178
+  %180 = icmp eq i32 %158, 0
+  %181 = shl i32 %80, 7
+  %182 = shl i32 %83, 15
+  %183 = add i32 %182, %181
+  %184 = icmp ne i32 %158, 0
+  %185 = shl nuw nsw i32 %81, 1
+  %186 = xor i32 %171, %185
+  %187 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %186
+  %188 = shl nuw nsw i32 %17, 2
+  %189 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %188
+  %190 = xor i32 %188, 192
+  %191 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %190
+  %192 = add i32 %183, 4097
+  %193 = add i32 %183, 4096
+  %194 = shl nuw nsw i32 %78, 2
+  %195 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %194
+  %196 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %185
+  %197 = getelementptr inbounds nuw i8, ptr addrspace(3) %196, i32 128
+  %198 = and i32 %169, 60
+  %199 = lshr exact i32 %15, 4
+  %200 = or disjoint i32 %198, %199
+  %201 = or disjoint i32 %200, %171
+  %202 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %201
+  %203 = xor i32 %201, 64
+  %204 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %203
+  %205 = and i32 %159, 56
+  %206 = lshr i32 %14, 2
+  %207 = and i32 %206, 2
+  %208 = shl nuw nsw i32 %161, 2
+  %209 = or disjoint i32 %205, %208
+  %210 = xor i32 %209, %163
+  %211 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %207
+  %212 = getelementptr inbounds nuw i8, ptr addrspace(3) %211, i32 %210
+  %213 = getelementptr inbounds nuw i8, ptr addrspace(3) %212, i32 4
+  %214 = zext nneg i32 %.masked to i64, !dbg !47
+  %215 = zext nneg i32 %78 to i64, !dbg !47
+  %216 = sext i32 %126 to i64, !dbg !47
+  %217 = sext i32 %157 to i64, !dbg !47
+  br label %218, !dbg !47
+
+218:                                              ; preds = %__nv_rsqrtf.exit12, %218
+  %219 = phi i1 [ true, %__nv_rsqrtf.exit12 ], [ false, %218 ]
+  %indvars.iv = phi i64 [ 0, %__nv_rsqrtf.exit12 ], [ 64, %218 ]
+  %220 = or disjoint i64 %indvars.iv, %24, !dbg !48
+  %221 = or disjoint i64 %indvars.iv, %215, !dbg !48
+  %222 = or disjoint i64 %indvars.iv, %214, !dbg !49
+  %223 = or disjoint i64 %222, 32, !dbg !49
+  %224 = trunc nuw nsw i64 %220 to i32, !dbg !50
+  %225 = or disjoint i32 %22, %224, !dbg !50
+  %226 = sext i32 %225 to i64, !dbg !51
+  %227 = getelementptr bfloat, ptr addrspace(1) %2, i64 %226, !dbg !51
+  %228 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !52
+  %229 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %227, i64 %228, i1 true) #6, !dbg !52
+  %230 = bitcast i32 %229 to <2 x bfloat>, !dbg !52
+  %231 = extractelement <2 x bfloat> %230, i64 0, !dbg !52
+  %232 = extractelement <2 x bfloat> %230, i64 1, !dbg !52
+  %233 = fpext bfloat %231 to float, !dbg !53
+  %234 = fpext bfloat %232 to float, !dbg !53
+  %235 = getelementptr bfloat, ptr addrspace(1) %3, i64 %221, !dbg !54
+  %236 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !55
+  %237 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %235, i64 %236, i1 true) #6, !dbg !55
+  %238 = bitcast i16 %237 to bfloat, !dbg !55
+  %239 = fpext bfloat %238 to float, !dbg !56
+  %240 = or disjoint i64 %220, %216, !dbg !57
+  %241 = getelementptr float, ptr addrspace(1) %4, i64 %240, !dbg !58
+  %242 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !59
+  %243 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %241, i64 %242, i1 true) #6, !dbg !59
+  %244 = extractvalue { i32, i32 } %243, 0, !dbg !59
+  %245 = extractvalue { i32, i32 } %243, 1, !dbg !59
+  %246 = bitcast i32 %244 to float, !dbg !59
+  %247 = bitcast i32 %245 to float, !dbg !59
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !59
+  %248 = insertelement <1 x i32> poison, i32 %244, i64 0, !dbg !59
+  store <1 x i32> %248, ptr addrspace(3) %166, align 4, !dbg !59
+  %249 = insertelement <1 x i32> poison, i32 %245, i64 0, !dbg !59
+  store <1 x i32> %249, ptr addrspace(3) %168, align 4, !dbg !59
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !59
+  %250 = load float, ptr addrspace(3) %177, align 4, !dbg !59
+  %251 = load float, ptr addrspace(3) %179, align 4, !dbg !59
+  %252 = getelementptr float, ptr addrspace(1) %5, i64 %240, !dbg !60
+  %253 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !61
+  %254 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %252, i64 %253, i1 true) #6, !dbg !61
+  %255 = extractvalue { i32, i32 } %254, 0, !dbg !61
+  %256 = extractvalue { i32, i32 } %254, 1, !dbg !61
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61
+  %257 = insertelement <1 x i32> poison, i32 %255, i64 0, !dbg !61
+  store <1 x i32> %257, ptr addrspace(3) %166, align 4, !dbg !61
+  %258 = insertelement <1 x i32> poison, i32 %256, i64 0, !dbg !61
+  store <1 x i32> %258, ptr addrspace(3) %168, align 4, !dbg !61
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61
+  %259 = load float, ptr addrspace(3) %177, align 4, !dbg !61
+  %260 = load float, ptr addrspace(3) %179, align 4, !dbg !61
+  %261 = or disjoint i32 %23, %224, !dbg !62
+  %262 = sext i32 %261 to i64, !dbg !63
+  %263 = getelementptr bfloat, ptr addrspace(1) %2, i64 %262, !dbg !63
+  %264 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !64
+  %265 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %263, i64 %264, i1 true) #6, !dbg !64
+  %266 = bitcast i32 %265 to <2 x bfloat>, !dbg !64
+  %267 = extractelement <2 x bfloat> %266, i64 0, !dbg !64
+  %268 = extractelement <2 x bfloat> %266, i64 1, !dbg !64
+  %269 = fpext bfloat %267 to float, !dbg !65
+  %270 = fpext bfloat %268 to float, !dbg !65
+  %271 = getelementptr bfloat, ptr addrspace(1) %6, i64 %221, !dbg !66
+  %272 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !67
+  %273 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %271, i64 %272, i1 true) #6, !dbg !67
+  %274 = bitcast i16 %273 to bfloat, !dbg !67
+  %275 = fpext bfloat %274 to float, !dbg !68
+  %276 = or disjoint i64 %222, 1, !dbg !69
+  %277 = or disjoint i64 %222, 33, !dbg !69
+  %278 = trunc nuw nsw i64 %276 to i32, !dbg !70
+  %279 = or disjoint i32 %183, %278, !dbg !70
+  %280 = trunc nuw nsw i64 %277 to i32, !dbg !70
+  %281 = or disjoint i32 %183, %280, !dbg !70
+  %282 = sext i32 %279 to i64, !dbg !71
+  %283 = getelementptr bfloat, ptr addrspace(1) %2, i64 %282, !dbg !71
+  %284 = sext i32 %281 to i64, !dbg !71
+  %285 = getelementptr bfloat, ptr addrspace(1) %2, i64 %284, !dbg !71
+  %286 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72
+  %287 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %283, i64 %286, i1 %180) #6, !dbg !72
+  %288 = bitcast i16 %287 to bfloat, !dbg !72
+  %289 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72
+  %290 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %285, i64 %289, i1 %180) #6, !dbg !72
+  %291 = bitcast i16 %290 to bfloat, !dbg !72
+  %292 = fpext bfloat %288 to float, !dbg !73
+  %293 = fpext bfloat %291 to float, !dbg !73
+  %294 = fmul float %143, %292, !dbg !41
+  %295 = fmul float %143, %293, !dbg !41
+  %296 = getelementptr bfloat, ptr addrspace(1) %3, i64 %276, !dbg !74
+  %297 = getelementptr bfloat, ptr addrspace(1) %3, i64 %277, !dbg !74
+  %298 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75
+  %299 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %296, i64 %298, i1 %180) #6, !dbg !75
+  %300 = bitcast i16 %299 to bfloat, !dbg !75
+  %301 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75
+  %302 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %297, i64 %301, i1 %180) #6, !dbg !75
+  %303 = bitcast i16 %302 to bfloat, !dbg !75
+  %304 = fpext bfloat %300 to float, !dbg !76
+  %305 = fpext bfloat %303 to float, !dbg !76
+  %306 = fmul float %294, %304, !dbg !77
+  %307 = fmul float %295, %305, !dbg !77
+  %308 = fsub float 0.000000e+00, %306, !dbg !78
+  %309 = fsub float 0.000000e+00, %307, !dbg !78
+  %310 = trunc nuw nsw i64 %222 to i32, !dbg !79
+  %311 = or disjoint i32 %183, %310, !dbg !79
+  %312 = trunc nuw nsw i64 %223 to i32, !dbg !79
+  %313 = or disjoint i32 %183, %312, !dbg !79
+  %314 = sext i32 %311 to i64, !dbg !80
+  %315 = getelementptr bfloat, ptr addrspace(1) %2, i64 %314, !dbg !80
+  %316 = sext i32 %313 to i64, !dbg !80
+  %317 = getelementptr bfloat, ptr addrspace(1) %2, i64 %316, !dbg !80
+  %318 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81
+  %319 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %315, i64 %318, i1 %184) #6, !dbg !81
+  %320 = bitcast i16 %319 to bfloat, !dbg !81
+  %321 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81
+  %322 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %317, i64 %321, i1 %184) #6, !dbg !81
+  %323 = bitcast i16 %322 to bfloat, !dbg !81
+  %324 = fpext bfloat %320 to float, !dbg !82
+  %325 = fpext bfloat %323 to float, !dbg !82
+  %326 = fmul float %143, %324, !dbg !83
+  %327 = fmul float %143, %325, !dbg !83
+  %328 = getelementptr bfloat, ptr addrspace(1) %3, i64 %222, !dbg !84
+  %329 = getelementptr bfloat, ptr addrspace(1) %3, i64 %223, !dbg !84
+  %330 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85
+  %331 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %328, i64 %330, i1 %184) #6, !dbg !85
+  %332 = bitcast i16 %331 to bfloat, !dbg !85
+  %333 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85
+  %334 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %329, i64 %333, i1 %184) #6, !dbg !85
+  %335 = bitcast i16 %334 to bfloat, !dbg !85
+  %336 = fpext bfloat %332 to float, !dbg !86
+  %337 = fpext bfloat %335 to float, !dbg !86
+  %338 = fmul float %326, %336, !dbg !87
+  %339 = fmul float %327, %337, !dbg !87
+  %340 = select i1 %180, float %308, float %338, !dbg !88
+  %341 = select i1 %180, float %309, float %339, !dbg !88
+  %342 = fmul float %.0.i5, %233, !dbg !89
+  %343 = fmul float %.0.i5, %234, !dbg !89
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !90
+  store float %239, ptr addrspace(3) %187, align 4, !dbg !90
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !90
+  %344 = load float, ptr addrspace(3) %189, align 4, !dbg !90
+  %345 = load float, ptr addrspace(3) %191, align 4, !dbg !90
+  %346 = fmul float %342, %344, !dbg !90
+  %347 = fmul float %343, %345, !dbg !90
+  %348 = fmul float %346, %246, !dbg !91
+  %349 = fmul float %347, %247, !dbg !91
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !91
+  store float %348, ptr addrspace(3) %166, align 4, !dbg !91
+  store float %349, ptr addrspace(3) %168, align 4, !dbg !91
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !91
+  %350 = load float, ptr addrspace(3) %177, align 4, !dbg !91
+  %351 = load float, ptr addrspace(3) %179, align 4, !dbg !91
+  %352 = fmul float %259, %340, !dbg !92
+  %353 = fmul float %260, %341, !dbg !92
+  %354 = fadd float %352, %350, !dbg !93
+  %355 = fadd float %353, %351, !dbg !93
+  %356 = or disjoint i32 %192, %310, !dbg !94
+  %357 = or disjoint i32 %192, %312, !dbg !94
+  %358 = sext i32 %356 to i64, !dbg !95
+  %359 = getelementptr bfloat, ptr addrspace(1) %2, i64 %358, !dbg !95
+  %360 = sext i32 %357 to i64, !dbg !95
+  %361 = getelementptr bfloat, ptr addrspace(1) %2, i64 %360, !dbg !95
+  %362 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96
+  %363 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %359, i64 %362, i1 %180) #6, !dbg !96
+  %364 = bitcast i16 %363 to bfloat, !dbg !96
+  %365 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96
+  %366 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %361, i64 %365, i1 %180) #6, !dbg !96
+  %367 = bitcast i16 %366 to bfloat, !dbg !96
+  %368 = fpext bfloat %364 to float, !dbg !97
+  %369 = fpext bfloat %367 to float, !dbg !97
+  %370 = fmul float %156, %368, !dbg !45
+  %371 = fmul float %156, %369, !dbg !45
+  %372 = getelementptr bfloat, ptr addrspace(1) %6, i64 %276, !dbg !98
+  %373 = getelementptr bfloat, ptr addrspace(1) %6, i64 %277, !dbg !98
+  %374 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99
+  %375 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %372, i64 %374, i1 %180) #6, !dbg !99
+  %376 = bitcast i16 %375 to bfloat, !dbg !99
+  %377 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99
+  %378 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %373, i64 %377, i1 %180) #6, !dbg !99
+  %379 = bitcast i16 %378 to bfloat, !dbg !99
+  %380 = fpext bfloat %376 to float, !dbg !100
+  %381 = fpext bfloat %379 to float, !dbg !100
+  %382 = fmul float %370, %380, !dbg !101
+  %383 = fmul float %371, %381, !dbg !101
+  %384 = fsub float 0.000000e+00, %382, !dbg !102
+  %385 = fsub float 0.000000e+00, %383, !dbg !102
+  %386 = or disjoint i32 %193, %310, !dbg !103
+  %387 = or disjoint i32 %193, %312, !dbg !103
+  %388 = sext i32 %386 to i64, !dbg !104
+  %389 = getelementptr bfloat, ptr addrspace(1) %2, i64 %388, !dbg !104
+  %390 = sext i32 %387 to i64, !dbg !104
+  %391 = getelementptr bfloat, ptr addrspace(1) %2, i64 %390, !dbg !104
+  %392 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105
+  %393 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %389, i64 %392, i1 %184) #6, !dbg !105
+  %394 = bitcast i16 %393 to bfloat, !dbg !105
+  %395 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105
+  %396 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %391, i64 %395, i1 %184) #6, !dbg !105
+  %397 = bitcast i16 %396 to bfloat, !dbg !105
+  %398 = fpext bfloat %394 to float, !dbg !106
+  %399 = fpext bfloat %397 to float, !dbg !106
+  %400 = fmul float %156, %398, !dbg !107
+  %401 = fmul float %156, %399, !dbg !107
+  %402 = getelementptr bfloat, ptr addrspace(1) %6, i64 %222, !dbg !108
+  %403 = getelementptr bfloat, ptr addrspace(1) %6, i64 %223, !dbg !108
+  %404 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109
+  %405 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %402, i64 %404, i1 %184) #6, !dbg !109
+  %406 = bitcast i16 %405 to bfloat, !dbg !109
+  %407 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109
+  %408 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %403, i64 %407, i1 %184) #6, !dbg !109
+  %409 = bitcast i16 %408 to bfloat, !dbg !109
+  %410 = fpext bfloat %406 to float, !dbg !110
+  %411 = fpext bfloat %409 to float, !dbg !110
+  %412 = fmul float %400, %410, !dbg !111
+  %413 = fmul float %401, %411, !dbg !111
+  %414 = select i1 %180, float %384, float %412, !dbg !88
+  %415 = select i1 %180, float %385, float %413, !dbg !88
+  %416 = fmul float %.0.i11, %269, !dbg !112
+  %417 = fmul float %.0.i11, %270, !dbg !112
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !112
+  store float %416, ptr addrspace(3) %166, align 4, !dbg !112
+  store float %417, ptr addrspace(3) %168, align 4, !dbg !112
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !112
+  %418 = load float, ptr addrspace(3) %177, align 4, !dbg !112
+  %419 = load float, ptr addrspace(3) %179, align 4, !dbg !112
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !113
+  store float %275, ptr addrspace(3) %195, align 4, !dbg !113
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !113
+  %420 = load float, ptr addrspace(3) %196, align 4, !dbg !113
+  %421 = load float, ptr addrspace(3) %197, align 4, !dbg !113
+  %422 = fmul float %418, %420, !dbg !114
+  %423 = fmul float %419, %421, !dbg !114
+  %424 = fmul float %250, %422, !dbg !113
+  %425 = fmul float %251, %423, !dbg !113
+  %426 = fmul float %259, %414, !dbg !115
+  %427 = fmul float %260, %415, !dbg !115
+  %428 = fadd float %426, %424, !dbg !116
+  %429 = fadd float %427, %425, !dbg !116
+  %430 = or disjoint i64 %220, %217, !dbg !117
+  %431 = getelementptr bfloat, ptr addrspace(1) %0, i64 %430, !dbg !118
+  %432 = fptrunc float %354 to bfloat, !dbg !119
+  %433 = fptrunc float %355 to bfloat, !dbg !119
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119
+  store bfloat %432, ptr addrspace(3) %202, align 2, !dbg !119
+  store bfloat %433, ptr addrspace(3) %204, align 2, !dbg !119
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119
+  %434 = load bfloat, ptr addrspace(3) %212, align 2, !dbg !119
+  %435 = load bfloat, ptr addrspace(3) %213, align 2, !dbg !119
+  %436 = insertelement <2 x bfloat> poison, bfloat %434, i64 0, !dbg !119
+  %437 = insertelement <2 x bfloat> %436, bfloat %435, i64 1, !dbg !119
+  %438 = bitcast <2 x bfloat> %437 to i32, !dbg !119
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %438, ptr addrspace(1) %431, i1 true) #6, !dbg !119
+  %439 = getelementptr bfloat, ptr addrspace(1) %1, i64 %430, !dbg !120
+  %440 = fptrunc float %428 to bfloat, !dbg !121
+  %441 = fptrunc float %429 to bfloat, !dbg !121
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !121
+  store bfloat %440, ptr addrspace(3) %202, align 2, !dbg !121
+  store bfloat %441, ptr addrspace(3) %204, align 2, !dbg !121
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !121
+  %442 = load bfloat, ptr addrspace(3) %212, align 2, !dbg !121
+  %443 = load bfloat, ptr addrspace(3) %213, align 2, !dbg !121
+  %444 = insertelement <2 x bfloat> poison, bfloat %442, i64 0, !dbg !121
+  %445 = insertelement <2 x bfloat> %444, bfloat %443, i64 1, !dbg !121
+  %446 = bitcast <2 x bfloat> %445 to i32, !dbg !121
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %446, ptr addrspace(1) %439, i1 true) #6, !dbg !121
+  br i1 %219, label %218, label %447, !dbg !47
+
+447:                                              ; preds = %218
+  ret void, !dbg !122
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #3
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #3
+
+attributes #0 = { nounwind "nvvm.reqntid"="64" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #4 = { convergent nocallback nounwind }
+attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!5 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", linkageName: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 23, column: 28, scope: !5)
+!9 = !DILocation(line: 23, column: 33, scope: !5)
+!10 = !DILocation(line: 24, column: 44, scope: !5)
+!11 = !DILocation(line: 24, column: 23, scope: !5)
+!12 = !DILocation(line: 26, column: 37, scope: !5)
+!13 = !DILocation(line: 29, column: 19, scope: !5)
+!14 = !DILocation(line: 33, column: 43, scope: !5)
+!15 = !DILocation(line: 39, column: 57, scope: !5)
+!16 = !DILocation(line: 39, column: 34, scope: !5)
+!17 = !DILocation(line: 39, column: 68, scope: !5)
+!18 = !DILocation(line: 39, column: 121, scope: !5)
+!19 = !DILocation(line: 40, column: 50, scope: !5)
+!20 = !DILocation(line: 40, column: 34, scope: !5)
+!21 = !DILocation(line: 40, column: 61, scope: !5)
+!22 = !DILocation(line: 40, column: 114, scope: !5)
+!23 = !DILocation(line: 42, column: 22, scope: !5)
+!24 = !DILocation(line: 47, column: 22, scope: !5)
+!25 = !DILocation(line: 34, column: 31, scope: !5)
+!26 = !DILocation(line: 44, column: 23, scope: !5)
+!27 = !DILocation(line: 49, column: 25, scope: !5)
+!28 = !DILocation(line: 263, column: 15, scope: !29, inlinedAt: !31)
+!29 = distinct !DILexicalBlockFile(scope: !5, file: !30, discriminator: 0)
+!30 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!31 = !DILocation(line: 293, column: 36, scope: !29, inlinedAt: !32)
+!32 = !DILocation(line: 51, column: 25, scope: !33)
+!33 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0)
+!34 = !DILocation(line: 263, column: 15, scope: !29, inlinedAt: !35)
+!35 = !DILocation(line: 293, column: 36, scope: !29, inlinedAt: !36)
+!36 = !DILocation(line: 52, column: 27, scope: !33)
+!37 = !DILocation(line: 63, column: 46, scope: !5)
+!38 = !DILocation(line: 75, column: 25, scope: !5)
+!39 = !DILocation(line: 77, column: 24, scope: !5)
+!40 = !DILocation(line: 78, column: 32, scope: !5)
+!41 = !DILocation(line: 79, column: 24, scope: !5)
+!42 = !DILocation(line: 123, column: 24, scope: !5)
+!43 = !DILocation(line: 124, column: 24, scope: !5)
+!44 = !DILocation(line: 125, column: 32, scope: !5)
+!45 = !DILocation(line: 126, column: 24, scope: !5)
+!46 = !DILocation(line: 161, column: 43, scope: !5)
+!47 = !DILocation(line: 53, column: 43, scope: !5)
+!48 = !DILocation(line: 54, column: 31, scope: !5)
+!49 = !DILocation(line: 72, column: 41, scope: !5)
+!50 = !DILocation(line: 61, column: 51, scope: !5)
+!51 = !DILocation(line: 61, column: 35, scope: !5)
+!52 = !DILocation(line: 61, column: 62, scope: !5)
+!53 = !DILocation(line: 61, column: 115, scope: !5)
+!54 = !DILocation(line: 62, column: 35, scope: !5)
+!55 = !DILocation(line: 62, column: 42, scope: !5)
+!56 = !DILocation(line: 62, column: 95, scope: !5)
+!57 = !DILocation(line: 63, column: 42, scope: !5)
+!58 = !DILocation(line: 63, column: 35, scope: !5)
+!59 = !DILocation(line: 63, column: 51, scope: !5)
+!60 = !DILocation(line: 64, column: 35, scope: !5)
+!61 = !DILocation(line: 64, column: 51, scope: !5)
+!62 = !DILocation(line: 65, column: 58, scope: !5)
+!63 = !DILocation(line: 65, column: 35, scope: !5)
+!64 = !DILocation(line: 65, column: 69, scope: !5)
+!65 = !DILocation(line: 65, column: 123, scope: !5)
+!66 = !DILocation(line: 66, column: 36, scope: !5)
+!67 = !DILocation(line: 66, column: 43, scope: !5)
+!68 = !DILocation(line: 66, column: 96, scope: !5)
+!69 = !DILocation(line: 72, column: 39, scope: !5)
+!70 = !DILocation(line: 72, column: 57, scope: !5)
+!71 = !DILocation(line: 72, column: 35, scope: !5)
+!72 = !DILocation(line: 72, column: 68, scope: !5)
+!73 = !DILocation(line: 72, column: 129, scope: !5)
+!74 = !DILocation(line: 80, column: 35, scope: !5)
+!75 = !DILocation(line: 80, column: 85, scope: !5)
+!76 = !DILocation(line: 80, column: 146, scope: !5)
+!77 = !DILocation(line: 82, column: 24, scope: !5)
+!78 = !DILocation(line: 84, column: 17, scope: !5)
+!79 = !DILocation(line: 90, column: 53, scope: !5)
+!80 = !DILocation(line: 90, column: 35, scope: !5)
+!81 = !DILocation(line: 90, column: 64, scope: !5)
+!82 = !DILocation(line: 90, column: 125, scope: !5)
+!83 = !DILocation(line: 97, column: 24, scope: !5)
+!84 = !DILocation(line: 98, column: 35, scope: !5)
+!85 = !DILocation(line: 98, column: 81, scope: !5)
+!86 = !DILocation(line: 98, column: 142, scope: !5)
+!87 = !DILocation(line: 100, column: 24, scope: !5)
+!88 = !DILocation(line: 0, scope: !5)
+!89 = !DILocation(line: 111, column: 24, scope: !5)
+!90 = !DILocation(line: 113, column: 24, scope: !5)
+!91 = !DILocation(line: 116, column: 24, scope: !5)
+!92 = !DILocation(line: 118, column: 24, scope: !5)
+!93 = !DILocation(line: 119, column: 24, scope: !5)
+!94 = !DILocation(line: 121, column: 60, scope: !5)
+!95 = !DILocation(line: 121, column: 35, scope: !5)
+!96 = !DILocation(line: 121, column: 71, scope: !5)
+!97 = !DILocation(line: 121, column: 132, scope: !5)
+!98 = !DILocation(line: 127, column: 35, scope: !5)
+!99 = !DILocation(line: 127, column: 85, scope: !5)
+!100 = !DILocation(line: 127, column: 146, scope: !5)
+!101 = !DILocation(line: 129, column: 24, scope: !5)
+!102 = !DILocation(line: 131, column: 17, scope: !5)
+!103 = !DILocation(line: 134, column: 60, scope: !5)
+!104 = !DILocation(line: 134, column: 35, scope: !5)
+!105 = !DILocation(line: 134, column: 71, scope: !5)
+!106 = !DILocation(line: 134, column: 132, scope: !5)
+!107 = !DILocation(line: 139, column: 24, scope: !5)
+!108 = !DILocation(line: 140, column: 35, scope: !5)
+!109 = !DILocation(line: 140, column: 81, scope: !5)
+!110 = !DILocation(line: 140, column: 142, scope: !5)
+!111 = !DILocation(line: 142, column: 24, scope: !5)
+!112 = !DILocation(line: 151, column: 25, scope: !5)
+!113 = !DILocation(line: 156, column: 26, scope: !5)
+!114 = !DILocation(line: 153, column: 26, scope: !5)
+!115 = !DILocation(line: 158, column: 26, scope: !5)
+!116 = !DILocation(line: 159, column: 26, scope: !5)
+!117 = !DILocation(line: 161, column: 39, scope: !5)
+!118 = !DILocation(line: 161, column: 32, scope: !5)
+!119 = !DILocation(line: 161, column: 55, scope: !5)
+!120 = !DILocation(line: 162, column: 32, scope: !5)
+!121 = !DILocation(line: 162, column: 56, scope: !5)
+!122 = !DILocation(line: 53, column: 4, scope: !5)
diff --git a/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..0d1083c2cdb52c38c0c6488593c7c8ea82c98b59
--- /dev/null
+++ b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx
@@ -0,0 +1,1211 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 // -- Begin function triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0
+.extern .shared .align 16 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
+                                        // @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0
+.visible .entry triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6,
+	.param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_7,
+	.param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_8,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_9,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_10
+)
+.reqntid 64
+{
+	.reg .pred 	%p<6>;
+	.reg .b16 	%rs<40>;
+	.reg .b32 	%r<227>;
+	.reg .b64 	%rd<98>;
+	.loc	1 18 0                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0
+
+// %bb.0:                               // %__nv_rsqrtf.exit
+	ld.param.b64 	%rd12, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6];
+	ld.param.b64 	%rd11, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5];
+	ld.param.b64 	%rd10, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4];
+	ld.param.b64 	%rd9, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3];
+	ld.param.b64 	%rd8, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2];
+	ld.param.b64 	%rd7, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1];
+	ld.param.b64 	%rd6, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0];
+$L__tmp0:
+	.loc	1 23 28                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:28
+	mov.u32 	%r28, %ctaid.x;
+	.loc	1 23 33                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:33
+	shl.b32 	%r29, %r28, 1;
+	.loc	1 24 44                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44
+	mov.u32 	%r30, %tid.x;
+	bfe.s32 	%r31, %r30, 5, 1;
+	and.b32 	%r32, %r30, 32;
+	bfe.u32 	%r33, %r30, 5, 1;
+	.loc	1 24 23                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23
+	or.b32 	%r34, %r33, %r29;
+	.loc	1 26 37                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37
+	and.b32 	%r35, %r30, 31;
+	shl.b32 	%r36, %r35, 1;
+	.loc	1 29 19                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19
+	bfe.s32 	%r37, %r28, 30, 1;
+	shr.u32 	%r38, %r37, 27;
+	add.s32 	%r39, %r34, %r38;
+	shr.s32 	%r40, %r39, 5;
+	shl.b32 	%r41, %r34, 7;
+	shl.b32 	%r42, %r40, 15;
+	add.s32 	%r1, %r42, %r41;
+	add.s32 	%r2, %r1, 4096;
+	.loc	1 33 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:33:43
+	cvt.u64.u32 	%rd1, %r36;
+	.loc	1 39 57                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:57
+	or.b32 	%r43, %r2, %r36;
+	.loc	1 39 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34
+	mad.wide.s32 	%rd13, %r43, 2, %rd8;
+	.loc	1 39 68                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68
+	// begin inline asm
+	mov.u64 %rd14, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd14, 1.0;
+	// end inline asm
+	mov.b32 	%r24, 0;
+	mov.pred 	%p2, -1;
+	// begin inline asm
+	mov.u32 %r23, %r24;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r23 }, [ %rd13 + 0 ], %rd14;
+	// end inline asm
+	mov.b32 	{%rs1, %rs2}, %r23;
+	.loc	1 39 121                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:121
+	cvt.f32.bf16 	%r44, %rs1;
+	cvt.f32.bf16 	%r45, %rs2;
+	.loc	1 40 50                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:50
+	or.b32 	%r46, %r1, %r36;
+	.loc	1 40 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34
+	mad.wide.s32 	%rd15, %r46, 2, %rd8;
+	.loc	1 40 61                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61
+	// begin inline asm
+	mov.u64 %rd16, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd16, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r25, %r24;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r25 }, [ %rd15 + 0 ], %rd16;
+	// end inline asm
+	mov.b32 	{%rs3, %rs4}, %r25;
+	.loc	1 40 114                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114
+	cvt.f32.bf16 	%r47, %rs3;
+	cvt.f32.bf16 	%r48, %rs4;
+	.loc	1 39 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34
+	cvt.s64.s32 	%rd21, %r2;
+	or.b64 	%rd22, %rd21, %rd1;
+	shl.b64 	%rd23, %rd22, 1;
+	add.s64 	%rd24, %rd8, %rd23;
+	add.s64 	%rd17, %rd24, 128;
+	.loc	1 39 68                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68
+	// begin inline asm
+	mov.u64 %rd18, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd18, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r26, %r24;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r26 }, [ %rd17 + 0 ], %rd18;
+	// end inline asm
+	mov.b32 	{%rs5, %rs6}, %r26;
+	.loc	1 39 121                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:121
+	cvt.f32.bf16 	%r49, %rs5;
+	cvt.f32.bf16 	%r50, %rs6;
+	.loc	1 40 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34
+	cvt.s64.s32 	%rd25, %r1;
+	or.b64 	%rd26, %rd25, %rd1;
+	shl.b64 	%rd27, %rd26, 1;
+	add.s64 	%rd28, %rd8, %rd27;
+	add.s64 	%rd19, %rd28, 128;
+	.loc	1 40 61                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61
+	// begin inline asm
+	mov.u64 %rd20, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd20, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r27, %r24;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r27 }, [ %rd19 + 0 ], %rd20;
+	// end inline asm
+	mov.b32 	{%rs7, %rs8}, %r27;
+	.loc	1 40 114                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114
+	cvt.f32.bf16 	%r51, %rs7;
+	cvt.f32.bf16 	%r52, %rs8;
+	.loc	1 42 22                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:42:22
+	mul.f32 	%r53, %r49, %r49;
+	mul.f32 	%r54, %r50, %r50;
+	.loc	1 44 23                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:44:23
+	fma.rn.f32 	%r55, %r44, %r44, %r53;
+	fma.rn.f32 	%r56, %r45, %r45, %r54;
+	.loc	1 47 22                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:47:22
+	mul.f32 	%r57, %r51, %r51;
+	mul.f32 	%r58, %r52, %r52;
+	.loc	1 49 25                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:49:25
+	fma.rn.f32 	%r59, %r47, %r47, %r57;
+	fma.rn.f32 	%r60, %r48, %r48, %r58;
+	.loc	1 24 44                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44
+	and.b32 	%r61, %r30, 63;
+	and.b32 	%r62, %r30, 1;
+	neg.s32 	%r63, %r62;
+	.loc	1 24 23                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23
+	or.b32 	%r64, %r29, %r62;
+	.loc	1 26 37                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37
+	and.b32 	%r65, %r30, 62;
+	bfe.u32 	%r66, %r30, 1, 5;
+	.loc	1 29 19                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19
+	add.s32 	%r67, %r64, %r38;
+$L__tmp1:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r68, %r55, %r56;
+$L__tmp2:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r69, %r68, 16, 31, -1;
+$L__tmp3:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r70, %r68, %r69;
+$L__tmp4:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r71, %r70, 8, 31, -1;
+$L__tmp5:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r72, %r70, %r71;
+$L__tmp6:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r73, %r72, 4, 31, -1;
+$L__tmp7:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r74, %r72, %r73;
+$L__tmp8:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r75, %r74, 2, 31, -1;
+$L__tmp9:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r76, %r74, %r75;
+$L__tmp10:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r77, %r76, 1, 31, -1;
+$L__tmp11:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r78, %r76, %r77;
+$L__tmp12:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r79, %r59, %r60;
+$L__tmp13:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r80, %r79, 16, 31, -1;
+$L__tmp14:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r81, %r79, %r80;
+$L__tmp15:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r82, %r81, 8, 31, -1;
+$L__tmp16:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r83, %r81, %r82;
+$L__tmp17:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r84, %r83, 4, 31, -1;
+$L__tmp18:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r85, %r83, %r84;
+$L__tmp19:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r86, %r85, 2, 31, -1;
+$L__tmp20:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r87, %r85, %r86;
+$L__tmp21:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r88, %r87, 1, 31, -1;
+$L__tmp22:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r89, %r87, %r88;
+$L__tmp23:
+	.loc	1 63 46                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:46
+	shl.b32 	%r90, %r40, 7;
+	mov.b32 	%r91, 0f43000000;
+	.loc	1 75 25                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:75:25
+	div.full.f32 	%r92, %r89, %r91;
+	.loc	1 77 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:77:24
+	add.f32 	%r93, %r92, 0f358637BD;
+	.loc	1 78 32                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:78:32
+	rsqrt.approx.ftz.f32 	%r3, %r93;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	shr.u32 	%r94, %r32, 3;
+	mov.b32 	%r95, global_smem;
+	add.s32 	%r96, %r95, %r94;
+	st.shared.b32 	[%r96], %r3;
+	bar.sync 	0;
+	shl.b32 	%r97, %r62, 2;
+	add.s32 	%r98, %r95, %r97;
+	ld.shared.b32 	%r4, [%r98];
+	.loc	1 123 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:123:24
+	div.full.f32 	%r99, %r78, %r91;
+	.loc	1 124 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:124:24
+	add.f32 	%r100, %r99, 0f358637BD;
+	.loc	1 125 32                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:125:32
+	rsqrt.approx.ftz.f32 	%r5, %r100;
+	.loc	1 126 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24
+	bar.sync 	0;
+	st.shared.b32 	[%r96], %r5;
+	bar.sync 	0;
+	ld.shared.b32 	%r6, [%r98];
+	bfe.u32 	%r7, %r65, 1, 1;
+	and.b32 	%r101, %r66, 30;
+	shl.b32 	%r102, %r30, 3;
+	and.b32 	%r103, %r102, 120;
+	and.b32 	%r104, %r30, 16;
+	shr.u32 	%r105, %r104, 2;
+	and.b32 	%r106, %r31, 192;
+	xor.b32 	%r107, %r106, %r103;
+	or.b32 	%r108, %r107, %r105;
+	add.s32 	%r8, %r95, %r108;
+	xor.b32 	%r109, %r108, 4;
+	add.s32 	%r9, %r95, %r109;
+	shl.b32 	%r110, %r30, 1;
+	and.b32 	%r111, %r110, 120;
+	and.b32 	%r112, %r63, 192;
+	bfe.s32 	%r113, %r30, 1, 1;
+	and.b32 	%r114, %r113, 260;
+	xor.b32 	%r115, %r112, %r111;
+	or.b32 	%r116, %r115, %r114;
+	add.s32 	%r10, %r95, %r116;
+	xor.b32 	%r117, %r116, 4;
+	add.s32 	%r11, %r95, %r117;
+	shl.b32 	%r118, %r64, 7;
+	shl.b32 	%r119, %r67, 10;
+	and.b32 	%r120, %r119, -32768;
+	add.s32 	%r12, %r120, %r118;
+	shl.b32 	%r121, %r65, 1;
+	xor.b32 	%r122, %r112, %r121;
+	add.s32 	%r13, %r95, %r122;
+	shl.b32 	%r123, %r35, 2;
+	add.s32 	%r14, %r95, %r123;
+	xor.b32 	%r124, %r123, 64;
+	add.s32 	%r15, %r95, %r124;
+	add.s32 	%r16, %r12, 4097;
+	add.s32 	%r17, %r12, 4096;
+	shl.b32 	%r125, %r61, 2;
+	add.s32 	%r18, %r95, %r125;
+	add.s32 	%r19, %r95, %r121;
+	and.b32 	%r126, %r110, 60;
+	shr.u32 	%r127, %r32, 4;
+	or.b32 	%r128, %r126, %r127;
+	or.b32 	%r129, %r128, %r112;
+	add.s32 	%r20, %r95, %r129;
+	xor.b32 	%r130, %r129, 64;
+	add.s32 	%r21, %r95, %r130;
+	and.b32 	%r131, %r102, 56;
+	shr.u32 	%r132, %r30, 2;
+	and.b32 	%r133, %r132, 2;
+	shl.b32 	%r134, %r104, 2;
+	or.b32 	%r135, %r131, %r134;
+	xor.b32 	%r136, %r135, %r106;
+	add.s32 	%r137, %r95, %r133;
+	add.s32 	%r22, %r137, %r136;
+	.loc	1 53 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43
+	cvt.u64.u32 	%rd2, %r101;
+	cvt.u64.u32 	%rd3, %r61;
+	cvt.s64.s32 	%rd4, %r90;
+	cvt.s64.s32 	%rd5, %r41;
+	mov.b64 	%rd97, 0;
+	mov.pred 	%p5, %p2;
+$L__BB0_1:                              // =>This Inner Loop Header: Depth=1
+	.loc	1 0 43                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0:43
+	mov.pred 	%p1, %p5;
+	setp.ne.b32 	%p4, %r7, 0;
+	setp.eq.b32 	%p3, %r7, 0;
+	.loc	1 54 31                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:54:31
+	or.b64 	%rd75, %rd97, %rd1;
+	or.b64 	%rd76, %rd97, %rd3;
+	.loc	1 72 41                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:41
+	or.b64 	%rd77, %rd97, %rd2;
+	.loc	1 61 51                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:51
+	cvt.u32.u64 	%r146, %rd75;
+	or.b32 	%r147, %r1, %r146;
+	.loc	1 61 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:35
+	mad.wide.s32 	%rd30, %r147, 2, %rd8;
+	.loc	1 61 62                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:62
+	// begin inline asm
+	mov.u64 %rd29, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd29, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r138, %r24;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r138 }, [ %rd30 + 0 ], %rd29;
+	// end inline asm
+	mov.b32 	{%rs28, %rs29}, %r138;
+	.loc	1 61 115                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:115
+	cvt.f32.bf16 	%r148, %rs28;
+	cvt.f32.bf16 	%r149, %rs29;
+	.loc	1 62 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:35
+	shl.b64 	%rd78, %rd76, 1;
+	add.s64 	%rd32, %rd9, %rd78;
+	.loc	1 62 42                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:42
+	// begin inline asm
+	mov.u64 %rd31, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd31, 1.0;
+	// end inline asm
+	mov.b16 	%rs10, 0;
+	// begin inline asm
+	mov.u16 %rs9, %rs10;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd32 + 0 ], %rd31;
+	// end inline asm
+	.loc	1 62 95                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:95
+	cvt.f32.bf16 	%r150, %rs9;
+	.loc	1 63 42                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:42
+	or.b64 	%rd79, %rd75, %rd4;
+	.loc	1 63 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:35
+	shl.b64 	%rd80, %rd79, 2;
+	add.s64 	%rd34, %rd10, %rd80;
+	.loc	1 63 51                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:51
+	// begin inline asm
+	mov.u64 %rd33, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd33, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r139, %r24;
+	mov.u32 %r140, %r24;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r139, %r140 }, [ %rd34 + 0 ], %rd33;
+	// end inline asm
+	bar.sync 	0;
+	st.shared.b32 	[%r8], %r139;
+	st.shared.b32 	[%r9+256], %r140;
+	bar.sync 	0;
+	ld.shared.b32 	%r151, [%r10];
+	ld.shared.b32 	%r152, [%r11];
+	.loc	1 64 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:35
+	add.s64 	%rd36, %rd11, %rd80;
+	.loc	1 64 51                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:51
+	// begin inline asm
+	mov.u64 %rd35, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd35, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r141, %r24;
+	mov.u32 %r142, %r24;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r141, %r142 }, [ %rd36 + 0 ], %rd35;
+	// end inline asm
+	bar.sync 	0;
+	st.shared.b32 	[%r8], %r141;
+	st.shared.b32 	[%r9+256], %r142;
+	bar.sync 	0;
+	ld.shared.b32 	%r153, [%r10];
+	ld.shared.b32 	%r154, [%r11];
+	.loc	1 65 58                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:58
+	or.b32 	%r155, %r2, %r146;
+	.loc	1 65 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:35
+	mad.wide.s32 	%rd38, %r155, 2, %rd8;
+	.loc	1 65 69                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69
+	// begin inline asm
+	mov.u64 %rd37, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd37, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r143, %r24;
+	@%p2 ld.global.L1::evict_first.L2::cache_hint.b32 { %r143 }, [ %rd38 + 0 ], %rd37;
+	// end inline asm
+	mov.b32 	{%rs30, %rs31}, %r143;
+	.loc	1 65 123                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:123
+	cvt.f32.bf16 	%r156, %rs30;
+	cvt.f32.bf16 	%r157, %rs31;
+	.loc	1 66 36                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:36
+	add.s64 	%rd40, %rd12, %rd78;
+	.loc	1 66 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:43
+	// begin inline asm
+	mov.u64 %rd39, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd39, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs11, %rs10;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd40 + 0 ], %rd39;
+	// end inline asm
+	.loc	1 66 96                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:96
+	cvt.f32.bf16 	%r158, %rs11;
+	.loc	1 72 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35
+	cvt.s64.s32 	%rd81, %r12;
+	.loc	1 72 57                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:57
+	cvt.u32.u64 	%r159, %rd77;
+	.loc	1 72 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35
+	cvt.s64.s32 	%rd82, %rd77;
+	add.s64 	%rd83, %rd81, %rd82;
+	shl.b64 	%rd84, %rd83, 1;
+	add.s64 	%rd85, %rd8, %rd84;
+	add.s64 	%rd42, %rd85, 2;
+	add.s64 	%rd44, %rd85, 66;
+	.loc	1 72 68                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:68
+	// begin inline asm
+	mov.u64 %rd41, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd41, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs12, %rs10;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd42 + 0 ], %rd41;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd43, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd43, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs13, %rs10;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd44 + 0 ], %rd43;
+	// end inline asm
+	.loc	1 72 129                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129
+	cvt.f32.bf16 	%r160, %rs12;
+	cvt.f32.bf16 	%r161, %rs13;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	mul.f32 	%r162, %r4, %r160;
+	mul.f32 	%r163, %r4, %r161;
+	.loc	1 80 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:35
+	shl.b64 	%rd86, %rd77, 1;
+	add.s64 	%rd54, %rd9, %rd86;
+	add.s64 	%rd46, %rd54, 2;
+	add.s64 	%rd48, %rd54, 66;
+	.loc	1 80 85                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:85
+	// begin inline asm
+	mov.u64 %rd45, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd45, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs14, %rs10;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs14 }, [ %rd46 + 0 ], %rd45;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd47, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd47, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs15, %rs10;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd48 + 0 ], %rd47;
+	// end inline asm
+	.loc	1 80 146                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146
+	cvt.f32.bf16 	%r164, %rs14;
+	cvt.f32.bf16 	%r165, %rs15;
+	.loc	1 84 17                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17
+	neg.f32 	%r166, %r162;
+	fma.rn.f32 	%r167, %r166, %r164, 0f00000000;
+	neg.f32 	%r168, %r163;
+	fma.rn.f32 	%r169, %r168, %r165, 0f00000000;
+	.loc	1 90 53                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:53
+	or.b32 	%r170, %r12, %r159;
+	.loc	1 90 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:35
+	mad.wide.s32 	%rd50, %r170, 2, %rd8;
+	add.s64 	%rd52, %rd85, 64;
+	.loc	1 90 64                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:64
+	// begin inline asm
+	mov.u64 %rd49, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd49, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs16, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs16 }, [ %rd50 + 0 ], %rd49;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd51, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd51, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs17, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd52 + 0 ], %rd51;
+	// end inline asm
+	.loc	1 90 125                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125
+	cvt.f32.bf16 	%r171, %rs16;
+	cvt.f32.bf16 	%r172, %rs17;
+	.loc	1 97 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24
+	mul.f32 	%r173, %r4, %r171;
+	mul.f32 	%r174, %r4, %r172;
+	.loc	1 98 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:35
+	add.s64 	%rd56, %rd54, 64;
+	.loc	1 98 81                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:81
+	// begin inline asm
+	mov.u64 %rd53, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd53, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs18, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs18 }, [ %rd54 + 0 ], %rd53;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd55, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd55, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs19, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd56 + 0 ], %rd55;
+	// end inline asm
+	.loc	1 98 142                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142
+	cvt.f32.bf16 	%r175, %rs18;
+	cvt.f32.bf16 	%r176, %rs19;
+	.loc	1 100 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24
+	mul.f32 	%r177, %r173, %r175;
+	mul.f32 	%r178, %r174, %r176;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r179, %r167, %r177, %p3;
+	selp.f32 	%r180, %r169, %r178, %p3;
+	.loc	1 111 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:111:24
+	mul.f32 	%r181, %r3, %r148;
+	mul.f32 	%r182, %r3, %r149;
+	.loc	1 113 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:113:24
+	bar.sync 	0;
+	st.shared.b32 	[%r13], %r150;
+	bar.sync 	0;
+	ld.shared.b32 	%r183, [%r14];
+	ld.shared.b32 	%r184, [%r15+128];
+	mul.f32 	%r185, %r181, %r183;
+	mul.f32 	%r186, %r182, %r184;
+	.loc	1 116 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:116:24
+	mul.f32 	%r187, %r185, %r139;
+	mul.f32 	%r188, %r186, %r140;
+	bar.sync 	0;
+	st.shared.b32 	[%r8], %r187;
+	st.shared.b32 	[%r9+256], %r188;
+	bar.sync 	0;
+	ld.shared.b32 	%r189, [%r10];
+	ld.shared.b32 	%r190, [%r11];
+	.loc	1 119 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24
+	fma.rn.f32 	%r191, %r153, %r179, %r189;
+	fma.rn.f32 	%r192, %r154, %r180, %r190;
+	.loc	1 121 60                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:60
+	or.b32 	%r193, %r16, %r159;
+	.loc	1 121 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:35
+	mad.wide.s32 	%rd58, %r193, 2, %rd8;
+	cvt.s64.s32 	%rd87, %r16;
+	add.s64 	%rd88, %rd87, %rd82;
+	shl.b64 	%rd89, %rd88, 1;
+	add.s64 	%rd90, %rd8, %rd89;
+	add.s64 	%rd60, %rd90, 64;
+	.loc	1 121 71                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:71
+	// begin inline asm
+	mov.u64 %rd57, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd57, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs20, %rs10;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs20 }, [ %rd58 + 0 ], %rd57;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd59, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd59, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs21, %rs10;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs21 }, [ %rd60 + 0 ], %rd59;
+	// end inline asm
+	.loc	1 121 132                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:132
+	cvt.f32.bf16 	%r194, %rs20;
+	cvt.f32.bf16 	%r195, %rs21;
+	.loc	1 126 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24
+	mul.f32 	%r196, %r6, %r194;
+	mul.f32 	%r197, %r6, %r195;
+	.loc	1 127 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:35
+	add.s64 	%rd70, %rd12, %rd86;
+	add.s64 	%rd62, %rd70, 2;
+	add.s64 	%rd64, %rd70, 66;
+	.loc	1 127 85                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:85
+	// begin inline asm
+	mov.u64 %rd61, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd61, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs22, %rs10;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs22 }, [ %rd62 + 0 ], %rd61;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd63, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd63, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs23, %rs10;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs23 }, [ %rd64 + 0 ], %rd63;
+	// end inline asm
+	.loc	1 127 146                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:146
+	cvt.f32.bf16 	%r198, %rs22;
+	cvt.f32.bf16 	%r199, %rs23;
+	.loc	1 131 17                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:131:17
+	neg.f32 	%r200, %r196;
+	fma.rn.f32 	%r201, %r200, %r198, 0f00000000;
+	neg.f32 	%r202, %r197;
+	fma.rn.f32 	%r203, %r202, %r199, 0f00000000;
+	.loc	1 134 60                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:60
+	or.b32 	%r204, %r17, %r159;
+	.loc	1 134 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:35
+	mad.wide.s32 	%rd66, %r204, 2, %rd8;
+	cvt.s64.s32 	%rd91, %r17;
+	add.s64 	%rd92, %rd91, %rd82;
+	shl.b64 	%rd93, %rd92, 1;
+	add.s64 	%rd94, %rd8, %rd93;
+	add.s64 	%rd68, %rd94, 64;
+	.loc	1 134 71                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:71
+	// begin inline asm
+	mov.u64 %rd65, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd65, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs24, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs24 }, [ %rd66 + 0 ], %rd65;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd67, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd67, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs25, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs25 }, [ %rd68 + 0 ], %rd67;
+	// end inline asm
+	.loc	1 134 132                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:132
+	cvt.f32.bf16 	%r205, %rs24;
+	cvt.f32.bf16 	%r206, %rs25;
+	.loc	1 139 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:139:24
+	mul.f32 	%r207, %r6, %r205;
+	mul.f32 	%r208, %r6, %r206;
+	.loc	1 140 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:35
+	add.s64 	%rd72, %rd70, 64;
+	.loc	1 140 81                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:81
+	// begin inline asm
+	mov.u64 %rd69, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd69, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs26, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs26 }, [ %rd70 + 0 ], %rd69;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd71, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd71, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs27, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs27 }, [ %rd72 + 0 ], %rd71;
+	// end inline asm
+	.loc	1 140 142                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:142
+	cvt.f32.bf16 	%r209, %rs26;
+	cvt.f32.bf16 	%r210, %rs27;
+	.loc	1 142 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:142:24
+	mul.f32 	%r211, %r207, %r209;
+	mul.f32 	%r212, %r208, %r210;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r213, %r201, %r211, %p3;
+	selp.f32 	%r214, %r203, %r212, %p3;
+	.loc	1 151 25                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:151:25
+	mul.f32 	%r215, %r5, %r156;
+	mul.f32 	%r216, %r5, %r157;
+	bar.sync 	0;
+	st.shared.b32 	[%r8], %r215;
+	st.shared.b32 	[%r9+256], %r216;
+	bar.sync 	0;
+	ld.shared.b32 	%r217, [%r10];
+	ld.shared.b32 	%r218, [%r11];
+	.loc	1 156 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26
+	bar.sync 	0;
+	st.shared.b32 	[%r18], %r158;
+	bar.sync 	0;
+	ld.shared.b32 	%r219, [%r19];
+	ld.shared.b32 	%r220, [%r19+128];
+	.loc	1 153 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:153:26
+	mul.f32 	%r221, %r217, %r219;
+	mul.f32 	%r222, %r218, %r220;
+	.loc	1 156 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26
+	mul.f32 	%r223, %r151, %r221;
+	mul.f32 	%r224, %r152, %r222;
+	.loc	1 159 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:159:26
+	fma.rn.f32 	%r225, %r153, %r213, %r223;
+	fma.rn.f32 	%r226, %r154, %r214, %r224;
+	.loc	1 161 39                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:39
+	or.b64 	%rd95, %rd75, %rd5;
+	.loc	1 161 32                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:32
+	shl.b64 	%rd96, %rd95, 1;
+	add.s64 	%rd73, %rd6, %rd96;
+	.loc	1 161 55                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:55
+	cvt.rn.bf16.f32 	%rs32, %r191;
+	cvt.rn.bf16.f32 	%rs33, %r192;
+	bar.sync 	0;
+	st.shared.b16 	[%r20], %rs32;
+	st.shared.b16 	[%r21], %rs33;
+	bar.sync 	0;
+	ld.shared.b16 	%rs34, [%r22];
+	ld.shared.b16 	%rs35, [%r22+4];
+	mov.b32 	%r144, {%rs34, %rs35};
+	// begin inline asm
+	@%p2 st.global.b32 [ %rd73 + 0 ], { %r144 };
+	// end inline asm
+	.loc	1 162 32                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:32
+	add.s64 	%rd74, %rd7, %rd96;
+	.loc	1 162 56                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:56
+	cvt.rn.bf16.f32 	%rs36, %r225;
+	cvt.rn.bf16.f32 	%rs37, %r226;
+	bar.sync 	0;
+	st.shared.b16 	[%r20], %rs36;
+	st.shared.b16 	[%r21], %rs37;
+	bar.sync 	0;
+	ld.shared.b16 	%rs38, [%r22];
+	ld.shared.b16 	%rs39, [%r22+4];
+	mov.b32 	%r145, {%rs38, %rs39};
+	// begin inline asm
+	@%p2 st.global.b32 [ %rd74 + 0 ], { %r145 };
+	// end inline asm
+	mov.b64 	%rd97, 64;
+	mov.pred 	%p5, 0;
+	.loc	1 53 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43
+	@%p1 bra 	$L__BB0_1;
+// %bb.2:
+	.loc	1 53 4                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:4
+	ret;
+$L__tmp24:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 5                                   // DW_FORM_data2
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 456                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x1c1 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 98
+.b8 118
+.b8 113
+.b8 104
+.b8 106
+.b8 116
+.b8 121
+.b8 103
+.b8 55
+.b8 102
+.b8 118
+.b8 120
+.b8 122
+.b8 119
+.b8 116
+.b8 98
+.b8 116
+.b8 116
+.b8 52
+.b8 118
+.b8 114
+.b8 100
+.b8 107
+.b8 98
+.b8 110
+.b8 98
+.b8 54
+.b8 110
+.b8 51
+.b8 50
+.b8 102
+.b8 110
+.b8 114
+.b8 105
+.b8 106
+.b8 106
+.b8 112
+.b8 108
+.b8 51
+.b8 118
+.b8 118
+.b8 52
+.b8 99
+.b8 102
+.b8 113
+.b8 100
+.b8 52
+.b8 109
+.b8 122
+.b8 110
+.b8 114
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 98
+.b8 118
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x6d DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 114
+.b8 109
+.b8 115
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 95
+.b8 116
+.b8 111
+.b8 95
+.b8 99
+.b8 111
+.b8 112
+.b8 121
+.b8 95
+.b8 97
+.b8 100
+.b8 100
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 110
+.b8 101
+.b8 103
+.b8 95
+.b8 115
+.b8 112
+.b8 108
+.b8 105
+.b8 116
+.b8 95
+.b8 115
+.b8 112
+.b8 108
+.b8 105
+.b8 116
+.b8 95
+.b8 119
+.b8 105
+.b8 116
+.b8 104
+.b8 95
+.b8 115
+.b8 105
+.b8 122
+.b8 101
+.b8 115
+.b8 95
+.b8 115
+.b8 116
+.b8 97
+.b8 99
+.b8 107
+.b8 95
+.b8 117
+.b8 110
+.b8 98
+.b8 105
+.b8 110
+.b8 100
+.b8 95
+.b8 117
+.b8 110
+.b8 115
+.b8 113
+.b8 117
+.b8 101
+.b8 101
+.b8 122
+.b8 101
+.b8 95
+.b8 118
+.b8 105
+.b8 101
+.b8 119
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x151:0x7a DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x166:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp12                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 51                                  // DW_AT_call_line
+.b8 25                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x17e:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp12                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 4                                   // Abbrev [4] 0x198:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp12                          // DW_AT_low_pc
+.b64 $L__tmp23                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 52                                  // DW_AT_call_line
+.b8 27                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x1b0:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp12                          // DW_AT_low_pc
+.b64 $L__tmp23                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..2b9c8c254d415643c15bf01722276d94b8160239
--- /dev/null
+++ b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source
@@ -0,0 +1,972 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc213 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0)
+#loc215 = loc(unknown)
+#loc218 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0)
+#loc222 = loc("in_out_ptr0"(#loc))
+#loc223 = loc("in_out_ptr1"(#loc))
+#loc224 = loc("in_ptr0"(#loc))
+#loc225 = loc("in_ptr1"(#loc))
+#loc226 = loc("in_ptr2"(#loc))
+#loc227 = loc("in_ptr3"(#loc))
+#loc228 = loc("in_ptr4"(#loc))
+#loc229 = loc("xnumel"(#loc))
+#loc230 = loc("r0_numel"(#loc))
+#loc432 = loc("input"(#loc213))
+#loc433 = loc("a"(#loc218))
+#loc434 = loc("b"(#loc218))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 73728 : i32 loc(#loc231)
+    %r0_numel_1 = arith.constant 128 : i32 loc(#loc232)
+    %xoffset = tt.get_program_id x : i32 loc(#loc233)
+    %xoffset_2 = arith.constant 2 : i32 loc(#loc234)
+    %xoffset_3 = arith.constant 2 : i32 loc(#loc234)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc234)
+    %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc235)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32> -> tensor<2x1xi32> loc(#loc236)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<2x1xi32> loc(#loc237)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<2x1xi32> loc(#loc237)
+    %xmask = arith.constant true loc(#loc238)
+    %xmask_8 = arith.constant dense<true> : tensor<2x64xi1> loc(#loc238)
+    %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc239)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc240)
+    %x0 = arith.constant 32 : i32 loc(#loc241)
+    %x0_10 = arith.constant 32 : i32 loc(#loc241)
+    %x0_11 = arith.constant dense<32> : tensor<2x1xi32> loc(#loc241)
+    %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<2x1xi32> loc(#loc241)
+    %x1 = arith.constant 32 : i32 loc(#loc242)
+    %x1_13 = arith.constant 32 : i32 loc(#loc242)
+    %x1_14 = arith.constant dense<32> : tensor<2x1xi32> loc(#loc242)
+    %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<2x1xi32> loc(#loc242)
+    %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc243)
+    %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc243)
+    %_tmp10 = arith.constant 0.000000e+00 : f32 loc(#loc244)
+    %_tmp10_17 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc244)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc15)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc15)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15)
+    %2 = arith.bitcast %c64_i32 : i32 to i32 loc(#loc15)
+    %3 = ub.poison : i32 loc(#loc15)
+    %_tmp10_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_16, %_tmp10_24 = %_tmp10_17) -> (tensor<2x64xf32>, tensor<2x64xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc246)
+      %r0_index_25 = arith.addi %r0_index, %r0_base_9 : tensor<1x64xi32> loc(#loc246)
+      %r0_mask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc247)
+      %r0_mask_26 = arith.cmpi slt, %r0_index_25, %r0_mask : tensor<1x64xi32> loc(#loc247)
+      %tmp0 = arith.constant 4096 : i32 loc(#loc248)
+      %tmp0_27 = arith.constant 4096 : i32 loc(#loc248)
+      %tmp0_28 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc248)
+      %tmp0_29 = arith.addi %tmp0_28, %r0_index_25 : tensor<1x64xi32> loc(#loc248)
+      %tmp0_30 = arith.constant 128 : i32 loc(#loc249)
+      %tmp0_31 = arith.constant 128 : i32 loc(#loc249)
+      %tmp0_32 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc249)
+      %tmp0_33 = arith.muli %tmp0_32, %x0_12 : tensor<2x1xi32> loc(#loc249)
+      %tmp0_34 = tt.broadcast %tmp0_29 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc250)
+      %tmp0_35 = tt.broadcast %tmp0_33 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc250)
+      %tmp0_36 = arith.addi %tmp0_34, %tmp0_35 : tensor<2x64xi32> loc(#loc250)
+      %tmp0_37 = arith.constant 36864 : i32 loc(#loc251)
+      %tmp0_38 = arith.constant 36864 : i32 loc(#loc251)
+      %tmp0_39 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc251)
+      %tmp0_40 = arith.muli %tmp0_39, %x1_15 : tensor<2x1xi32> loc(#loc251)
+      %tmp0_41 = tt.broadcast %tmp0_40 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc252)
+      %tmp0_42 = arith.addi %tmp0_36, %tmp0_41 : tensor<2x64xi32> loc(#loc252)
+      %tmp0_43 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x64x!tt.ptr<bf16>> loc(#loc253)
+      %tmp0_44 = tt.addptr %tmp0_43, %tmp0_42 : tensor<2x64x!tt.ptr<bf16>>, tensor<2x64xi32> loc(#loc253)
+      %tmp0_45 = arith.constant 0.000000e+00 : f32 loc(#loc254)
+      %tmp0_46 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc254)
+      %tmp0_47 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc254)
+      %tmp0_48 = arith.truncf %tmp0_47 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc254)
+      %tmp0_49 = tt.load %tmp0_44, %tmp0_46, %tmp0_48 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>> loc(#loc254)
+      %tmp0_50 = arith.extf %tmp0_49 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc255)
+      %tmp6 = arith.constant 128 : i32 loc(#loc256)
+      %tmp6_51 = arith.constant 128 : i32 loc(#loc256)
+      %tmp6_52 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc256)
+      %tmp6_53 = arith.muli %tmp6_52, %x0_12 : tensor<2x1xi32> loc(#loc256)
+      %tmp6_54 = tt.broadcast %r0_index_25 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc257)
+      %tmp6_55 = tt.broadcast %tmp6_53 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc257)
+      %tmp6_56 = arith.addi %tmp6_54, %tmp6_55 : tensor<2x64xi32> loc(#loc257)
+      %tmp6_57 = arith.constant 36864 : i32 loc(#loc258)
+      %tmp6_58 = arith.constant 36864 : i32 loc(#loc258)
+      %tmp6_59 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc258)
+      %tmp6_60 = arith.muli %tmp6_59, %x1_15 : tensor<2x1xi32> loc(#loc258)
+      %tmp6_61 = tt.broadcast %tmp6_60 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc259)
+      %tmp6_62 = arith.addi %tmp6_56, %tmp6_61 : tensor<2x64xi32> loc(#loc259)
+      %tmp6_63 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x64x!tt.ptr<bf16>> loc(#loc260)
+      %tmp6_64 = tt.addptr %tmp6_63, %tmp6_62 : tensor<2x64x!tt.ptr<bf16>>, tensor<2x64xi32> loc(#loc260)
+      %tmp6_65 = arith.constant 0.000000e+00 : f32 loc(#loc261)
+      %tmp6_66 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc261)
+      %tmp6_67 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc261)
+      %tmp6_68 = arith.truncf %tmp6_67 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc261)
+      %tmp6_69 = tt.load %tmp6_64, %tmp6_66, %tmp6_68 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>> loc(#loc261)
+      %tmp6_70 = arith.extf %tmp6_69 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc262)
+      %tmp2 = arith.mulf %tmp0_50, %tmp0_50 : tensor<2x64xf32> loc(#loc263)
+      %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<2x64xf32> loc(#loc264)
+      %_tmp4_71 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc265)
+      %_tmp4_72 = arith.select %_tmp4_71, %tmp5, %_tmp4_23 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc265)
+      %tmp8 = arith.mulf %tmp6_70, %tmp6_70 : tensor<2x64xf32> loc(#loc266)
+      %tmp11 = arith.addf %_tmp10_24, %tmp8 : tensor<2x64xf32> loc(#loc267)
+      %_tmp10_73 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc268)
+      %_tmp10_74 = arith.select %_tmp10_73, %tmp11, %_tmp10_24 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc268)
+      scf.yield %_tmp4_72, %_tmp10_74 : tensor<2x64xf32>, tensor<2x64xf32> loc(#loc39)
+    } loc(#loc435)
+    %tmp4 = tt.call @"triton.language.standard.sum__fp32S2_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#0) : (tensor<2x64xf32>) -> tensor<2xf32> loc(#loc269)
+    %tmp4_19 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<2xf32> -> tensor<2x1xf32> loc(#loc270)
+    %tmp10 = tt.call @"triton.language.standard.sum__fp32S2_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#1) : (tensor<2x64xf32>) -> tensor<2xf32> loc(#loc271)
+    %tmp10_20 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<2xf32> -> tensor<2x1xf32> loc(#loc272)
+    %c0_i32_21 = arith.constant 0 : i32 loc(#loc44)
+    %c64_i32_22 = arith.constant 64 : i32 loc(#loc44)
+    %4 = arith.bitcast %c0_i32_21 : i32 to i32 loc(#loc44)
+    %5 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc44)
+    %6 = arith.bitcast %c64_i32_22 : i32 to i32 loc(#loc44)
+    %7 = ub.poison : i32 loc(#loc44)
+    scf.for %r0_offset = %4 to %5 step %6  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc273)
+      %r0_index_23 = arith.addi %r0_index, %r0_base_9 : tensor<1x64xi32> loc(#loc273)
+      %r0_mask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc274)
+      %r0_mask_24 = arith.cmpi slt, %r0_index_23, %r0_mask : tensor<1x64xi32> loc(#loc274)
+      %r0_3 = arith.constant 2 : i32 loc(#loc275)
+      %r0_3_25 = arith.constant 2 : i32 loc(#loc275)
+      %r0_3_26 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc275)
+      %r0_3_27 = arith.remsi %r0_index_23, %r0_3_26 : tensor<1x64xi32> loc(#loc275)
+      %r0_4 = arith.constant 2 : i32 loc(#loc276)
+      %r0_4_28 = arith.constant 2 : i32 loc(#loc276)
+      %r0_4_29 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc276)
+      %r0_4_30 = arith.divsi %r0_index_23, %r0_4_29 : tensor<1x64xi32> loc(#loc276)
+      %tmp50 = arith.constant 128 : i32 loc(#loc277)
+      %tmp50_31 = arith.constant 128 : i32 loc(#loc277)
+      %tmp50_32 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc277)
+      %tmp50_33 = arith.muli %tmp50_32, %x0_12 : tensor<2x1xi32> loc(#loc277)
+      %tmp50_34 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc278)
+      %tmp50_35 = tt.broadcast %tmp50_33 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc278)
+      %tmp50_36 = arith.addi %tmp50_34, %tmp50_35 : tensor<2x64xi32> loc(#loc278)
+      %tmp50_37 = arith.constant 36864 : i32 loc(#loc279)
+      %tmp50_38 = arith.constant 36864 : i32 loc(#loc279)
+      %tmp50_39 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc279)
+      %tmp50_40 = arith.muli %tmp50_39, %x1_15 : tensor<2x1xi32> loc(#loc279)
+      %tmp50_41 = tt.broadcast %tmp50_40 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc280)
+      %tmp50_42 = arith.addi %tmp50_36, %tmp50_41 : tensor<2x64xi32> loc(#loc280)
+      %tmp50_43 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x64x!tt.ptr<bf16>> loc(#loc281)
+      %tmp50_44 = tt.addptr %tmp50_43, %tmp50_42 : tensor<2x64x!tt.ptr<bf16>>, tensor<2x64xi32> loc(#loc281)
+      %tmp50_45 = arith.constant 0.000000e+00 : f32 loc(#loc282)
+      %tmp50_46 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc282)
+      %tmp50_47 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc282)
+      %tmp50_48 = arith.truncf %tmp50_47 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc282)
+      %tmp50_49 = tt.load %tmp50_44, %tmp50_46, %tmp50_48 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>> loc(#loc282)
+      %tmp50_50 = arith.extf %tmp50_49 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc283)
+      %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>> loc(#loc284)
+      %tmp58_51 = tt.addptr %tmp58, %r0_index_23 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc284)
+      %tmp58_52 = arith.constant 0.000000e+00 : f32 loc(#loc285)
+      %tmp58_53 = arith.constant dense<0.000000e+00> : tensor<1x64xf32> loc(#loc285)
+      %tmp58_54 = arith.truncf %tmp58_53 : tensor<1x64xf32> to tensor<1x64xbf16> loc(#loc285)
+      %tmp58_55 = tt.load %tmp58_51, %r0_mask_24, %tmp58_54 evictionPolicy = evict_last : tensor<1x64x!tt.ptr<bf16>> loc(#loc285)
+      %tmp58_56 = arith.extf %tmp58_55 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc286)
+      %tmp63 = arith.constant 128 : i32 loc(#loc287)
+      %tmp63_57 = arith.constant 128 : i32 loc(#loc287)
+      %tmp63_58 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc287)
+      %tmp63_59 = arith.muli %tmp63_58, %x1_15 : tensor<2x1xi32> loc(#loc287)
+      %tmp63_60 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc288)
+      %tmp63_61 = tt.broadcast %tmp63_59 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc288)
+      %tmp63_62 = arith.addi %tmp63_60, %tmp63_61 : tensor<2x64xi32> loc(#loc288)
+      %tmp63_63 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<2x64x!tt.ptr<f32>> loc(#loc289)
+      %tmp63_64 = tt.addptr %tmp63_63, %tmp63_62 : tensor<2x64x!tt.ptr<f32>>, tensor<2x64xi32> loc(#loc289)
+      %tmp63_65 = arith.constant 0.000000e+00 : f32 loc(#loc290)
+      %tmp63_66 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc290)
+      %tmp63_67 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc290)
+      %tmp63_68 = tt.load %tmp63_64, %tmp63_66, %tmp63_67 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<f32>> loc(#loc290)
+      %tmp66 = arith.constant 128 : i32 loc(#loc291)
+      %tmp66_69 = arith.constant 128 : i32 loc(#loc291)
+      %tmp66_70 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc291)
+      %tmp66_71 = arith.muli %tmp66_70, %x1_15 : tensor<2x1xi32> loc(#loc291)
+      %tmp66_72 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc292)
+      %tmp66_73 = tt.broadcast %tmp66_71 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc292)
+      %tmp66_74 = arith.addi %tmp66_72, %tmp66_73 : tensor<2x64xi32> loc(#loc292)
+      %tmp66_75 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<2x64x!tt.ptr<f32>> loc(#loc293)
+      %tmp66_76 = tt.addptr %tmp66_75, %tmp66_74 : tensor<2x64x!tt.ptr<f32>>, tensor<2x64xi32> loc(#loc293)
+      %tmp66_77 = arith.constant 0.000000e+00 : f32 loc(#loc294)
+      %tmp66_78 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc294)
+      %tmp66_79 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc294)
+      %tmp66_80 = tt.load %tmp66_76, %tmp66_78, %tmp66_79 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<f32>> loc(#loc294)
+      %tmp96 = arith.constant 4096 : i32 loc(#loc295)
+      %tmp96_81 = arith.constant 4096 : i32 loc(#loc295)
+      %tmp96_82 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc295)
+      %tmp96_83 = arith.addi %tmp96_82, %r0_index_23 : tensor<1x64xi32> loc(#loc295)
+      %tmp96_84 = arith.constant 128 : i32 loc(#loc296)
+      %tmp96_85 = arith.constant 128 : i32 loc(#loc296)
+      %tmp96_86 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc296)
+      %tmp96_87 = arith.muli %tmp96_86, %x0_12 : tensor<2x1xi32> loc(#loc296)
+      %tmp96_88 = tt.broadcast %tmp96_83 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc297)
+      %tmp96_89 = tt.broadcast %tmp96_87 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc297)
+      %tmp96_90 = arith.addi %tmp96_88, %tmp96_89 : tensor<2x64xi32> loc(#loc297)
+      %tmp96_91 = arith.constant 36864 : i32 loc(#loc298)
+      %tmp96_92 = arith.constant 36864 : i32 loc(#loc298)
+      %tmp96_93 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc298)
+      %tmp96_94 = arith.muli %tmp96_93, %x1_15 : tensor<2x1xi32> loc(#loc298)
+      %tmp96_95 = tt.broadcast %tmp96_94 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc299)
+      %tmp96_96 = arith.addi %tmp96_90, %tmp96_95 : tensor<2x64xi32> loc(#loc299)
+      %tmp96_97 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x64x!tt.ptr<bf16>> loc(#loc300)
+      %tmp96_98 = tt.addptr %tmp96_97, %tmp96_96 : tensor<2x64x!tt.ptr<bf16>>, tensor<2x64xi32> loc(#loc300)
+      %tmp96_99 = arith.constant 0.000000e+00 : f32 loc(#loc301)
+      %tmp96_100 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc301)
+      %tmp96_101 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc301)
+      %tmp96_102 = arith.truncf %tmp96_101 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc301)
+      %tmp96_103 = tt.load %tmp96_98, %tmp96_100, %tmp96_102 evictionPolicy = evict_first : tensor<2x64x!tt.ptr<bf16>> loc(#loc301)
+      %tmp96_104 = arith.extf %tmp96_103 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc302)
+      %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>> loc(#loc303)
+      %tmp102_105 = tt.addptr %tmp102, %r0_index_23 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc303)
+      %tmp102_106 = arith.constant 0.000000e+00 : f32 loc(#loc304)
+      %tmp102_107 = arith.constant dense<0.000000e+00> : tensor<1x64xf32> loc(#loc304)
+      %tmp102_108 = arith.truncf %tmp102_107 : tensor<1x64xf32> to tensor<1x64xbf16> loc(#loc304)
+      %tmp102_109 = tt.load %tmp102_105, %r0_mask_24, %tmp102_108 evictionPolicy = evict_last : tensor<1x64x!tt.ptr<bf16>> loc(#loc304)
+      %tmp102_110 = arith.extf %tmp102_109 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc305)
+      %tmp13 = arith.constant 0 : i64 loc(#loc306)
+      %tmp13_111 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc306)
+      %tmp14 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc307)
+      %tmp14_112 = arith.constant dense<0> : tensor<1x64xi64> loc(#loc307)
+      %tmp14_113 = arith.cmpi sge, %tmp14, %tmp14_112 : tensor<1x64xi64> loc(#loc307)
+      %tmp15 = arith.constant 1 : i64 loc(#loc308)
+      %tmp15_114 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc308)
+      %tmp16 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc309)
+      %tmp16_115 = arith.constant dense<1> : tensor<1x64xi64> loc(#loc309)
+      %tmp16_116 = arith.cmpi slt, %tmp16, %tmp16_115 : tensor<1x64xi64> loc(#loc309)
+      %tmp17 = arith.constant 2 : i32 loc(#loc310)
+      %tmp17_117 = arith.constant 2 : i32 loc(#loc310)
+      %tmp17_118 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc310)
+      %tmp17_119 = arith.muli %tmp17_118, %r0_4_30 : tensor<1x64xi32> loc(#loc310)
+      %tmp17_120 = arith.constant 1 : i32 loc(#loc311)
+      %tmp17_121 = arith.constant 1 : i32 loc(#loc311)
+      %tmp17_122 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc311)
+      %tmp17_123 = arith.addi %tmp17_122, %tmp17_119 : tensor<1x64xi32> loc(#loc311)
+      %tmp17_124 = arith.constant 128 : i32 loc(#loc312)
+      %tmp17_125 = arith.constant 128 : i32 loc(#loc312)
+      %tmp17_126 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc312)
+      %tmp17_127 = arith.muli %tmp17_126, %x0_12 : tensor<2x1xi32> loc(#loc312)
+      %tmp17_128 = tt.broadcast %tmp17_123 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc313)
+      %tmp17_129 = tt.broadcast %tmp17_127 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc313)
+      %tmp17_130 = arith.addi %tmp17_128, %tmp17_129 : tensor<2x64xi32> loc(#loc313)
+      %tmp17_131 = arith.constant 36864 : i32 loc(#loc314)
+      %tmp17_132 = arith.constant 36864 : i32 loc(#loc314)
+      %tmp17_133 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc314)
+      %tmp17_134 = arith.muli %tmp17_133, %x1_15 : tensor<2x1xi32> loc(#loc314)
+      %tmp17_135 = tt.broadcast %tmp17_134 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc315)
+      %tmp17_136 = arith.addi %tmp17_130, %tmp17_135 : tensor<2x64xi32> loc(#loc315)
+      %tmp17_137 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x64x!tt.ptr<bf16>> loc(#loc316)
+      %tmp17_138 = tt.addptr %tmp17_137, %tmp17_136 : tensor<2x64x!tt.ptr<bf16>>, tensor<2x64xi32> loc(#loc316)
+      %tmp17_139 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc317)
+      %tmp17_140 = arith.constant 0.000000e+00 : f32 loc(#loc318)
+      %tmp17_141 = tt.broadcast %tmp17_139 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc318)
+      %tmp17_142 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc318)
+      %tmp17_143 = arith.truncf %tmp17_142 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc318)
+      %tmp17_144 = tt.load %tmp17_138, %tmp17_141, %tmp17_143 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>> loc(#loc318)
+      %tmp17_145 = arith.extf %tmp17_144 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc319)
+      %tmp19 = arith.constant 1.280000e+02 : f32 loc(#loc320)
+      %tmp20 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc321)
+      %tmp20_146 = arith.divf %tmp10_20, %tmp20 : tensor<2x1xf32> loc(#loc321)
+      %tmp21 = arith.constant 9.99999997E-7 : f32 loc(#loc322)
+      %tmp22 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc323)
+      %tmp22_147 = arith.addf %tmp20_146, %tmp22 : tensor<2x1xf32> loc(#loc323)
+      %tmp23 = tt.extern_elementwise %tmp22_147 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc324)
+      %tmp24 = tt.broadcast %tmp23 : tensor<2x1xf32> -> tensor<2x64xf32> loc(#loc325)
+      %tmp24_148 = arith.mulf %tmp17_145, %tmp24 : tensor<2x64xf32> loc(#loc325)
+      %tmp25 = arith.constant 2 : i32 loc(#loc326)
+      %tmp25_149 = arith.constant 2 : i32 loc(#loc326)
+      %tmp25_150 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc326)
+      %tmp25_151 = arith.muli %tmp25_150, %r0_4_30 : tensor<1x64xi32> loc(#loc326)
+      %tmp25_152 = arith.constant 1 : i32 loc(#loc327)
+      %tmp25_153 = arith.constant 1 : i32 loc(#loc327)
+      %tmp25_154 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc327)
+      %tmp25_155 = arith.addi %tmp25_154, %tmp25_151 : tensor<1x64xi32> loc(#loc327)
+      %tmp25_156 = tt.broadcast %tmp25_155 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc328)
+      %tmp25_157 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<2x64x!tt.ptr<bf16>> loc(#loc329)
+      %tmp25_158 = tt.addptr %tmp25_157, %tmp25_156 : tensor<2x64x!tt.ptr<bf16>>, tensor<2x64xi32> loc(#loc329)
+      %tmp25_159 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc330)
+      %tmp25_160 = arith.constant 0.000000e+00 : f32 loc(#loc331)
+      %tmp25_161 = tt.broadcast %tmp25_159 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc331)
+      %tmp25_162 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc331)
+      %tmp25_163 = arith.truncf %tmp25_162 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc331)
+      %tmp25_164 = tt.load %tmp25_158, %tmp25_161, %tmp25_163 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>> loc(#loc331)
+      %tmp25_165 = arith.extf %tmp25_164 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc332)
+      %tmp27 = arith.mulf %tmp24_148, %tmp25_165 : tensor<2x64xf32> loc(#loc333)
+      %tmp29 = arith.constant 0.000000e+00 : f32 loc(#loc334)
+      %tmp29_166 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc334)
+      %tmp29_167 = arith.subf %tmp29_166, %tmp27 : tensor<2x64xf32> loc(#loc334)
+      %tmp30 = arith.constant 0.000000e+00 : f32 loc(#loc335)
+      %tmp30_168 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc335)
+      %tmp31 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc336)
+      %tmp31_169 = arith.select %tmp31, %tmp29_167, %tmp30_168 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc336)
+      %tmp32 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc337)
+      %tmp32_170 = arith.constant dense<1> : tensor<1x64xi64> loc(#loc337)
+      %tmp32_171 = arith.cmpi sge, %tmp32, %tmp32_170 : tensor<1x64xi64> loc(#loc337)
+      %tmp33 = arith.constant 2 : i64 loc(#loc338)
+      %tmp33_172 = arith.constant dense<2> : tensor<1x1xi64> loc(#loc338)
+      %tmp34 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc339)
+      %tmp34_173 = arith.constant dense<2> : tensor<1x64xi64> loc(#loc339)
+      %tmp34_174 = arith.cmpi slt, %tmp34, %tmp34_173 : tensor<1x64xi64> loc(#loc339)
+      %tmp35 = arith.constant 2 : i32 loc(#loc340)
+      %tmp35_175 = arith.constant 2 : i32 loc(#loc340)
+      %tmp35_176 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc340)
+      %tmp35_177 = arith.muli %tmp35_176, %r0_4_30 : tensor<1x64xi32> loc(#loc340)
+      %tmp35_178 = arith.constant 128 : i32 loc(#loc341)
+      %tmp35_179 = arith.constant 128 : i32 loc(#loc341)
+      %tmp35_180 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc341)
+      %tmp35_181 = arith.muli %tmp35_180, %x0_12 : tensor<2x1xi32> loc(#loc341)
+      %tmp35_182 = tt.broadcast %tmp35_177 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc342)
+      %tmp35_183 = tt.broadcast %tmp35_181 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc342)
+      %tmp35_184 = arith.addi %tmp35_182, %tmp35_183 : tensor<2x64xi32> loc(#loc342)
+      %tmp35_185 = arith.constant 36864 : i32 loc(#loc343)
+      %tmp35_186 = arith.constant 36864 : i32 loc(#loc343)
+      %tmp35_187 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc343)
+      %tmp35_188 = arith.muli %tmp35_187, %x1_15 : tensor<2x1xi32> loc(#loc343)
+      %tmp35_189 = tt.broadcast %tmp35_188 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc344)
+      %tmp35_190 = arith.addi %tmp35_184, %tmp35_189 : tensor<2x64xi32> loc(#loc344)
+      %tmp35_191 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x64x!tt.ptr<bf16>> loc(#loc345)
+      %tmp35_192 = tt.addptr %tmp35_191, %tmp35_190 : tensor<2x64x!tt.ptr<bf16>>, tensor<2x64xi32> loc(#loc345)
+      %tmp35_193 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc346)
+      %tmp35_194 = arith.constant 0.000000e+00 : f32 loc(#loc347)
+      %tmp35_195 = tt.broadcast %tmp35_193 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc347)
+      %tmp35_196 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc347)
+      %tmp35_197 = arith.truncf %tmp35_196 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc347)
+      %tmp35_198 = tt.load %tmp35_192, %tmp35_195, %tmp35_197 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>> loc(#loc347)
+      %tmp35_199 = arith.extf %tmp35_198 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc348)
+      %tmp37 = arith.constant 1.280000e+02 : f32 loc(#loc349)
+      %tmp38 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc350)
+      %tmp38_200 = arith.divf %tmp10_20, %tmp38 : tensor<2x1xf32> loc(#loc350)
+      %tmp39 = arith.constant 9.99999997E-7 : f32 loc(#loc351)
+      %tmp40 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc352)
+      %tmp40_201 = arith.addf %tmp38_200, %tmp40 : tensor<2x1xf32> loc(#loc352)
+      %tmp41 = tt.extern_elementwise %tmp40_201 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc353)
+      %tmp42 = tt.broadcast %tmp41 : tensor<2x1xf32> -> tensor<2x64xf32> loc(#loc354)
+      %tmp42_202 = arith.mulf %tmp35_199, %tmp42 : tensor<2x64xf32> loc(#loc354)
+      %tmp43 = arith.constant 2 : i32 loc(#loc355)
+      %tmp43_203 = arith.constant 2 : i32 loc(#loc355)
+      %tmp43_204 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc355)
+      %tmp43_205 = arith.muli %tmp43_204, %r0_4_30 : tensor<1x64xi32> loc(#loc355)
+      %tmp43_206 = tt.broadcast %tmp43_205 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc356)
+      %tmp43_207 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<2x64x!tt.ptr<bf16>> loc(#loc357)
+      %tmp43_208 = tt.addptr %tmp43_207, %tmp43_206 : tensor<2x64x!tt.ptr<bf16>>, tensor<2x64xi32> loc(#loc357)
+      %tmp43_209 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc358)
+      %tmp43_210 = arith.constant 0.000000e+00 : f32 loc(#loc359)
+      %tmp43_211 = tt.broadcast %tmp43_209 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc359)
+      %tmp43_212 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc359)
+      %tmp43_213 = arith.truncf %tmp43_212 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc359)
+      %tmp43_214 = tt.load %tmp43_208, %tmp43_211, %tmp43_213 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>> loc(#loc359)
+      %tmp43_215 = arith.extf %tmp43_214 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc360)
+      %tmp45 = arith.mulf %tmp42_202, %tmp43_215 : tensor<2x64xf32> loc(#loc361)
+      %tmp47 = arith.constant 0.000000e+00 : f32 loc(#loc362)
+      %tmp47_216 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc362)
+      %tmp48 = tt.broadcast %tmp32_171 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc363)
+      %tmp48_217 = arith.select %tmp48, %tmp45, %tmp47_216 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc363)
+      %tmp49 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc364)
+      %tmp49_218 = arith.select %tmp49, %tmp31_169, %tmp48_217 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc364)
+      %tmp52 = arith.constant 1.280000e+02 : f32 loc(#loc365)
+      %tmp53 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc366)
+      %tmp53_219 = arith.divf %tmp10_20, %tmp53 : tensor<2x1xf32> loc(#loc366)
+      %tmp54 = arith.constant 9.99999997E-7 : f32 loc(#loc367)
+      %tmp55 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc368)
+      %tmp55_220 = arith.addf %tmp53_219, %tmp55 : tensor<2x1xf32> loc(#loc368)
+      %tmp56 = tt.extern_elementwise %tmp55_220 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc369)
+      %tmp57 = tt.broadcast %tmp56 : tensor<2x1xf32> -> tensor<2x64xf32> loc(#loc370)
+      %tmp57_221 = arith.mulf %tmp50_50, %tmp57 : tensor<2x64xf32> loc(#loc370)
+      %tmp60 = tt.broadcast %tmp58_56 : tensor<1x64xf32> -> tensor<2x64xf32> loc(#loc371)
+      %tmp60_222 = arith.mulf %tmp57_221, %tmp60 : tensor<2x64xf32> loc(#loc371)
+      %tmp64 = arith.mulf %tmp60_222, %tmp63_68 : tensor<2x64xf32> loc(#loc372)
+      %tmp67 = arith.mulf %tmp49_218, %tmp66_80 : tensor<2x64xf32> loc(#loc373)
+      %tmp68 = arith.addf %tmp64, %tmp67 : tensor<2x64xf32> loc(#loc374)
+      %tmp70 = arith.constant 2 : i32 loc(#loc375)
+      %tmp70_223 = arith.constant 2 : i32 loc(#loc375)
+      %tmp70_224 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc375)
+      %tmp70_225 = arith.muli %tmp70_224, %r0_4_30 : tensor<1x64xi32> loc(#loc375)
+      %tmp70_226 = arith.constant 4097 : i32 loc(#loc376)
+      %tmp70_227 = arith.constant 4097 : i32 loc(#loc376)
+      %tmp70_228 = arith.constant dense<4097> : tensor<1x64xi32> loc(#loc376)
+      %tmp70_229 = arith.addi %tmp70_228, %tmp70_225 : tensor<1x64xi32> loc(#loc376)
+      %tmp70_230 = arith.constant 128 : i32 loc(#loc377)
+      %tmp70_231 = arith.constant 128 : i32 loc(#loc377)
+      %tmp70_232 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc377)
+      %tmp70_233 = arith.muli %tmp70_232, %x0_12 : tensor<2x1xi32> loc(#loc377)
+      %tmp70_234 = tt.broadcast %tmp70_229 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc378)
+      %tmp70_235 = tt.broadcast %tmp70_233 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc378)
+      %tmp70_236 = arith.addi %tmp70_234, %tmp70_235 : tensor<2x64xi32> loc(#loc378)
+      %tmp70_237 = arith.constant 36864 : i32 loc(#loc379)
+      %tmp70_238 = arith.constant 36864 : i32 loc(#loc379)
+      %tmp70_239 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc379)
+      %tmp70_240 = arith.muli %tmp70_239, %x1_15 : tensor<2x1xi32> loc(#loc379)
+      %tmp70_241 = tt.broadcast %tmp70_240 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc380)
+      %tmp70_242 = arith.addi %tmp70_236, %tmp70_241 : tensor<2x64xi32> loc(#loc380)
+      %tmp70_243 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x64x!tt.ptr<bf16>> loc(#loc381)
+      %tmp70_244 = tt.addptr %tmp70_243, %tmp70_242 : tensor<2x64x!tt.ptr<bf16>>, tensor<2x64xi32> loc(#loc381)
+      %tmp70_245 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc382)
+      %tmp70_246 = arith.constant 0.000000e+00 : f32 loc(#loc383)
+      %tmp70_247 = tt.broadcast %tmp70_245 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc383)
+      %tmp70_248 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc383)
+      %tmp70_249 = arith.truncf %tmp70_248 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc383)
+      %tmp70_250 = tt.load %tmp70_244, %tmp70_247, %tmp70_249 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>> loc(#loc383)
+      %tmp70_251 = arith.extf %tmp70_250 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc384)
+      %tmp72 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc385)
+      %tmp72_252 = arith.divf %tmp4_19, %tmp72 : tensor<2x1xf32> loc(#loc385)
+      %tmp73 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc386)
+      %tmp73_253 = arith.addf %tmp72_252, %tmp73 : tensor<2x1xf32> loc(#loc386)
+      %tmp74 = tt.extern_elementwise %tmp73_253 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc387)
+      %tmp75 = tt.broadcast %tmp74 : tensor<2x1xf32> -> tensor<2x64xf32> loc(#loc388)
+      %tmp75_254 = arith.mulf %tmp70_251, %tmp75 : tensor<2x64xf32> loc(#loc388)
+      %tmp76 = arith.constant 2 : i32 loc(#loc389)
+      %tmp76_255 = arith.constant 2 : i32 loc(#loc389)
+      %tmp76_256 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc389)
+      %tmp76_257 = arith.muli %tmp76_256, %r0_4_30 : tensor<1x64xi32> loc(#loc389)
+      %tmp76_258 = arith.constant 1 : i32 loc(#loc390)
+      %tmp76_259 = arith.constant 1 : i32 loc(#loc390)
+      %tmp76_260 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc390)
+      %tmp76_261 = arith.addi %tmp76_260, %tmp76_257 : tensor<1x64xi32> loc(#loc390)
+      %tmp76_262 = tt.broadcast %tmp76_261 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc391)
+      %tmp76_263 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<2x64x!tt.ptr<bf16>> loc(#loc392)
+      %tmp76_264 = tt.addptr %tmp76_263, %tmp76_262 : tensor<2x64x!tt.ptr<bf16>>, tensor<2x64xi32> loc(#loc392)
+      %tmp76_265 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc393)
+      %tmp76_266 = arith.constant 0.000000e+00 : f32 loc(#loc394)
+      %tmp76_267 = tt.broadcast %tmp76_265 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc394)
+      %tmp76_268 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc394)
+      %tmp76_269 = arith.truncf %tmp76_268 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc394)
+      %tmp76_270 = tt.load %tmp76_264, %tmp76_267, %tmp76_269 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>> loc(#loc394)
+      %tmp76_271 = arith.extf %tmp76_270 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc395)
+      %tmp78 = arith.mulf %tmp75_254, %tmp76_271 : tensor<2x64xf32> loc(#loc396)
+      %tmp80 = arith.constant 0.000000e+00 : f32 loc(#loc397)
+      %tmp80_272 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc397)
+      %tmp80_273 = arith.subf %tmp80_272, %tmp78 : tensor<2x64xf32> loc(#loc397)
+      %tmp81 = arith.constant 0.000000e+00 : f32 loc(#loc398)
+      %tmp81_274 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc398)
+      %tmp82 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc399)
+      %tmp82_275 = arith.select %tmp82, %tmp80_273, %tmp81_274 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc399)
+      %tmp83 = arith.constant 2 : i32 loc(#loc400)
+      %tmp83_276 = arith.constant 2 : i32 loc(#loc400)
+      %tmp83_277 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc400)
+      %tmp83_278 = arith.muli %tmp83_277, %r0_4_30 : tensor<1x64xi32> loc(#loc400)
+      %tmp83_279 = arith.constant 4096 : i32 loc(#loc401)
+      %tmp83_280 = arith.constant 4096 : i32 loc(#loc401)
+      %tmp83_281 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc401)
+      %tmp83_282 = arith.addi %tmp83_281, %tmp83_278 : tensor<1x64xi32> loc(#loc401)
+      %tmp83_283 = arith.constant 128 : i32 loc(#loc402)
+      %tmp83_284 = arith.constant 128 : i32 loc(#loc402)
+      %tmp83_285 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc402)
+      %tmp83_286 = arith.muli %tmp83_285, %x0_12 : tensor<2x1xi32> loc(#loc402)
+      %tmp83_287 = tt.broadcast %tmp83_282 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc403)
+      %tmp83_288 = tt.broadcast %tmp83_286 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc403)
+      %tmp83_289 = arith.addi %tmp83_287, %tmp83_288 : tensor<2x64xi32> loc(#loc403)
+      %tmp83_290 = arith.constant 36864 : i32 loc(#loc404)
+      %tmp83_291 = arith.constant 36864 : i32 loc(#loc404)
+      %tmp83_292 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc404)
+      %tmp83_293 = arith.muli %tmp83_292, %x1_15 : tensor<2x1xi32> loc(#loc404)
+      %tmp83_294 = tt.broadcast %tmp83_293 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc405)
+      %tmp83_295 = arith.addi %tmp83_289, %tmp83_294 : tensor<2x64xi32> loc(#loc405)
+      %tmp83_296 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x64x!tt.ptr<bf16>> loc(#loc406)
+      %tmp83_297 = tt.addptr %tmp83_296, %tmp83_295 : tensor<2x64x!tt.ptr<bf16>>, tensor<2x64xi32> loc(#loc406)
+      %tmp83_298 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc407)
+      %tmp83_299 = arith.constant 0.000000e+00 : f32 loc(#loc408)
+      %tmp83_300 = tt.broadcast %tmp83_298 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc408)
+      %tmp83_301 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc408)
+      %tmp83_302 = arith.truncf %tmp83_301 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc408)
+      %tmp83_303 = tt.load %tmp83_297, %tmp83_300, %tmp83_302 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>> loc(#loc408)
+      %tmp83_304 = arith.extf %tmp83_303 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc409)
+      %tmp85 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc410)
+      %tmp85_305 = arith.divf %tmp4_19, %tmp85 : tensor<2x1xf32> loc(#loc410)
+      %tmp86 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc411)
+      %tmp86_306 = arith.addf %tmp85_305, %tmp86 : tensor<2x1xf32> loc(#loc411)
+      %tmp87 = tt.extern_elementwise %tmp86_306 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc412)
+      %tmp88 = tt.broadcast %tmp87 : tensor<2x1xf32> -> tensor<2x64xf32> loc(#loc413)
+      %tmp88_307 = arith.mulf %tmp83_304, %tmp88 : tensor<2x64xf32> loc(#loc413)
+      %tmp89 = arith.constant 2 : i32 loc(#loc414)
+      %tmp89_308 = arith.constant 2 : i32 loc(#loc414)
+      %tmp89_309 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc414)
+      %tmp89_310 = arith.muli %tmp89_309, %r0_4_30 : tensor<1x64xi32> loc(#loc414)
+      %tmp89_311 = tt.broadcast %tmp89_310 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc415)
+      %tmp89_312 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<2x64x!tt.ptr<bf16>> loc(#loc416)
+      %tmp89_313 = tt.addptr %tmp89_312, %tmp89_311 : tensor<2x64x!tt.ptr<bf16>>, tensor<2x64xi32> loc(#loc416)
+      %tmp89_314 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc417)
+      %tmp89_315 = arith.constant 0.000000e+00 : f32 loc(#loc418)
+      %tmp89_316 = tt.broadcast %tmp89_314 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc418)
+      %tmp89_317 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc418)
+      %tmp89_318 = arith.truncf %tmp89_317 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc418)
+      %tmp89_319 = tt.load %tmp89_313, %tmp89_316, %tmp89_318 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>> loc(#loc418)
+      %tmp89_320 = arith.extf %tmp89_319 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc419)
+      %tmp91 = arith.mulf %tmp88_307, %tmp89_320 : tensor<2x64xf32> loc(#loc420)
+      %tmp93 = arith.constant 0.000000e+00 : f32 loc(#loc421)
+      %tmp93_321 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc421)
+      %tmp94 = tt.broadcast %tmp32_171 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc422)
+      %tmp94_322 = arith.select %tmp94, %tmp91, %tmp93_321 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc422)
+      %tmp95 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc423)
+      %tmp95_323 = arith.select %tmp95, %tmp82_275, %tmp94_322 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc423)
+      %tmp98 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc424)
+      %tmp98_324 = arith.divf %tmp4_19, %tmp98 : tensor<2x1xf32> loc(#loc424)
+      %tmp99 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc425)
+      %tmp99_325 = arith.addf %tmp98_324, %tmp99 : tensor<2x1xf32> loc(#loc425)
+      %tmp100 = tt.extern_elementwise %tmp99_325 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc426)
+      %tmp101 = tt.broadcast %tmp100 : tensor<2x1xf32> -> tensor<2x64xf32> loc(#loc427)
+      %tmp101_326 = arith.mulf %tmp96_104, %tmp101 : tensor<2x64xf32> loc(#loc427)
+      %tmp104 = tt.broadcast %tmp102_110 : tensor<1x64xf32> -> tensor<2x64xf32> loc(#loc428)
+      %tmp104_327 = arith.mulf %tmp101_326, %tmp104 : tensor<2x64xf32> loc(#loc428)
+      %tmp107 = arith.mulf %tmp104_327, %tmp63_68 : tensor<2x64xf32> loc(#loc429)
+      %tmp109 = arith.mulf %tmp95_323, %tmp66_80 : tensor<2x64xf32> loc(#loc430)
+      %tmp110 = arith.addf %tmp107, %tmp109 : tensor<2x64xf32> loc(#loc431)
+      %c128_i32 = arith.constant 128 : i32 loc(#loc204)
+      %c128_i32_328 = arith.constant 128 : i32 loc(#loc204)
+      %cst = arith.constant dense<128> : tensor<2x1xi32> loc(#loc204)
+      %8 = arith.muli %cst, %xindex_7 : tensor<2x1xi32> loc(#loc204)
+      %9 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc205)
+      %10 = tt.broadcast %8 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc205)
+      %11 = arith.addi %9, %10 : tensor<2x64xi32> loc(#loc205)
+      %12 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<2x64x!tt.ptr<bf16>> loc(#loc206)
+      %13 = tt.addptr %12, %11 : tensor<2x64x!tt.ptr<bf16>>, tensor<2x64xi32> loc(#loc206)
+      %14 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc207)
+      %15 = arith.truncf %tmp68 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc207)
+      tt.store %13, %15, %14 : tensor<2x64x!tt.ptr<bf16>> loc(#loc207)
+      %c128_i32_329 = arith.constant 128 : i32 loc(#loc208)
+      %c128_i32_330 = arith.constant 128 : i32 loc(#loc208)
+      %cst_331 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc208)
+      %16 = arith.muli %cst_331, %xindex_7 : tensor<2x1xi32> loc(#loc208)
+      %17 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc209)
+      %18 = tt.broadcast %16 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc209)
+      %19 = arith.addi %17, %18 : tensor<2x64xi32> loc(#loc209)
+      %20 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<2x64x!tt.ptr<bf16>> loc(#loc210)
+      %21 = tt.addptr %20, %19 : tensor<2x64x!tt.ptr<bf16>>, tensor<2x64xi32> loc(#loc210)
+      %22 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc211)
+      %23 = arith.truncf %tmp110 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc211)
+      tt.store %21, %23, %22 : tensor<2x64x!tt.ptr<bf16>> loc(#loc211)
+    } loc(#loc44)
+    tt.return loc(#loc212)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.sum__fp32S2_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<2x64xf32> loc("input"(#loc213))) -> tensor<2xf32> attributes {noinline = false} {
+    %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc214)
+      tt.reduce.return %2 : f32 loc(#loc214)
+    }) : (tensor<2x64xf32>) -> tensor<2xf32> loc(#loc214)
+    tt.return %0 : tensor<2xf32> loc(#loc216)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<2xf32> loc(#loc217)
+    tt.return %1 : tensor<2xf32> loc(#loc217)
+  } loc(#loc213)
+  tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc218)), %b: f32 loc("b"(#loc218))) -> f32 attributes {noinline = false} {
+    %0 = arith.addf %a, %b : f32 loc(#loc219)
+    tt.return %0 : f32 loc(#loc220)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : f32 loc(#loc221)
+    tt.return %1 : f32 loc(#loc221)
+  } loc(#loc218)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":25:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":30:43)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":32:44)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:45)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:56)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:46)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:42)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:53)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:64)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":68:35)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":69:25)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":70:35)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:52)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:63)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":74:16)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":76:16)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:57)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:55)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:63)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:95)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":85:42)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":88:35)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":89:24)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:37)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:48)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:59)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":92:16)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":93:25)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":94:16)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":95:24)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":96:32)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:53)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:59)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:91)
+#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":102:42)
+#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":106:16)
+#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":107:25)
+#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":108:16)
+#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":109:24)
+#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":110:32)
+#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:44)
+#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc149 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:55)
+#loc150 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc151 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:66)
+#loc152 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc153 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc154 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:81)
+#loc155 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc156 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc157 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc158 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc159 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc160 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc161 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:57)
+#loc162 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:55)
+#loc163 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:63)
+#loc164 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc165 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:95)
+#loc166 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc167 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc168 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc169 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc170 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":132:42)
+#loc171 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc172 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:44)
+#loc173 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc174 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:55)
+#loc175 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc176 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:66)
+#loc177 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc178 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc179 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:81)
+#loc180 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc181 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc182 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":136:24)
+#loc183 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":137:24)
+#loc184 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":138:32)
+#loc185 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc186 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:53)
+#loc187 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:59)
+#loc188 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc189 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:91)
+#loc190 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc191 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc192 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc193 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":144:42)
+#loc194 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc195 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc196 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":148:24)
+#loc197 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":149:24)
+#loc198 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":150:33)
+#loc199 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc200 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc201 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc202 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc203 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc204 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc205 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc206 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc207 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc208 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:43)
+#loc209 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:39)
+#loc210 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc211 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc212 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc214 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc216 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11)
+#loc217 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4)
+#loc219 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc220 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11)
+#loc221 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4)
+#loc231 = loc("xnumel"(#loc1))
+#loc232 = loc("r0_numel"(#loc2))
+#loc233 = loc("xoffset"(#loc3))
+#loc234 = loc("xoffset"(#loc4))
+#loc235 = loc("xindex"(#loc5))
+#loc236 = loc("xindex"(#loc6))
+#loc237 = loc("xindex"(#loc7))
+#loc238 = loc("xmask"(#loc8))
+#loc239 = loc("r0_base"(#loc9))
+#loc240 = loc("r0_base"(#loc10))
+#loc241 = loc("x0"(#loc11))
+#loc242 = loc("x1"(#loc12))
+#loc243 = loc("_tmp4"(#loc13))
+#loc244 = loc("_tmp10"(#loc14))
+#loc245 = loc("_tmp4"(#loc15))
+#loc246 = loc("r0_index"(#loc16))
+#loc247 = loc("r0_mask"(#loc17))
+#loc248 = loc("tmp0"(#loc18))
+#loc249 = loc("tmp0"(#loc19))
+#loc250 = loc("tmp0"(#loc20))
+#loc251 = loc("tmp0"(#loc21))
+#loc252 = loc("tmp0"(#loc22))
+#loc253 = loc("tmp0"(#loc23))
+#loc254 = loc("tmp0"(#loc24))
+#loc255 = loc("tmp0"(#loc25))
+#loc256 = loc("tmp6"(#loc26))
+#loc257 = loc("tmp6"(#loc27))
+#loc258 = loc("tmp6"(#loc28))
+#loc259 = loc("tmp6"(#loc29))
+#loc260 = loc("tmp6"(#loc30))
+#loc261 = loc("tmp6"(#loc31))
+#loc262 = loc("tmp6"(#loc32))
+#loc263 = loc("tmp2"(#loc33))
+#loc264 = loc("tmp5"(#loc34))
+#loc265 = loc("_tmp4"(#loc35))
+#loc266 = loc("tmp8"(#loc36))
+#loc267 = loc("tmp11"(#loc37))
+#loc268 = loc("_tmp10"(#loc38))
+#loc269 = loc("tmp4"(#loc40))
+#loc270 = loc("tmp4"(#loc41))
+#loc271 = loc("tmp10"(#loc42))
+#loc272 = loc("tmp10"(#loc43))
+#loc273 = loc("r0_index"(#loc45))
+#loc274 = loc("r0_mask"(#loc46))
+#loc275 = loc("r0_3"(#loc47))
+#loc276 = loc("r0_4"(#loc48))
+#loc277 = loc("tmp50"(#loc49))
+#loc278 = loc("tmp50"(#loc50))
+#loc279 = loc("tmp50"(#loc51))
+#loc280 = loc("tmp50"(#loc52))
+#loc281 = loc("tmp50"(#loc53))
+#loc282 = loc("tmp50"(#loc54))
+#loc283 = loc("tmp50"(#loc55))
+#loc284 = loc("tmp58"(#loc56))
+#loc285 = loc("tmp58"(#loc57))
+#loc286 = loc("tmp58"(#loc58))
+#loc287 = loc("tmp63"(#loc59))
+#loc288 = loc("tmp63"(#loc60))
+#loc289 = loc("tmp63"(#loc61))
+#loc290 = loc("tmp63"(#loc62))
+#loc291 = loc("tmp66"(#loc63))
+#loc292 = loc("tmp66"(#loc64))
+#loc293 = loc("tmp66"(#loc65))
+#loc294 = loc("tmp66"(#loc66))
+#loc295 = loc("tmp96"(#loc67))
+#loc296 = loc("tmp96"(#loc68))
+#loc297 = loc("tmp96"(#loc69))
+#loc298 = loc("tmp96"(#loc70))
+#loc299 = loc("tmp96"(#loc71))
+#loc300 = loc("tmp96"(#loc72))
+#loc301 = loc("tmp96"(#loc73))
+#loc302 = loc("tmp96"(#loc74))
+#loc303 = loc("tmp102"(#loc75))
+#loc304 = loc("tmp102"(#loc76))
+#loc305 = loc("tmp102"(#loc77))
+#loc306 = loc("tmp13"(#loc78))
+#loc307 = loc("tmp14"(#loc79))
+#loc308 = loc("tmp15"(#loc80))
+#loc309 = loc("tmp16"(#loc81))
+#loc310 = loc("tmp17"(#loc82))
+#loc311 = loc("tmp17"(#loc83))
+#loc312 = loc("tmp17"(#loc84))
+#loc313 = loc("tmp17"(#loc85))
+#loc314 = loc("tmp17"(#loc86))
+#loc315 = loc("tmp17"(#loc87))
+#loc316 = loc("tmp17"(#loc88))
+#loc317 = loc("tmp17"(#loc89))
+#loc318 = loc("tmp17"(#loc90))
+#loc319 = loc("tmp17"(#loc91))
+#loc320 = loc("tmp19"(#loc92))
+#loc321 = loc("tmp20"(#loc93))
+#loc322 = loc("tmp21"(#loc94))
+#loc323 = loc("tmp22"(#loc95))
+#loc324 = loc("tmp23"(#loc96))
+#loc325 = loc("tmp24"(#loc97))
+#loc326 = loc("tmp25"(#loc98))
+#loc327 = loc("tmp25"(#loc99))
+#loc328 = loc("tmp25"(#loc100))
+#loc329 = loc("tmp25"(#loc101))
+#loc330 = loc("tmp25"(#loc102))
+#loc331 = loc("tmp25"(#loc103))
+#loc332 = loc("tmp25"(#loc104))
+#loc333 = loc("tmp27"(#loc105))
+#loc334 = loc("tmp29"(#loc106))
+#loc335 = loc("tmp30"(#loc107))
+#loc336 = loc("tmp31"(#loc108))
+#loc337 = loc("tmp32"(#loc109))
+#loc338 = loc("tmp33"(#loc110))
+#loc339 = loc("tmp34"(#loc111))
+#loc340 = loc("tmp35"(#loc112))
+#loc341 = loc("tmp35"(#loc113))
+#loc342 = loc("tmp35"(#loc114))
+#loc343 = loc("tmp35"(#loc115))
+#loc344 = loc("tmp35"(#loc116))
+#loc345 = loc("tmp35"(#loc117))
+#loc346 = loc("tmp35"(#loc118))
+#loc347 = loc("tmp35"(#loc119))
+#loc348 = loc("tmp35"(#loc120))
+#loc349 = loc("tmp37"(#loc121))
+#loc350 = loc("tmp38"(#loc122))
+#loc351 = loc("tmp39"(#loc123))
+#loc352 = loc("tmp40"(#loc124))
+#loc353 = loc("tmp41"(#loc125))
+#loc354 = loc("tmp42"(#loc126))
+#loc355 = loc("tmp43"(#loc127))
+#loc356 = loc("tmp43"(#loc128))
+#loc357 = loc("tmp43"(#loc129))
+#loc358 = loc("tmp43"(#loc130))
+#loc359 = loc("tmp43"(#loc131))
+#loc360 = loc("tmp43"(#loc132))
+#loc361 = loc("tmp45"(#loc133))
+#loc362 = loc("tmp47"(#loc134))
+#loc363 = loc("tmp48"(#loc135))
+#loc364 = loc("tmp49"(#loc136))
+#loc365 = loc("tmp52"(#loc137))
+#loc366 = loc("tmp53"(#loc138))
+#loc367 = loc("tmp54"(#loc139))
+#loc368 = loc("tmp55"(#loc140))
+#loc369 = loc("tmp56"(#loc141))
+#loc370 = loc("tmp57"(#loc142))
+#loc371 = loc("tmp60"(#loc143))
+#loc372 = loc("tmp64"(#loc144))
+#loc373 = loc("tmp67"(#loc145))
+#loc374 = loc("tmp68"(#loc146))
+#loc375 = loc("tmp70"(#loc147))
+#loc376 = loc("tmp70"(#loc148))
+#loc377 = loc("tmp70"(#loc149))
+#loc378 = loc("tmp70"(#loc150))
+#loc379 = loc("tmp70"(#loc151))
+#loc380 = loc("tmp70"(#loc152))
+#loc381 = loc("tmp70"(#loc153))
+#loc382 = loc("tmp70"(#loc154))
+#loc383 = loc("tmp70"(#loc155))
+#loc384 = loc("tmp70"(#loc156))
+#loc385 = loc("tmp72"(#loc157))
+#loc386 = loc("tmp73"(#loc158))
+#loc387 = loc("tmp74"(#loc159))
+#loc388 = loc("tmp75"(#loc160))
+#loc389 = loc("tmp76"(#loc161))
+#loc390 = loc("tmp76"(#loc162))
+#loc391 = loc("tmp76"(#loc163))
+#loc392 = loc("tmp76"(#loc164))
+#loc393 = loc("tmp76"(#loc165))
+#loc394 = loc("tmp76"(#loc166))
+#loc395 = loc("tmp76"(#loc167))
+#loc396 = loc("tmp78"(#loc168))
+#loc397 = loc("tmp80"(#loc169))
+#loc398 = loc("tmp81"(#loc170))
+#loc399 = loc("tmp82"(#loc171))
+#loc400 = loc("tmp83"(#loc172))
+#loc401 = loc("tmp83"(#loc173))
+#loc402 = loc("tmp83"(#loc174))
+#loc403 = loc("tmp83"(#loc175))
+#loc404 = loc("tmp83"(#loc176))
+#loc405 = loc("tmp83"(#loc177))
+#loc406 = loc("tmp83"(#loc178))
+#loc407 = loc("tmp83"(#loc179))
+#loc408 = loc("tmp83"(#loc180))
+#loc409 = loc("tmp83"(#loc181))
+#loc410 = loc("tmp85"(#loc182))
+#loc411 = loc("tmp86"(#loc183))
+#loc412 = loc("tmp87"(#loc184))
+#loc413 = loc("tmp88"(#loc185))
+#loc414 = loc("tmp89"(#loc186))
+#loc415 = loc("tmp89"(#loc187))
+#loc416 = loc("tmp89"(#loc188))
+#loc417 = loc("tmp89"(#loc189))
+#loc418 = loc("tmp89"(#loc190))
+#loc419 = loc("tmp89"(#loc191))
+#loc420 = loc("tmp91"(#loc192))
+#loc421 = loc("tmp93"(#loc193))
+#loc422 = loc("tmp94"(#loc194))
+#loc423 = loc("tmp95"(#loc195))
+#loc424 = loc("tmp98"(#loc196))
+#loc425 = loc("tmp99"(#loc197))
+#loc426 = loc("tmp100"(#loc198))
+#loc427 = loc("tmp101"(#loc199))
+#loc428 = loc("tmp104"(#loc200))
+#loc429 = loc("tmp107"(#loc201))
+#loc430 = loc("tmp109"(#loc202))
+#loc431 = loc("tmp110"(#loc203))
+#loc435 = loc("_tmp10"(#loc245))
diff --git a/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..2e4ed4d50da7439900f325d63a2505365f09cf0c
--- /dev/null
+++ b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir
@@ -0,0 +1,557 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [1, 2], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0]}>
+#blocked2 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [1, 0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc1 = loc(unknown)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc147 = loc("in_out_ptr0"(#loc))
+#loc148 = loc("in_out_ptr1"(#loc))
+#loc149 = loc("in_ptr0"(#loc))
+#loc150 = loc("in_ptr1"(#loc))
+#loc151 = loc("in_ptr2"(#loc))
+#loc152 = loc("in_ptr3"(#loc))
+#loc153 = loc("in_ptr4"(#loc))
+#loc154 = loc("xnumel"(#loc))
+#loc155 = loc("r0_numel"(#loc))
+#loc185 = loc("tmp4"(#loc33))
+#loc187 = loc("tmp10"(#loc36))
+#loc292 = loc(callsite(#loc1 at #loc185))
+#loc294 = loc(callsite(#loc1 at #loc187))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4097> : tensor<1x64xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<1> : tensor<1x64xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<1> : tensor<1x64xi64, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<2> : tensor<1x64xi32, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<36864> : tensor<2x1xi32, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<36864> : tensor<2x1xi32, #blocked1> loc(#loc1)
+    %cst_5 = arith.constant dense<128> : tensor<2x1xi32, #blocked> loc(#loc1)
+    %cst_6 = arith.constant dense<128> : tensor<2x1xi32, #blocked1> loc(#loc1)
+    %cst_7 = arith.constant dense<4096> : tensor<1x64xi32, #blocked> loc(#loc1)
+    %cst_8 = arith.constant dense<4096> : tensor<1x64xi32, #blocked1> loc(#loc1)
+    %cst_9 = arith.constant dense<128> : tensor<1x64xi32, #blocked> loc(#loc1)
+    %cst_10 = arith.constant dense<128> : tensor<1x64xi32, #blocked1> loc(#loc1)
+    %cst_11 = arith.constant dense<32> : tensor<2x1xi32, #blocked> loc(#loc1)
+    %cst_12 = arith.constant dense<32> : tensor<2x1xi32, #blocked1> loc(#loc1)
+    %c2_i32 = arith.constant 2 : i32 loc(#loc1)
+    %cst_13 = arith.constant dense<0.000000e+00> : tensor<2x64xbf16, #blocked1> loc(#loc1)
+    %cst_14 = arith.constant dense<0.000000e+00> : tensor<2x64xbf16, #blocked> loc(#loc1)
+    %cst_15 = arith.constant dense<128> : tensor<1x64xi32, #blocked2> loc(#loc1)
+    %cst_16 = arith.constant dense<0.000000e+00> : tensor<1x64xbf16, #blocked2> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %cst_17 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32, #blocked1> loc(#loc1)
+    %cst_18 = arith.constant dense<1.280000e+02> : tensor<2x1xf32, #blocked1> loc(#loc1)
+    %cst_19 = arith.constant dense<0.000000e+00> : tensor<2x64xf32, #blocked> loc(#loc1)
+    %cst_20 = arith.constant dense<0.000000e+00> : tensor<2x64xf32, #blocked1> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc156)
+    %xoffset_21 = arith.muli %xoffset, %c2_i32 : i32 loc(#loc157)
+    %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc158)
+    %xindex_22 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc158)
+    %xindex_23 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<2x1xi32, #blocked1> loc(#loc158)
+    %xindex_24 = tt.expand_dims %xindex_22 {axis = 1 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<2x1xi32, #blocked> loc(#loc158)
+    %xindex_25 = tt.splat %xoffset_21 : i32 -> tensor<2x1xi32, #blocked1> loc(#loc159)
+    %xindex_26 = tt.splat %xoffset_21 : i32 -> tensor<2x1xi32, #blocked> loc(#loc159)
+    %xindex_27 = arith.addi %xindex_25, %xindex_23 : tensor<2x1xi32, #blocked1> loc(#loc159)
+    %xindex_28 = arith.addi %xindex_26, %xindex_24 : tensor<2x1xi32, #blocked> loc(#loc159)
+    %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc160)
+    %r0_base_29 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc160)
+    %r0_base_30 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> loc(#loc160)
+    %r0_base_31 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc160)
+    %r0_base_32 = tt.expand_dims %r0_base_29 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> loc(#loc160)
+    %r0_base_33 = tt.expand_dims %r0_base_30 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x64xi32, #blocked2> loc(#loc160)
+    %x0 = arith.remsi %xindex_27, %cst_12 : tensor<2x1xi32, #blocked1> loc(#loc161)
+    %x0_34 = arith.remsi %xindex_28, %cst_11 : tensor<2x1xi32, #blocked> loc(#loc161)
+    %x1 = arith.divsi %xindex_27, %cst_12 : tensor<2x1xi32, #blocked1> loc(#loc162)
+    %x1_35 = arith.divsi %xindex_28, %cst_11 : tensor<2x1xi32, #blocked> loc(#loc162)
+    %tmp0 = arith.muli %x0, %cst_6 : tensor<2x1xi32, #blocked1> loc(#loc163)
+    %tmp0_36 = tt.broadcast %tmp0 : tensor<2x1xi32, #blocked1> -> tensor<2x64xi32, #blocked1> loc(#loc164)
+    %tmp0_37 = arith.muli %x1, %cst_4 : tensor<2x1xi32, #blocked1> loc(#loc165)
+    %tmp0_38 = tt.broadcast %tmp0_37 : tensor<2x1xi32, #blocked1> -> tensor<2x64xi32, #blocked1> loc(#loc166)
+    %tmp0_39 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x64x!tt.ptr<bf16>, #blocked1> loc(#loc167)
+    %_tmp10:2 = scf.for %_tmp10_54 = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%arg10 = %cst_20, %arg11 = %cst_20) -> (tensor<2x64xf32, #blocked1>, tensor<2x64xf32, #blocked1>)  : i32 {
+      %r0_index = tt.splat %_tmp10_54 : i32 -> tensor<1x64xi32, #blocked1> loc(#loc169)
+      %r0_index_55 = arith.addi %r0_index, %r0_base_31 : tensor<1x64xi32, #blocked1> loc(#loc169)
+      %r0_mask = arith.cmpi slt, %r0_index_55, %cst_10 : tensor<1x64xi32, #blocked1> loc(#loc170)
+      %tmp0_56 = arith.addi %r0_index_55, %cst_8 : tensor<1x64xi32, #blocked1> loc(#loc171)
+      %tmp0_57 = tt.broadcast %tmp0_56 : tensor<1x64xi32, #blocked1> -> tensor<2x64xi32, #blocked1> loc(#loc164)
+      %tmp0_58 = arith.addi %tmp0_57, %tmp0_36 : tensor<2x64xi32, #blocked1> loc(#loc164)
+      %tmp0_59 = arith.addi %tmp0_58, %tmp0_38 : tensor<2x64xi32, #blocked1> loc(#loc166)
+      %tmp0_60 = tt.addptr %tmp0_39, %tmp0_59 : tensor<2x64x!tt.ptr<bf16>, #blocked1>, tensor<2x64xi32, #blocked1> loc(#loc167)
+      %tmp0_61 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked1> -> tensor<2x64xi1, #blocked1> loc(#loc172)
+      %tmp0_62 = tt.load %tmp0_60, %tmp0_61, %cst_13 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>, #blocked1> loc(#loc172)
+      %tmp0_63 = arith.extf %tmp0_62 : tensor<2x64xbf16, #blocked1> to tensor<2x64xf32, #blocked1> loc(#loc173)
+      %tmp6 = tt.broadcast %r0_index_55 : tensor<1x64xi32, #blocked1> -> tensor<2x64xi32, #blocked1> loc(#loc174)
+      %tmp6_64 = arith.addi %tmp6, %tmp0_36 : tensor<2x64xi32, #blocked1> loc(#loc174)
+      %tmp6_65 = arith.addi %tmp6_64, %tmp0_38 : tensor<2x64xi32, #blocked1> loc(#loc175)
+      %tmp6_66 = tt.addptr %tmp0_39, %tmp6_65 : tensor<2x64x!tt.ptr<bf16>, #blocked1>, tensor<2x64xi32, #blocked1> loc(#loc176)
+      %tmp6_67 = tt.load %tmp6_66, %tmp0_61, %cst_13 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>, #blocked1> loc(#loc177)
+      %tmp6_68 = arith.extf %tmp6_67 : tensor<2x64xbf16, #blocked1> to tensor<2x64xf32, #blocked1> loc(#loc178)
+      %tmp2 = arith.mulf %tmp0_63, %tmp0_63 : tensor<2x64xf32, #blocked1> loc(#loc179)
+      %tmp5 = arith.addf %arg10, %tmp2 : tensor<2x64xf32, #blocked1> loc(#loc180)
+      %_tmp4 = arith.select %tmp0_61, %tmp5, %arg10 : tensor<2x64xi1, #blocked1>, tensor<2x64xf32, #blocked1> loc(#loc181)
+      %tmp8 = arith.mulf %tmp6_68, %tmp6_68 : tensor<2x64xf32, #blocked1> loc(#loc182)
+      %tmp11 = arith.addf %arg11, %tmp8 : tensor<2x64xf32, #blocked1> loc(#loc183)
+      %_tmp10_69 = arith.select %tmp0_61, %tmp11, %arg11 : tensor<2x64xi1, #blocked1>, tensor<2x64xf32, #blocked1> loc(#loc184)
+      scf.yield %_tmp4, %_tmp10_69 : tensor<2x64xf32, #blocked1>, tensor<2x64xf32, #blocked1> loc(#loc31)
+    } loc(#loc290)
+    %tmp4 = "tt.reduce"(%_tmp10#0) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_54: f32 loc(callsite(#loc1 at #loc185)), %tmp4_55: f32 loc(callsite(#loc1 at #loc185))):
+      %tmp4_56 = arith.addf %tmp4_54, %tmp4_55 : f32 loc(#loc297)
+      tt.reduce.return %tmp4_56 : f32 loc(#loc291)
+    }) : (tensor<2x64xf32, #blocked1>) -> tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc291)
+    %tmp4_40 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<2x1xf32, #blocked1> loc(#loc186)
+    %tmp10 = "tt.reduce"(%_tmp10#1) <{axis = 1 : i32}> ({
+    ^bb0(%tmp10_54: f32 loc(callsite(#loc1 at #loc187)), %tmp10_55: f32 loc(callsite(#loc1 at #loc187))):
+      %tmp10_56 = arith.addf %tmp10_54, %tmp10_55 : f32 loc(#loc298)
+      tt.reduce.return %tmp10_56 : f32 loc(#loc293)
+    }) : (tensor<2x64xf32, #blocked1>) -> tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc293)
+    %tmp10_41 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<2x1xf32, #blocked1> loc(#loc188)
+    %tmp50 = arith.muli %x0_34, %cst_5 : tensor<2x1xi32, #blocked> loc(#loc189)
+    %tmp50_42 = tt.broadcast %tmp50 : tensor<2x1xi32, #blocked> -> tensor<2x64xi32, #blocked> loc(#loc190)
+    %tmp50_43 = arith.muli %x1_35, %cst_3 : tensor<2x1xi32, #blocked> loc(#loc191)
+    %tmp50_44 = tt.broadcast %tmp50_43 : tensor<2x1xi32, #blocked> -> tensor<2x64xi32, #blocked> loc(#loc192)
+    %tmp50_45 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x64x!tt.ptr<bf16>, #blocked> loc(#loc193)
+    %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>, #blocked> loc(#loc194)
+    %tmp58_46 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>, #blocked2> loc(#loc194)
+    %tmp63 = arith.muli %x1, %cst_6 : tensor<2x1xi32, #blocked1> loc(#loc195)
+    %tmp63_47 = tt.broadcast %tmp63 : tensor<2x1xi32, #blocked1> -> tensor<2x64xi32, #blocked1> loc(#loc196)
+    %tmp63_48 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<2x64x!tt.ptr<f32>, #blocked1> loc(#loc197)
+    %tmp66 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<2x64x!tt.ptr<f32>, #blocked1> loc(#loc198)
+    %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>, #blocked> loc(#loc199)
+    %tmp102_49 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>, #blocked2> loc(#loc199)
+    %tmp20 = arith.divf %tmp10_41, %cst_18 : tensor<2x1xf32, #blocked1> loc(#loc200)
+    %tmp22 = arith.addf %tmp20, %cst_17 : tensor<2x1xf32, #blocked1> loc(#loc201)
+    %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32, #blocked1>) -> tensor<2x1xf32, #blocked1> loc(#loc202)
+    %tmp24 = ttg.convert_layout %tmp23 : tensor<2x1xf32, #blocked1> -> tensor<2x1xf32, #blocked> loc(#loc203)
+    %tmp24_50 = tt.broadcast %tmp24 : tensor<2x1xf32, #blocked> -> tensor<2x64xf32, #blocked> loc(#loc203)
+    %tmp24_51 = tt.broadcast %tmp23 : tensor<2x1xf32, #blocked1> -> tensor<2x64xf32, #blocked1> loc(#loc203)
+    %tmp72 = arith.divf %tmp4_40, %cst_18 : tensor<2x1xf32, #blocked1> loc(#loc204)
+    %tmp73 = arith.addf %tmp72, %cst_17 : tensor<2x1xf32, #blocked1> loc(#loc205)
+    %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32, #blocked1>) -> tensor<2x1xf32, #blocked1> loc(#loc206)
+    %tmp75 = ttg.convert_layout %tmp74 : tensor<2x1xf32, #blocked1> -> tensor<2x1xf32, #blocked> loc(#loc207)
+    %tmp75_52 = tt.broadcast %tmp75 : tensor<2x1xf32, #blocked> -> tensor<2x64xf32, #blocked> loc(#loc207)
+    %tmp75_53 = tt.broadcast %tmp74 : tensor<2x1xf32, #blocked1> -> tensor<2x64xf32, #blocked1> loc(#loc207)
+    %0 = arith.muli %xindex_27, %cst_6 : tensor<2x1xi32, #blocked1> loc(#loc57)
+    %1 = tt.broadcast %0 : tensor<2x1xi32, #blocked1> -> tensor<2x64xi32, #blocked1> loc(#loc58)
+    %2 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<2x64x!tt.ptr<bf16>, #blocked1> loc(#loc59)
+    %3 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<2x64x!tt.ptr<bf16>, #blocked1> loc(#loc60)
+    scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32, #blocked1> loc(#loc208)
+      %r0_index_54 = tt.splat %r0_offset : i32 -> tensor<1x64xi32, #blocked> loc(#loc208)
+      %r0_index_55 = tt.splat %r0_offset : i32 -> tensor<1x64xi32, #blocked2> loc(#loc208)
+      %r0_index_56 = arith.addi %r0_index, %r0_base_31 : tensor<1x64xi32, #blocked1> loc(#loc208)
+      %r0_index_57 = arith.addi %r0_index_54, %r0_base_32 : tensor<1x64xi32, #blocked> loc(#loc208)
+      %r0_index_58 = arith.addi %r0_index_55, %r0_base_33 : tensor<1x64xi32, #blocked2> loc(#loc208)
+      %r0_mask = arith.cmpi slt, %r0_index_56, %cst_10 : tensor<1x64xi32, #blocked1> loc(#loc209)
+      %r0_mask_59 = arith.cmpi slt, %r0_index_57, %cst_9 : tensor<1x64xi32, #blocked> loc(#loc209)
+      %r0_mask_60 = arith.cmpi slt, %r0_index_58, %cst_15 : tensor<1x64xi32, #blocked2> loc(#loc209)
+      %r0_3 = arith.remsi %r0_index_57, %cst_2 : tensor<1x64xi32, #blocked> loc(#loc210)
+      %r0_4 = arith.divsi %r0_index_57, %cst_2 : tensor<1x64xi32, #blocked> loc(#loc211)
+      %tmp50_61 = tt.broadcast %r0_index_56 : tensor<1x64xi32, #blocked1> -> tensor<2x64xi32, #blocked1> loc(#loc190)
+      %tmp50_62 = arith.addi %tmp50_61, %tmp0_36 : tensor<2x64xi32, #blocked1> loc(#loc190)
+      %tmp50_63 = arith.addi %tmp50_62, %tmp0_38 : tensor<2x64xi32, #blocked1> loc(#loc192)
+      %tmp50_64 = tt.addptr %tmp0_39, %tmp50_63 : tensor<2x64x!tt.ptr<bf16>, #blocked1>, tensor<2x64xi32, #blocked1> loc(#loc193)
+      %tmp50_65 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked1> -> tensor<2x64xi1, #blocked1> loc(#loc212)
+      %tmp50_66 = tt.load %tmp50_64, %tmp50_65, %cst_13 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>, #blocked1> loc(#loc212)
+      %tmp50_67 = arith.extf %tmp50_66 : tensor<2x64xbf16, #blocked1> to tensor<2x64xf32, #blocked1> loc(#loc213)
+      %tmp58_68 = tt.addptr %tmp58_46, %r0_index_58 : tensor<1x64x!tt.ptr<bf16>, #blocked2>, tensor<1x64xi32, #blocked2> loc(#loc194)
+      %tmp58_69 = tt.load %tmp58_68, %r0_mask_60, %cst_16 evictionPolicy = evict_last : tensor<1x64x!tt.ptr<bf16>, #blocked2> loc(#loc214)
+      %tmp58_70 = arith.extf %tmp58_69 : tensor<1x64xbf16, #blocked2> to tensor<1x64xf32, #blocked2> loc(#loc215)
+      %tmp63_71 = arith.addi %tmp50_61, %tmp63_47 : tensor<2x64xi32, #blocked1> loc(#loc196)
+      %tmp63_72 = tt.addptr %tmp63_48, %tmp63_71 : tensor<2x64x!tt.ptr<f32>, #blocked1>, tensor<2x64xi32, #blocked1> loc(#loc197)
+      %tmp63_73 = tt.load %tmp63_72, %tmp50_65, %cst_20 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<f32>, #blocked1> loc(#loc216)
+      %tmp63_74 = ttg.convert_layout %tmp63_73 : tensor<2x64xf32, #blocked1> -> tensor<2x64xf32, #blocked> loc(#loc216)
+      %tmp66_75 = tt.addptr %tmp66, %tmp63_71 : tensor<2x64x!tt.ptr<f32>, #blocked1>, tensor<2x64xi32, #blocked1> loc(#loc198)
+      %tmp66_76 = tt.load %tmp66_75, %tmp50_65, %cst_20 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<f32>, #blocked1> loc(#loc217)
+      %tmp66_77 = ttg.convert_layout %tmp66_76 : tensor<2x64xf32, #blocked1> -> tensor<2x64xf32, #blocked> loc(#loc217)
+      %tmp96 = arith.addi %r0_index_56, %cst_8 : tensor<1x64xi32, #blocked1> loc(#loc218)
+      %tmp96_78 = tt.broadcast %tmp96 : tensor<1x64xi32, #blocked1> -> tensor<2x64xi32, #blocked1> loc(#loc219)
+      %tmp96_79 = arith.addi %tmp96_78, %tmp0_36 : tensor<2x64xi32, #blocked1> loc(#loc219)
+      %tmp96_80 = arith.addi %tmp96_79, %tmp0_38 : tensor<2x64xi32, #blocked1> loc(#loc220)
+      %tmp96_81 = tt.addptr %tmp0_39, %tmp96_80 : tensor<2x64x!tt.ptr<bf16>, #blocked1>, tensor<2x64xi32, #blocked1> loc(#loc221)
+      %tmp96_82 = tt.load %tmp96_81, %tmp50_65, %cst_13 evictionPolicy = evict_first : tensor<2x64x!tt.ptr<bf16>, #blocked1> loc(#loc222)
+      %tmp96_83 = arith.extf %tmp96_82 : tensor<2x64xbf16, #blocked1> to tensor<2x64xf32, #blocked1> loc(#loc223)
+      %tmp102_84 = tt.addptr %tmp102_49, %r0_index_58 : tensor<1x64x!tt.ptr<bf16>, #blocked2>, tensor<1x64xi32, #blocked2> loc(#loc199)
+      %tmp102_85 = tt.load %tmp102_84, %r0_mask_60, %cst_16 evictionPolicy = evict_last : tensor<1x64x!tt.ptr<bf16>, #blocked2> loc(#loc224)
+      %tmp102_86 = arith.extf %tmp102_85 : tensor<1x64xbf16, #blocked2> to tensor<1x64xf32, #blocked2> loc(#loc225)
+      %tmp16 = arith.extsi %r0_3 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked> loc(#loc226)
+      %tmp16_87 = arith.cmpi slt, %tmp16, %cst_1 : tensor<1x64xi64, #blocked> loc(#loc226)
+      %tmp17 = arith.muli %r0_4, %cst_2 : tensor<1x64xi32, #blocked> loc(#loc227)
+      %tmp17_88 = arith.addi %tmp17, %cst_0 : tensor<1x64xi32, #blocked> loc(#loc228)
+      %tmp17_89 = tt.broadcast %tmp17_88 : tensor<1x64xi32, #blocked> -> tensor<2x64xi32, #blocked> loc(#loc229)
+      %tmp17_90 = arith.addi %tmp17_89, %tmp50_42 : tensor<2x64xi32, #blocked> loc(#loc229)
+      %tmp17_91 = arith.addi %tmp17_90, %tmp50_44 : tensor<2x64xi32, #blocked> loc(#loc230)
+      %tmp17_92 = tt.addptr %tmp50_45, %tmp17_91 : tensor<2x64x!tt.ptr<bf16>, #blocked>, tensor<2x64xi32, #blocked> loc(#loc231)
+      %tmp17_93 = arith.andi %r0_mask_59, %tmp16_87 : tensor<1x64xi1, #blocked> loc(#loc232)
+      %tmp17_94 = tt.broadcast %tmp17_93 : tensor<1x64xi1, #blocked> -> tensor<2x64xi1, #blocked> loc(#loc233)
+      %tmp17_95 = tt.load %tmp17_92, %tmp17_94, %cst_14 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>, #blocked> loc(#loc233)
+      %tmp17_96 = arith.extf %tmp17_95 : tensor<2x64xbf16, #blocked> to tensor<2x64xf32, #blocked> loc(#loc234)
+      %tmp24_97 = arith.mulf %tmp17_96, %tmp24_50 : tensor<2x64xf32, #blocked> loc(#loc203)
+      %tmp25 = tt.addptr %tmp58, %tmp17_88 : tensor<1x64x!tt.ptr<bf16>, #blocked>, tensor<1x64xi32, #blocked> loc(#loc235)
+      %tmp25_98 = tt.broadcast %tmp25 : tensor<1x64x!tt.ptr<bf16>, #blocked> -> tensor<2x64x!tt.ptr<bf16>, #blocked> loc(#loc235)
+      %tmp25_99 = tt.load %tmp25_98, %tmp17_94, %cst_14 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>, #blocked> loc(#loc236)
+      %tmp25_100 = arith.extf %tmp25_99 : tensor<2x64xbf16, #blocked> to tensor<2x64xf32, #blocked> loc(#loc237)
+      %tmp27 = arith.mulf %tmp24_97, %tmp25_100 : tensor<2x64xf32, #blocked> loc(#loc238)
+      %tmp29 = arith.subf %cst_19, %tmp27 : tensor<2x64xf32, #blocked> loc(#loc239)
+      %tmp31 = tt.broadcast %tmp16_87 : tensor<1x64xi1, #blocked> -> tensor<2x64xi1, #blocked> loc(#loc240)
+      %tmp32 = arith.cmpi sge, %tmp16, %cst_1 : tensor<1x64xi64, #blocked> loc(#loc241)
+      %tmp35 = tt.broadcast %tmp17 : tensor<1x64xi32, #blocked> -> tensor<2x64xi32, #blocked> loc(#loc242)
+      %tmp35_101 = arith.addi %tmp35, %tmp50_42 : tensor<2x64xi32, #blocked> loc(#loc242)
+      %tmp35_102 = arith.addi %tmp35_101, %tmp50_44 : tensor<2x64xi32, #blocked> loc(#loc243)
+      %tmp35_103 = tt.addptr %tmp50_45, %tmp35_102 : tensor<2x64x!tt.ptr<bf16>, #blocked>, tensor<2x64xi32, #blocked> loc(#loc244)
+      %tmp35_104 = arith.andi %r0_mask_59, %tmp32 : tensor<1x64xi1, #blocked> loc(#loc245)
+      %tmp35_105 = tt.broadcast %tmp35_104 : tensor<1x64xi1, #blocked> -> tensor<2x64xi1, #blocked> loc(#loc246)
+      %tmp35_106 = tt.load %tmp35_103, %tmp35_105, %cst_14 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>, #blocked> loc(#loc246)
+      %tmp35_107 = arith.extf %tmp35_106 : tensor<2x64xbf16, #blocked> to tensor<2x64xf32, #blocked> loc(#loc247)
+      %tmp42 = arith.mulf %tmp35_107, %tmp24_50 : tensor<2x64xf32, #blocked> loc(#loc248)
+      %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x64x!tt.ptr<bf16>, #blocked>, tensor<1x64xi32, #blocked> loc(#loc249)
+      %tmp43_108 = tt.broadcast %tmp43 : tensor<1x64x!tt.ptr<bf16>, #blocked> -> tensor<2x64x!tt.ptr<bf16>, #blocked> loc(#loc249)
+      %tmp43_109 = tt.load %tmp43_108, %tmp35_105, %cst_14 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>, #blocked> loc(#loc250)
+      %tmp43_110 = arith.extf %tmp43_109 : tensor<2x64xbf16, #blocked> to tensor<2x64xf32, #blocked> loc(#loc251)
+      %tmp45 = arith.mulf %tmp42, %tmp43_110 : tensor<2x64xf32, #blocked> loc(#loc252)
+      %tmp48 = tt.broadcast %tmp32 : tensor<1x64xi1, #blocked> -> tensor<2x64xi1, #blocked> loc(#loc253)
+      %tmp48_111 = arith.select %tmp48, %tmp45, %cst_19 : tensor<2x64xi1, #blocked>, tensor<2x64xf32, #blocked> loc(#loc253)
+      %tmp49 = arith.select %tmp31, %tmp29, %tmp48_111 : tensor<2x64xi1, #blocked>, tensor<2x64xf32, #blocked> loc(#loc295)
+      %tmp57 = arith.mulf %tmp50_67, %tmp24_51 : tensor<2x64xf32, #blocked1> loc(#loc255)
+      %tmp60 = ttg.convert_layout %tmp58_70 : tensor<1x64xf32, #blocked2> -> tensor<1x64xf32, #blocked1> loc(#loc256)
+      %tmp60_112 = tt.broadcast %tmp60 : tensor<1x64xf32, #blocked1> -> tensor<2x64xf32, #blocked1> loc(#loc256)
+      %tmp60_113 = arith.mulf %tmp57, %tmp60_112 : tensor<2x64xf32, #blocked1> loc(#loc256)
+      %tmp64 = arith.mulf %tmp60_113, %tmp63_73 : tensor<2x64xf32, #blocked1> loc(#loc257)
+      %tmp64_114 = ttg.convert_layout %tmp64 : tensor<2x64xf32, #blocked1> -> tensor<2x64xf32, #blocked> loc(#loc257)
+      %tmp67 = arith.mulf %tmp49, %tmp66_77 : tensor<2x64xf32, #blocked> loc(#loc258)
+      %tmp68 = arith.addf %tmp64_114, %tmp67 : tensor<2x64xf32, #blocked> loc(#loc259)
+      %tmp70 = arith.addi %tmp17, %cst : tensor<1x64xi32, #blocked> loc(#loc260)
+      %tmp70_115 = tt.broadcast %tmp70 : tensor<1x64xi32, #blocked> -> tensor<2x64xi32, #blocked> loc(#loc261)
+      %tmp70_116 = arith.addi %tmp70_115, %tmp50_42 : tensor<2x64xi32, #blocked> loc(#loc261)
+      %tmp70_117 = arith.addi %tmp70_116, %tmp50_44 : tensor<2x64xi32, #blocked> loc(#loc262)
+      %tmp70_118 = tt.addptr %tmp50_45, %tmp70_117 : tensor<2x64x!tt.ptr<bf16>, #blocked>, tensor<2x64xi32, #blocked> loc(#loc263)
+      %tmp70_119 = tt.load %tmp70_118, %tmp17_94, %cst_14 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>, #blocked> loc(#loc264)
+      %tmp70_120 = arith.extf %tmp70_119 : tensor<2x64xbf16, #blocked> to tensor<2x64xf32, #blocked> loc(#loc265)
+      %tmp75_121 = arith.mulf %tmp70_120, %tmp75_52 : tensor<2x64xf32, #blocked> loc(#loc207)
+      %tmp76 = tt.addptr %tmp102, %tmp17_88 : tensor<1x64x!tt.ptr<bf16>, #blocked>, tensor<1x64xi32, #blocked> loc(#loc266)
+      %tmp76_122 = tt.broadcast %tmp76 : tensor<1x64x!tt.ptr<bf16>, #blocked> -> tensor<2x64x!tt.ptr<bf16>, #blocked> loc(#loc266)
+      %tmp76_123 = tt.load %tmp76_122, %tmp17_94, %cst_14 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>, #blocked> loc(#loc267)
+      %tmp76_124 = arith.extf %tmp76_123 : tensor<2x64xbf16, #blocked> to tensor<2x64xf32, #blocked> loc(#loc268)
+      %tmp78 = arith.mulf %tmp75_121, %tmp76_124 : tensor<2x64xf32, #blocked> loc(#loc269)
+      %tmp80 = arith.subf %cst_19, %tmp78 : tensor<2x64xf32, #blocked> loc(#loc270)
+      %tmp83 = arith.addi %tmp17, %cst_7 : tensor<1x64xi32, #blocked> loc(#loc271)
+      %tmp83_125 = tt.broadcast %tmp83 : tensor<1x64xi32, #blocked> -> tensor<2x64xi32, #blocked> loc(#loc272)
+      %tmp83_126 = arith.addi %tmp83_125, %tmp50_42 : tensor<2x64xi32, #blocked> loc(#loc272)
+      %tmp83_127 = arith.addi %tmp83_126, %tmp50_44 : tensor<2x64xi32, #blocked> loc(#loc273)
+      %tmp83_128 = tt.addptr %tmp50_45, %tmp83_127 : tensor<2x64x!tt.ptr<bf16>, #blocked>, tensor<2x64xi32, #blocked> loc(#loc274)
+      %tmp83_129 = tt.load %tmp83_128, %tmp35_105, %cst_14 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>, #blocked> loc(#loc275)
+      %tmp83_130 = arith.extf %tmp83_129 : tensor<2x64xbf16, #blocked> to tensor<2x64xf32, #blocked> loc(#loc276)
+      %tmp88 = arith.mulf %tmp83_130, %tmp75_52 : tensor<2x64xf32, #blocked> loc(#loc277)
+      %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x64x!tt.ptr<bf16>, #blocked>, tensor<1x64xi32, #blocked> loc(#loc278)
+      %tmp89_131 = tt.broadcast %tmp89 : tensor<1x64x!tt.ptr<bf16>, #blocked> -> tensor<2x64x!tt.ptr<bf16>, #blocked> loc(#loc278)
+      %tmp89_132 = tt.load %tmp89_131, %tmp35_105, %cst_14 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>, #blocked> loc(#loc279)
+      %tmp89_133 = arith.extf %tmp89_132 : tensor<2x64xbf16, #blocked> to tensor<2x64xf32, #blocked> loc(#loc280)
+      %tmp91 = arith.mulf %tmp88, %tmp89_133 : tensor<2x64xf32, #blocked> loc(#loc281)
+      %tmp94 = arith.select %tmp48, %tmp91, %cst_19 : tensor<2x64xi1, #blocked>, tensor<2x64xf32, #blocked> loc(#loc282)
+      %tmp95 = arith.select %tmp31, %tmp80, %tmp94 : tensor<2x64xi1, #blocked>, tensor<2x64xf32, #blocked> loc(#loc296)
+      %tmp101 = arith.mulf %tmp96_83, %tmp75_53 : tensor<2x64xf32, #blocked1> loc(#loc285)
+      %tmp101_134 = ttg.convert_layout %tmp101 : tensor<2x64xf32, #blocked1> -> tensor<2x64xf32, #blocked> loc(#loc285)
+      %tmp107 = ttg.convert_layout %tmp102_86 : tensor<1x64xf32, #blocked2> -> tensor<1x64xf32, #blocked> loc(#loc286)
+      %tmp104 = tt.broadcast %tmp107 : tensor<1x64xf32, #blocked> -> tensor<2x64xf32, #blocked> loc(#loc287)
+      %tmp104_135 = arith.mulf %tmp101_134, %tmp104 : tensor<2x64xf32, #blocked> loc(#loc287)
+      %tmp107_136 = arith.mulf %tmp104_135, %tmp63_74 : tensor<2x64xf32, #blocked> loc(#loc286)
+      %tmp109 = arith.mulf %tmp95, %tmp66_77 : tensor<2x64xf32, #blocked> loc(#loc288)
+      %tmp110 = arith.addf %tmp107_136, %tmp109 : tensor<2x64xf32, #blocked> loc(#loc289)
+      %4 = arith.addi %tmp50_61, %1 : tensor<2x64xi32, #blocked1> loc(#loc58)
+      %5 = tt.addptr %2, %4 : tensor<2x64x!tt.ptr<bf16>, #blocked1>, tensor<2x64xi32, #blocked1> loc(#loc59)
+      %6 = arith.truncf %tmp68 : tensor<2x64xf32, #blocked> to tensor<2x64xbf16, #blocked> loc(#loc144)
+      %7 = ttg.convert_layout %6 : tensor<2x64xbf16, #blocked> -> tensor<2x64xbf16, #blocked1> loc(#loc144)
+      tt.store %5, %7, %tmp50_65 : tensor<2x64x!tt.ptr<bf16>, #blocked1> loc(#loc144)
+      %8 = tt.addptr %3, %4 : tensor<2x64x!tt.ptr<bf16>, #blocked1>, tensor<2x64xi32, #blocked1> loc(#loc60)
+      %9 = arith.truncf %tmp110 : tensor<2x64xf32, #blocked> to tensor<2x64xbf16, #blocked> loc(#loc145)
+      %10 = ttg.convert_layout %9 : tensor<2x64xbf16, #blocked> -> tensor<2x64xbf16, #blocked1> loc(#loc145)
+      tt.store %8, %10, %tmp50_65 : tensor<2x64x!tt.ptr<bf16>, #blocked1> loc(#loc145)
+    } loc(#loc61)
+    tt.return loc(#loc146)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8)
+#loc32 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc156 = loc("xoffset"(#loc2))
+#loc157 = loc("xoffset"(#loc3))
+#loc158 = loc("xindex"(#loc4))
+#loc159 = loc("xindex"(#loc5))
+#loc160 = loc("r0_base"(#loc6))
+#loc161 = loc("x0"(#loc7))
+#loc162 = loc("x1"(#loc8))
+#loc163 = loc("tmp0"(#loc9))
+#loc164 = loc("tmp0"(#loc10))
+#loc165 = loc("tmp0"(#loc11))
+#loc166 = loc("tmp0"(#loc12))
+#loc167 = loc("tmp0"(#loc13))
+#loc168 = loc("_tmp4"(#loc14))
+#loc169 = loc("r0_index"(#loc15))
+#loc170 = loc("r0_mask"(#loc16))
+#loc171 = loc("tmp0"(#loc17))
+#loc172 = loc("tmp0"(#loc18))
+#loc173 = loc("tmp0"(#loc19))
+#loc174 = loc("tmp6"(#loc20))
+#loc175 = loc("tmp6"(#loc21))
+#loc176 = loc("tmp6"(#loc22))
+#loc177 = loc("tmp6"(#loc23))
+#loc178 = loc("tmp6"(#loc24))
+#loc179 = loc("tmp2"(#loc25))
+#loc180 = loc("tmp5"(#loc26))
+#loc181 = loc("_tmp4"(#loc27))
+#loc182 = loc("tmp8"(#loc28))
+#loc183 = loc("tmp11"(#loc29))
+#loc184 = loc("_tmp10"(#loc30))
+#loc186 = loc("tmp4"(#loc35))
+#loc188 = loc("tmp10"(#loc37))
+#loc189 = loc("tmp50"(#loc38))
+#loc190 = loc("tmp50"(#loc39))
+#loc191 = loc("tmp50"(#loc40))
+#loc192 = loc("tmp50"(#loc41))
+#loc193 = loc("tmp50"(#loc42))
+#loc194 = loc("tmp58"(#loc43))
+#loc195 = loc("tmp63"(#loc44))
+#loc196 = loc("tmp63"(#loc45))
+#loc197 = loc("tmp63"(#loc46))
+#loc198 = loc("tmp66"(#loc47))
+#loc199 = loc("tmp102"(#loc48))
+#loc200 = loc("tmp20"(#loc49))
+#loc201 = loc("tmp22"(#loc50))
+#loc202 = loc("tmp23"(#loc51))
+#loc203 = loc("tmp24"(#loc52))
+#loc204 = loc("tmp72"(#loc53))
+#loc205 = loc("tmp73"(#loc54))
+#loc206 = loc("tmp74"(#loc55))
+#loc207 = loc("tmp75"(#loc56))
+#loc208 = loc("r0_index"(#loc62))
+#loc209 = loc("r0_mask"(#loc63))
+#loc210 = loc("r0_3"(#loc64))
+#loc211 = loc("r0_4"(#loc65))
+#loc212 = loc("tmp50"(#loc66))
+#loc213 = loc("tmp50"(#loc67))
+#loc214 = loc("tmp58"(#loc68))
+#loc215 = loc("tmp58"(#loc69))
+#loc216 = loc("tmp63"(#loc70))
+#loc217 = loc("tmp66"(#loc71))
+#loc218 = loc("tmp96"(#loc72))
+#loc219 = loc("tmp96"(#loc73))
+#loc220 = loc("tmp96"(#loc74))
+#loc221 = loc("tmp96"(#loc75))
+#loc222 = loc("tmp96"(#loc76))
+#loc223 = loc("tmp96"(#loc77))
+#loc224 = loc("tmp102"(#loc78))
+#loc225 = loc("tmp102"(#loc79))
+#loc226 = loc("tmp16"(#loc80))
+#loc227 = loc("tmp17"(#loc81))
+#loc228 = loc("tmp17"(#loc82))
+#loc229 = loc("tmp17"(#loc83))
+#loc230 = loc("tmp17"(#loc84))
+#loc231 = loc("tmp17"(#loc85))
+#loc232 = loc("tmp17"(#loc86))
+#loc233 = loc("tmp17"(#loc87))
+#loc234 = loc("tmp17"(#loc88))
+#loc235 = loc("tmp25"(#loc89))
+#loc236 = loc("tmp25"(#loc90))
+#loc237 = loc("tmp25"(#loc91))
+#loc238 = loc("tmp27"(#loc92))
+#loc239 = loc("tmp29"(#loc93))
+#loc240 = loc("tmp31"(#loc94))
+#loc241 = loc("tmp32"(#loc95))
+#loc242 = loc("tmp35"(#loc96))
+#loc243 = loc("tmp35"(#loc97))
+#loc244 = loc("tmp35"(#loc98))
+#loc245 = loc("tmp35"(#loc99))
+#loc246 = loc("tmp35"(#loc100))
+#loc247 = loc("tmp35"(#loc101))
+#loc248 = loc("tmp42"(#loc102))
+#loc249 = loc("tmp43"(#loc103))
+#loc250 = loc("tmp43"(#loc104))
+#loc251 = loc("tmp43"(#loc105))
+#loc252 = loc("tmp45"(#loc106))
+#loc253 = loc("tmp48"(#loc107))
+#loc254 = loc("tmp49"(#loc108))
+#loc255 = loc("tmp57"(#loc109))
+#loc256 = loc("tmp60"(#loc110))
+#loc257 = loc("tmp64"(#loc111))
+#loc258 = loc("tmp67"(#loc112))
+#loc259 = loc("tmp68"(#loc113))
+#loc260 = loc("tmp70"(#loc114))
+#loc261 = loc("tmp70"(#loc115))
+#loc262 = loc("tmp70"(#loc116))
+#loc263 = loc("tmp70"(#loc117))
+#loc264 = loc("tmp70"(#loc118))
+#loc265 = loc("tmp70"(#loc119))
+#loc266 = loc("tmp76"(#loc120))
+#loc267 = loc("tmp76"(#loc121))
+#loc268 = loc("tmp76"(#loc122))
+#loc269 = loc("tmp78"(#loc123))
+#loc270 = loc("tmp80"(#loc124))
+#loc271 = loc("tmp83"(#loc125))
+#loc272 = loc("tmp83"(#loc126))
+#loc273 = loc("tmp83"(#loc127))
+#loc274 = loc("tmp83"(#loc128))
+#loc275 = loc("tmp83"(#loc129))
+#loc276 = loc("tmp83"(#loc130))
+#loc277 = loc("tmp88"(#loc131))
+#loc278 = loc("tmp89"(#loc132))
+#loc279 = loc("tmp89"(#loc133))
+#loc280 = loc("tmp89"(#loc134))
+#loc281 = loc("tmp91"(#loc135))
+#loc282 = loc("tmp94"(#loc136))
+#loc283 = loc("tmp95"(#loc137))
+#loc284 = loc("tmp82"(#loc138))
+#loc285 = loc("tmp101"(#loc139))
+#loc286 = loc("tmp107"(#loc140))
+#loc287 = loc("tmp104"(#loc141))
+#loc288 = loc("tmp109"(#loc142))
+#loc289 = loc("tmp110"(#loc143))
+#loc290 = loc("_tmp10"(#loc168))
+#loc291 = loc(callsite(#loc32 at #loc185))
+#loc293 = loc(callsite(#loc32 at #loc187))
+#loc295 = loc(fused[#loc254, #loc240])
+#loc296 = loc(fused[#loc283, #loc284])
+#loc297 = loc(callsite(#loc34 at #loc291))
+#loc298 = loc(callsite(#loc34 at #loc293))
diff --git a/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..bd8d9c8a1fb1ec460cc41f61a355383d788a3c4c
--- /dev/null
+++ b/triton/4CABSBCN3JWTJORDZXCB5NETVIT6A64D5LGN2QHUTERFG5YETKTA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir
@@ -0,0 +1,520 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc1 = loc(unknown)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc149 = loc("in_out_ptr0"(#loc))
+#loc150 = loc("in_out_ptr1"(#loc))
+#loc151 = loc("in_ptr0"(#loc))
+#loc152 = loc("in_ptr1"(#loc))
+#loc153 = loc("in_ptr2"(#loc))
+#loc154 = loc("in_ptr3"(#loc))
+#loc155 = loc("in_ptr4"(#loc))
+#loc156 = loc("xnumel"(#loc))
+#loc157 = loc("r0_numel"(#loc))
+#loc189 = loc("tmp4"(#loc35))
+#loc191 = loc("tmp10"(#loc38))
+#loc296 = loc(callsite(#loc1 at #loc189))
+#loc298 = loc(callsite(#loc1 at #loc191))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<1x64xbf16> loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<2x64xbf16> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %cst_1 = arith.constant dense<4097> : tensor<1x64xi32> loc(#loc1)
+    %cst_2 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc1)
+    %cst_3 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc1)
+    %cst_5 = arith.constant dense<1> : tensor<1x64xi64> loc(#loc1)
+    %cst_6 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc1)
+    %cst_7 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc1)
+    %cst_8 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc1)
+    %cst_9 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc1)
+    %cst_10 = arith.constant dense<128> : tensor<1x64xi32> loc(#loc1)
+    %cst_11 = arith.constant dense<0.000000e+00> : tensor<2x64xf32> loc(#loc1)
+    %cst_12 = arith.constant dense<32> : tensor<2x1xi32> loc(#loc1)
+    %c2_i32 = arith.constant 2 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc158)
+    %xoffset_13 = arith.muli %xoffset, %c2_i32 : i32 loc(#loc159)
+    %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc160)
+    %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32> -> tensor<2x1xi32> loc(#loc161)
+    %xindex_15 = tt.splat %xoffset_13 : i32 -> tensor<2x1xi32> loc(#loc162)
+    %xindex_16 = arith.addi %xindex_15, %xindex_14 : tensor<2x1xi32> loc(#loc162)
+    %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc163)
+    %r0_base_17 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc164)
+    %x0 = arith.remsi %xindex_16, %cst_12 : tensor<2x1xi32> loc(#loc165)
+    %x1 = arith.divsi %xindex_16, %cst_12 : tensor<2x1xi32> loc(#loc166)
+    %_tmp10:2 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%_tmp4 = %cst_11, %_tmp10_20 = %cst_11) -> (tensor<2x64xf32>, tensor<2x64xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc168)
+      %r0_index_21 = arith.addi %r0_index, %r0_base_17 : tensor<1x64xi32> loc(#loc168)
+      %r0_mask = arith.cmpi slt, %r0_index_21, %cst_10 : tensor<1x64xi32> loc(#loc169)
+      %tmp0 = arith.addi %r0_index_21, %cst_9 : tensor<1x64xi32> loc(#loc170)
+      %tmp0_22 = arith.muli %x0, %cst_8 : tensor<2x1xi32> loc(#loc171)
+      %tmp0_23 = tt.broadcast %tmp0 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc172)
+      %tmp0_24 = tt.broadcast %tmp0_22 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc172)
+      %tmp0_25 = arith.addi %tmp0_23, %tmp0_24 : tensor<2x64xi32> loc(#loc172)
+      %tmp0_26 = arith.muli %x1, %cst_7 : tensor<2x1xi32> loc(#loc173)
+      %tmp0_27 = tt.broadcast %tmp0_26 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc174)
+      %tmp0_28 = arith.addi %tmp0_25, %tmp0_27 : tensor<2x64xi32> loc(#loc174)
+      %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x64x!tt.ptr<bf16>> loc(#loc175)
+      %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<2x64x!tt.ptr<bf16>>, tensor<2x64xi32> loc(#loc175)
+      %tmp0_31 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc176)
+      %tmp0_32 = tt.load %tmp0_30, %tmp0_31, %cst_0 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>> loc(#loc176)
+      %tmp0_33 = arith.extf %tmp0_32 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc177)
+      %tmp6 = tt.broadcast %r0_index_21 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc178)
+      %tmp6_34 = arith.addi %tmp6, %tmp0_24 : tensor<2x64xi32> loc(#loc178)
+      %tmp6_35 = arith.addi %tmp6_34, %tmp0_27 : tensor<2x64xi32> loc(#loc179)
+      %tmp6_36 = tt.addptr %tmp0_29, %tmp6_35 : tensor<2x64x!tt.ptr<bf16>>, tensor<2x64xi32> loc(#loc180)
+      %tmp6_37 = tt.load %tmp6_36, %tmp0_31, %cst_0 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>> loc(#loc181)
+      %tmp6_38 = arith.extf %tmp6_37 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc182)
+      %tmp2 = arith.mulf %tmp0_33, %tmp0_33 : tensor<2x64xf32> loc(#loc183)
+      %tmp5 = arith.addf %_tmp4, %tmp2 : tensor<2x64xf32> loc(#loc184)
+      %_tmp4_39 = arith.select %tmp0_31, %tmp5, %_tmp4 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc185)
+      %tmp8 = arith.mulf %tmp6_38, %tmp6_38 : tensor<2x64xf32> loc(#loc186)
+      %tmp11 = arith.addf %_tmp10_20, %tmp8 : tensor<2x64xf32> loc(#loc187)
+      %_tmp10_40 = arith.select %tmp0_31, %tmp11, %_tmp10_20 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc188)
+      scf.yield %_tmp4_39, %_tmp10_40 : tensor<2x64xf32>, tensor<2x64xf32> loc(#loc33)
+    } loc(#loc294)
+    %tmp4 = "tt.reduce"(%_tmp10#0) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_20: f32 loc(callsite(#loc1 at #loc189)), %tmp4_21: f32 loc(callsite(#loc1 at #loc189))):
+      %tmp4_22 = arith.addf %tmp4_20, %tmp4_21 : f32 loc(#loc299)
+      tt.reduce.return %tmp4_22 : f32 loc(#loc295)
+    }) : (tensor<2x64xf32>) -> tensor<2xf32> loc(#loc295)
+    %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<2xf32> -> tensor<2x1xf32> loc(#loc190)
+    %tmp10 = "tt.reduce"(%_tmp10#1) <{axis = 1 : i32}> ({
+    ^bb0(%tmp10_20: f32 loc(callsite(#loc1 at #loc191)), %tmp10_21: f32 loc(callsite(#loc1 at #loc191))):
+      %tmp10_22 = arith.addf %tmp10_20, %tmp10_21 : f32 loc(#loc300)
+      tt.reduce.return %tmp10_22 : f32 loc(#loc297)
+    }) : (tensor<2x64xf32>) -> tensor<2xf32> loc(#loc297)
+    %tmp10_19 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<2xf32> -> tensor<2x1xf32> loc(#loc192)
+    scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc193)
+      %r0_index_20 = arith.addi %r0_index, %r0_base_17 : tensor<1x64xi32> loc(#loc193)
+      %r0_mask = arith.cmpi slt, %r0_index_20, %cst_10 : tensor<1x64xi32> loc(#loc194)
+      %r0_3 = arith.remsi %r0_index_20, %cst_6 : tensor<1x64xi32> loc(#loc195)
+      %r0_4 = arith.divsi %r0_index_20, %cst_6 : tensor<1x64xi32> loc(#loc196)
+      %tmp50 = arith.muli %x0, %cst_8 : tensor<2x1xi32> loc(#loc197)
+      %tmp50_21 = tt.broadcast %r0_index_20 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc198)
+      %tmp50_22 = tt.broadcast %tmp50 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc198)
+      %tmp50_23 = arith.addi %tmp50_21, %tmp50_22 : tensor<2x64xi32> loc(#loc198)
+      %tmp50_24 = arith.muli %x1, %cst_7 : tensor<2x1xi32> loc(#loc199)
+      %tmp50_25 = tt.broadcast %tmp50_24 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc200)
+      %tmp50_26 = arith.addi %tmp50_23, %tmp50_25 : tensor<2x64xi32> loc(#loc200)
+      %tmp50_27 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x64x!tt.ptr<bf16>> loc(#loc201)
+      %tmp50_28 = tt.addptr %tmp50_27, %tmp50_26 : tensor<2x64x!tt.ptr<bf16>>, tensor<2x64xi32> loc(#loc201)
+      %tmp50_29 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc202)
+      %tmp50_30 = tt.load %tmp50_28, %tmp50_29, %cst_0 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>> loc(#loc202)
+      %tmp50_31 = arith.extf %tmp50_30 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc203)
+      %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>> loc(#loc204)
+      %tmp58_32 = tt.addptr %tmp58, %r0_index_20 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc204)
+      %tmp58_33 = tt.load %tmp58_32, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x64x!tt.ptr<bf16>> loc(#loc205)
+      %tmp58_34 = arith.extf %tmp58_33 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc206)
+      %tmp63 = arith.muli %x1, %cst_8 : tensor<2x1xi32> loc(#loc207)
+      %tmp63_35 = tt.broadcast %tmp63 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc208)
+      %tmp63_36 = arith.addi %tmp50_21, %tmp63_35 : tensor<2x64xi32> loc(#loc208)
+      %tmp63_37 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<2x64x!tt.ptr<f32>> loc(#loc209)
+      %tmp63_38 = tt.addptr %tmp63_37, %tmp63_36 : tensor<2x64x!tt.ptr<f32>>, tensor<2x64xi32> loc(#loc209)
+      %tmp63_39 = tt.load %tmp63_38, %tmp50_29, %cst_11 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<f32>> loc(#loc210)
+      %tmp66 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<2x64x!tt.ptr<f32>> loc(#loc211)
+      %tmp66_40 = tt.addptr %tmp66, %tmp63_36 : tensor<2x64x!tt.ptr<f32>>, tensor<2x64xi32> loc(#loc211)
+      %tmp66_41 = tt.load %tmp66_40, %tmp50_29, %cst_11 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<f32>> loc(#loc212)
+      %tmp96 = arith.addi %r0_index_20, %cst_9 : tensor<1x64xi32> loc(#loc213)
+      %tmp96_42 = tt.broadcast %tmp96 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc214)
+      %tmp96_43 = arith.addi %tmp96_42, %tmp50_22 : tensor<2x64xi32> loc(#loc214)
+      %tmp96_44 = arith.addi %tmp96_43, %tmp50_25 : tensor<2x64xi32> loc(#loc215)
+      %tmp96_45 = tt.addptr %tmp50_27, %tmp96_44 : tensor<2x64x!tt.ptr<bf16>>, tensor<2x64xi32> loc(#loc216)
+      %tmp96_46 = tt.load %tmp96_45, %tmp50_29, %cst_0 evictionPolicy = evict_first : tensor<2x64x!tt.ptr<bf16>> loc(#loc217)
+      %tmp96_47 = arith.extf %tmp96_46 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc218)
+      %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>> loc(#loc219)
+      %tmp102_48 = tt.addptr %tmp102, %r0_index_20 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc219)
+      %tmp102_49 = tt.load %tmp102_48, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x64x!tt.ptr<bf16>> loc(#loc220)
+      %tmp102_50 = arith.extf %tmp102_49 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc221)
+      %tmp16 = arith.extsi %r0_3 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc222)
+      %tmp16_51 = arith.cmpi slt, %tmp16, %cst_5 : tensor<1x64xi64> loc(#loc222)
+      %tmp17 = arith.muli %r0_4, %cst_6 : tensor<1x64xi32> loc(#loc223)
+      %tmp17_52 = arith.addi %tmp17, %cst_4 : tensor<1x64xi32> loc(#loc224)
+      %tmp17_53 = tt.broadcast %tmp17_52 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc225)
+      %tmp17_54 = arith.addi %tmp17_53, %tmp50_22 : tensor<2x64xi32> loc(#loc225)
+      %tmp17_55 = arith.addi %tmp17_54, %tmp50_25 : tensor<2x64xi32> loc(#loc226)
+      %tmp17_56 = tt.addptr %tmp50_27, %tmp17_55 : tensor<2x64x!tt.ptr<bf16>>, tensor<2x64xi32> loc(#loc227)
+      %tmp17_57 = arith.andi %r0_mask, %tmp16_51 : tensor<1x64xi1> loc(#loc228)
+      %tmp17_58 = tt.broadcast %tmp17_57 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc229)
+      %tmp17_59 = tt.load %tmp17_56, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>> loc(#loc229)
+      %tmp17_60 = arith.extf %tmp17_59 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc230)
+      %tmp20 = arith.divf %tmp10_19, %cst_3 : tensor<2x1xf32> loc(#loc231)
+      %tmp22 = arith.addf %tmp20, %cst_2 : tensor<2x1xf32> loc(#loc232)
+      %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc233)
+      %tmp24 = tt.broadcast %tmp23 : tensor<2x1xf32> -> tensor<2x64xf32> loc(#loc234)
+      %tmp24_61 = arith.mulf %tmp17_60, %tmp24 : tensor<2x64xf32> loc(#loc234)
+      %tmp25 = tt.addptr %tmp58, %tmp17_52 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc235)
+      %tmp25_62 = tt.broadcast %tmp25 : tensor<1x64x!tt.ptr<bf16>> -> tensor<2x64x!tt.ptr<bf16>> loc(#loc235)
+      %tmp25_63 = tt.load %tmp25_62, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>> loc(#loc236)
+      %tmp25_64 = arith.extf %tmp25_63 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc237)
+      %tmp27 = arith.mulf %tmp24_61, %tmp25_64 : tensor<2x64xf32> loc(#loc238)
+      %tmp29 = arith.subf %cst_11, %tmp27 : tensor<2x64xf32> loc(#loc239)
+      %tmp31 = tt.broadcast %tmp16_51 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc240)
+      %tmp31_65 = arith.select %tmp31, %tmp29, %cst_11 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc240)
+      %tmp32 = arith.cmpi sge, %tmp16, %cst_5 : tensor<1x64xi64> loc(#loc241)
+      %tmp35 = tt.broadcast %tmp17 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc242)
+      %tmp35_66 = arith.addi %tmp35, %tmp50_22 : tensor<2x64xi32> loc(#loc242)
+      %tmp35_67 = arith.addi %tmp35_66, %tmp50_25 : tensor<2x64xi32> loc(#loc243)
+      %tmp35_68 = tt.addptr %tmp50_27, %tmp35_67 : tensor<2x64x!tt.ptr<bf16>>, tensor<2x64xi32> loc(#loc244)
+      %tmp35_69 = arith.andi %r0_mask, %tmp32 : tensor<1x64xi1> loc(#loc245)
+      %tmp35_70 = tt.broadcast %tmp35_69 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc246)
+      %tmp35_71 = tt.load %tmp35_68, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>> loc(#loc246)
+      %tmp35_72 = arith.extf %tmp35_71 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc247)
+      %tmp42 = arith.mulf %tmp35_72, %tmp24 : tensor<2x64xf32> loc(#loc248)
+      %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc249)
+      %tmp43_73 = tt.broadcast %tmp43 : tensor<1x64x!tt.ptr<bf16>> -> tensor<2x64x!tt.ptr<bf16>> loc(#loc249)
+      %tmp43_74 = tt.load %tmp43_73, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>> loc(#loc250)
+      %tmp43_75 = arith.extf %tmp43_74 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc251)
+      %tmp45 = arith.mulf %tmp42, %tmp43_75 : tensor<2x64xf32> loc(#loc252)
+      %tmp48 = tt.broadcast %tmp32 : tensor<1x64xi1> -> tensor<2x64xi1> loc(#loc253)
+      %tmp48_76 = arith.select %tmp48, %tmp45, %cst_11 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc253)
+      %tmp49 = arith.select %tmp31, %tmp31_65, %tmp48_76 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc254)
+      %tmp57 = arith.mulf %tmp50_31, %tmp24 : tensor<2x64xf32> loc(#loc255)
+      %tmp60 = tt.broadcast %tmp58_34 : tensor<1x64xf32> -> tensor<2x64xf32> loc(#loc256)
+      %tmp60_77 = arith.mulf %tmp57, %tmp60 : tensor<2x64xf32> loc(#loc256)
+      %tmp64 = arith.mulf %tmp60_77, %tmp63_39 : tensor<2x64xf32> loc(#loc257)
+      %tmp67 = arith.mulf %tmp49, %tmp66_41 : tensor<2x64xf32> loc(#loc258)
+      %tmp68 = arith.addf %tmp64, %tmp67 : tensor<2x64xf32> loc(#loc259)
+      %tmp70 = arith.addi %tmp17, %cst_1 : tensor<1x64xi32> loc(#loc260)
+      %tmp70_78 = tt.broadcast %tmp70 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc261)
+      %tmp70_79 = arith.addi %tmp70_78, %tmp50_22 : tensor<2x64xi32> loc(#loc261)
+      %tmp70_80 = arith.addi %tmp70_79, %tmp50_25 : tensor<2x64xi32> loc(#loc262)
+      %tmp70_81 = tt.addptr %tmp50_27, %tmp70_80 : tensor<2x64x!tt.ptr<bf16>>, tensor<2x64xi32> loc(#loc263)
+      %tmp70_82 = tt.load %tmp70_81, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>> loc(#loc264)
+      %tmp70_83 = arith.extf %tmp70_82 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc265)
+      %tmp72 = arith.divf %tmp4_18, %cst_3 : tensor<2x1xf32> loc(#loc266)
+      %tmp73 = arith.addf %tmp72, %cst_2 : tensor<2x1xf32> loc(#loc267)
+      %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc268)
+      %tmp75 = tt.broadcast %tmp74 : tensor<2x1xf32> -> tensor<2x64xf32> loc(#loc269)
+      %tmp75_84 = arith.mulf %tmp70_83, %tmp75 : tensor<2x64xf32> loc(#loc269)
+      %tmp76 = tt.addptr %tmp102, %tmp17_52 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc270)
+      %tmp76_85 = tt.broadcast %tmp76 : tensor<1x64x!tt.ptr<bf16>> -> tensor<2x64x!tt.ptr<bf16>> loc(#loc270)
+      %tmp76_86 = tt.load %tmp76_85, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>> loc(#loc271)
+      %tmp76_87 = arith.extf %tmp76_86 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc272)
+      %tmp78 = arith.mulf %tmp75_84, %tmp76_87 : tensor<2x64xf32> loc(#loc273)
+      %tmp80 = arith.subf %cst_11, %tmp78 : tensor<2x64xf32> loc(#loc274)
+      %tmp82 = arith.select %tmp31, %tmp80, %cst_11 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc275)
+      %tmp83 = arith.addi %tmp17, %cst_9 : tensor<1x64xi32> loc(#loc276)
+      %tmp83_88 = tt.broadcast %tmp83 : tensor<1x64xi32> -> tensor<2x64xi32> loc(#loc277)
+      %tmp83_89 = arith.addi %tmp83_88, %tmp50_22 : tensor<2x64xi32> loc(#loc277)
+      %tmp83_90 = arith.addi %tmp83_89, %tmp50_25 : tensor<2x64xi32> loc(#loc278)
+      %tmp83_91 = tt.addptr %tmp50_27, %tmp83_90 : tensor<2x64x!tt.ptr<bf16>>, tensor<2x64xi32> loc(#loc279)
+      %tmp83_92 = tt.load %tmp83_91, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>> loc(#loc280)
+      %tmp83_93 = arith.extf %tmp83_92 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc281)
+      %tmp88 = arith.mulf %tmp83_93, %tmp75 : tensor<2x64xf32> loc(#loc282)
+      %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc283)
+      %tmp89_94 = tt.broadcast %tmp89 : tensor<1x64x!tt.ptr<bf16>> -> tensor<2x64x!tt.ptr<bf16>> loc(#loc283)
+      %tmp89_95 = tt.load %tmp89_94, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<2x64x!tt.ptr<bf16>> loc(#loc284)
+      %tmp89_96 = arith.extf %tmp89_95 : tensor<2x64xbf16> to tensor<2x64xf32> loc(#loc285)
+      %tmp91 = arith.mulf %tmp88, %tmp89_96 : tensor<2x64xf32> loc(#loc286)
+      %tmp94 = arith.select %tmp48, %tmp91, %cst_11 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc287)
+      %tmp95 = arith.select %tmp31, %tmp82, %tmp94 : tensor<2x64xi1>, tensor<2x64xf32> loc(#loc288)
+      %tmp101 = arith.mulf %tmp96_47, %tmp75 : tensor<2x64xf32> loc(#loc289)
+      %tmp104 = tt.broadcast %tmp102_50 : tensor<1x64xf32> -> tensor<2x64xf32> loc(#loc290)
+      %tmp104_97 = arith.mulf %tmp101, %tmp104 : tensor<2x64xf32> loc(#loc290)
+      %tmp107 = arith.mulf %tmp104_97, %tmp63_39 : tensor<2x64xf32> loc(#loc291)
+      %tmp109 = arith.mulf %tmp95, %tmp66_41 : tensor<2x64xf32> loc(#loc292)
+      %tmp110 = arith.addf %tmp107, %tmp109 : tensor<2x64xf32> loc(#loc293)
+      %0 = arith.muli %xindex_16, %cst_8 : tensor<2x1xi32> loc(#loc142)
+      %1 = tt.broadcast %0 : tensor<2x1xi32> -> tensor<2x64xi32> loc(#loc143)
+      %2 = arith.addi %tmp50_21, %1 : tensor<2x64xi32> loc(#loc143)
+      %3 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<2x64x!tt.ptr<bf16>> loc(#loc144)
+      %4 = tt.addptr %3, %2 : tensor<2x64x!tt.ptr<bf16>>, tensor<2x64xi32> loc(#loc144)
+      %5 = arith.truncf %tmp68 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc145)
+      tt.store %4, %5, %tmp50_29 : tensor<2x64x!tt.ptr<bf16>> loc(#loc145)
+      %6 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<2x64x!tt.ptr<bf16>> loc(#loc146)
+      %7 = tt.addptr %6, %2 : tensor<2x64x!tt.ptr<bf16>>, tensor<2x64xi32> loc(#loc146)
+      %8 = arith.truncf %tmp110 : tensor<2x64xf32> to tensor<2x64xbf16> loc(#loc147)
+      tt.store %7, %8, %tmp50_29 : tensor<2x64x!tt.ptr<bf16>> loc(#loc147)
+    } loc(#loc40)
+    tt.return loc(#loc148)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc158 = loc("xoffset"(#loc2))
+#loc159 = loc("xoffset"(#loc3))
+#loc160 = loc("xindex"(#loc4))
+#loc161 = loc("xindex"(#loc5))
+#loc162 = loc("xindex"(#loc6))
+#loc163 = loc("r0_base"(#loc7))
+#loc164 = loc("r0_base"(#loc8))
+#loc165 = loc("x0"(#loc9))
+#loc166 = loc("x1"(#loc10))
+#loc167 = loc("_tmp4"(#loc11))
+#loc168 = loc("r0_index"(#loc12))
+#loc169 = loc("r0_mask"(#loc13))
+#loc170 = loc("tmp0"(#loc14))
+#loc171 = loc("tmp0"(#loc15))
+#loc172 = loc("tmp0"(#loc16))
+#loc173 = loc("tmp0"(#loc17))
+#loc174 = loc("tmp0"(#loc18))
+#loc175 = loc("tmp0"(#loc19))
+#loc176 = loc("tmp0"(#loc20))
+#loc177 = loc("tmp0"(#loc21))
+#loc178 = loc("tmp6"(#loc22))
+#loc179 = loc("tmp6"(#loc23))
+#loc180 = loc("tmp6"(#loc24))
+#loc181 = loc("tmp6"(#loc25))
+#loc182 = loc("tmp6"(#loc26))
+#loc183 = loc("tmp2"(#loc27))
+#loc184 = loc("tmp5"(#loc28))
+#loc185 = loc("_tmp4"(#loc29))
+#loc186 = loc("tmp8"(#loc30))
+#loc187 = loc("tmp11"(#loc31))
+#loc188 = loc("_tmp10"(#loc32))
+#loc190 = loc("tmp4"(#loc37))
+#loc192 = loc("tmp10"(#loc39))
+#loc193 = loc("r0_index"(#loc41))
+#loc194 = loc("r0_mask"(#loc42))
+#loc195 = loc("r0_3"(#loc43))
+#loc196 = loc("r0_4"(#loc44))
+#loc197 = loc("tmp50"(#loc45))
+#loc198 = loc("tmp50"(#loc46))
+#loc199 = loc("tmp50"(#loc47))
+#loc200 = loc("tmp50"(#loc48))
+#loc201 = loc("tmp50"(#loc49))
+#loc202 = loc("tmp50"(#loc50))
+#loc203 = loc("tmp50"(#loc51))
+#loc204 = loc("tmp58"(#loc52))
+#loc205 = loc("tmp58"(#loc53))
+#loc206 = loc("tmp58"(#loc54))
+#loc207 = loc("tmp63"(#loc55))
+#loc208 = loc("tmp63"(#loc56))
+#loc209 = loc("tmp63"(#loc57))
+#loc210 = loc("tmp63"(#loc58))
+#loc211 = loc("tmp66"(#loc59))
+#loc212 = loc("tmp66"(#loc60))
+#loc213 = loc("tmp96"(#loc61))
+#loc214 = loc("tmp96"(#loc62))
+#loc215 = loc("tmp96"(#loc63))
+#loc216 = loc("tmp96"(#loc64))
+#loc217 = loc("tmp96"(#loc65))
+#loc218 = loc("tmp96"(#loc66))
+#loc219 = loc("tmp102"(#loc67))
+#loc220 = loc("tmp102"(#loc68))
+#loc221 = loc("tmp102"(#loc69))
+#loc222 = loc("tmp16"(#loc70))
+#loc223 = loc("tmp17"(#loc71))
+#loc224 = loc("tmp17"(#loc72))
+#loc225 = loc("tmp17"(#loc73))
+#loc226 = loc("tmp17"(#loc74))
+#loc227 = loc("tmp17"(#loc75))
+#loc228 = loc("tmp17"(#loc76))
+#loc229 = loc("tmp17"(#loc77))
+#loc230 = loc("tmp17"(#loc78))
+#loc231 = loc("tmp20"(#loc79))
+#loc232 = loc("tmp22"(#loc80))
+#loc233 = loc("tmp23"(#loc81))
+#loc234 = loc("tmp24"(#loc82))
+#loc235 = loc("tmp25"(#loc83))
+#loc236 = loc("tmp25"(#loc84))
+#loc237 = loc("tmp25"(#loc85))
+#loc238 = loc("tmp27"(#loc86))
+#loc239 = loc("tmp29"(#loc87))
+#loc240 = loc("tmp31"(#loc88))
+#loc241 = loc("tmp32"(#loc89))
+#loc242 = loc("tmp35"(#loc90))
+#loc243 = loc("tmp35"(#loc91))
+#loc244 = loc("tmp35"(#loc92))
+#loc245 = loc("tmp35"(#loc93))
+#loc246 = loc("tmp35"(#loc94))
+#loc247 = loc("tmp35"(#loc95))
+#loc248 = loc("tmp42"(#loc96))
+#loc249 = loc("tmp43"(#loc97))
+#loc250 = loc("tmp43"(#loc98))
+#loc251 = loc("tmp43"(#loc99))
+#loc252 = loc("tmp45"(#loc100))
+#loc253 = loc("tmp48"(#loc101))
+#loc254 = loc("tmp49"(#loc102))
+#loc255 = loc("tmp57"(#loc103))
+#loc256 = loc("tmp60"(#loc104))
+#loc257 = loc("tmp64"(#loc105))
+#loc258 = loc("tmp67"(#loc106))
+#loc259 = loc("tmp68"(#loc107))
+#loc260 = loc("tmp70"(#loc108))
+#loc261 = loc("tmp70"(#loc109))
+#loc262 = loc("tmp70"(#loc110))
+#loc263 = loc("tmp70"(#loc111))
+#loc264 = loc("tmp70"(#loc112))
+#loc265 = loc("tmp70"(#loc113))
+#loc266 = loc("tmp72"(#loc114))
+#loc267 = loc("tmp73"(#loc115))
+#loc268 = loc("tmp74"(#loc116))
+#loc269 = loc("tmp75"(#loc117))
+#loc270 = loc("tmp76"(#loc118))
+#loc271 = loc("tmp76"(#loc119))
+#loc272 = loc("tmp76"(#loc120))
+#loc273 = loc("tmp78"(#loc121))
+#loc274 = loc("tmp80"(#loc122))
+#loc275 = loc("tmp82"(#loc123))
+#loc276 = loc("tmp83"(#loc124))
+#loc277 = loc("tmp83"(#loc125))
+#loc278 = loc("tmp83"(#loc126))
+#loc279 = loc("tmp83"(#loc127))
+#loc280 = loc("tmp83"(#loc128))
+#loc281 = loc("tmp83"(#loc129))
+#loc282 = loc("tmp88"(#loc130))
+#loc283 = loc("tmp89"(#loc131))
+#loc284 = loc("tmp89"(#loc132))
+#loc285 = loc("tmp89"(#loc133))
+#loc286 = loc("tmp91"(#loc134))
+#loc287 = loc("tmp94"(#loc135))
+#loc288 = loc("tmp95"(#loc136))
+#loc289 = loc("tmp101"(#loc137))
+#loc290 = loc("tmp104"(#loc138))
+#loc291 = loc("tmp107"(#loc139))
+#loc292 = loc("tmp109"(#loc140))
+#loc293 = loc("tmp110"(#loc141))
+#loc294 = loc("_tmp10"(#loc167))
+#loc295 = loc(callsite(#loc34 at #loc189))
+#loc297 = loc(callsite(#loc34 at #loc191))
+#loc299 = loc(callsite(#loc36 at #loc295))
+#loc300 = loc(callsite(#loc36 at #loc297))
diff --git a/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..92e578f1c5f962bc9b14ba31e6bda19673b3836f
--- /dev/null
+++ b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json"}}
\ No newline at end of file
diff --git a/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..50613ea92434bf443300cf92cc99793541e0dc29
Binary files /dev/null and b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin differ
diff --git a/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..9acf5c43ff4b9a8c8315b0234f85b374fcaaa13f
--- /dev/null
+++ b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
@@ -0,0 +1 @@
+{"hash": "f73d68b5df2b1383295ef14f9fa5efb7b149f5fdafa2fc82a456a733f43cda44", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 4096, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0"}
\ No newline at end of file
diff --git a/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..e3c893d58b1434c047a64ce0afb513da30de95ce
--- /dev/null
+++ b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir
@@ -0,0 +1,865 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 {
+  %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8
+  %13 = shl i32 %12, 3, !dbg !9
+  %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %15 = and i32 %14, 224, !dbg !10
+  %16 = lshr exact i32 %15, 5, !dbg !10
+  %17 = and i32 %14, 7, !dbg !10
+  %18 = or disjoint i32 %16, %13, !dbg !11
+  %19 = or disjoint i32 %13, %17, !dbg !11
+  %20 = and i32 %14, 31, !dbg !12
+  %21 = shl nuw nsw i32 %20, 2, !dbg !12
+  %22 = lshr i32 %14, 3, !dbg !12
+  %23 = sdiv i32 %18, 32, !dbg !13
+  %24 = mul i32 %23, 32, !dbg !14
+  %.decomposed = sub i32 %18, %24, !dbg !14
+  %25 = sdiv i32 %19, 32, !dbg !13
+  %26 = or disjoint i32 %21, 4096, !dbg !15
+  %27 = shl nsw i32 %.decomposed, 7, !dbg !16
+  %28 = add nsw i32 %26, %27, !dbg !17
+  %29 = mul i32 %23, 36864, !dbg !18
+  %30 = add i32 %28, %29, !dbg !19
+  %31 = sext i32 %30 to i64, !dbg !20
+  %32 = getelementptr bfloat, ptr addrspace(1) %2, i64 %31, !dbg !20
+  %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !21
+  %34 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %32, i64 %33, i1 true) #6, !dbg !21
+  %35 = extractvalue { i32, i32 } %34, 0, !dbg !21
+  %36 = bitcast i32 %35 to <2 x bfloat>, !dbg !21
+  %37 = extractvalue { i32, i32 } %34, 1, !dbg !21
+  %38 = bitcast i32 %37 to <2 x bfloat>, !dbg !21
+  %39 = extractelement <2 x bfloat> %36, i64 0, !dbg !21
+  %40 = extractelement <2 x bfloat> %36, i64 1, !dbg !21
+  %41 = extractelement <2 x bfloat> %38, i64 0, !dbg !21
+  %42 = extractelement <2 x bfloat> %38, i64 1, !dbg !21
+  %43 = fpext bfloat %39 to float, !dbg !22
+  %44 = fpext bfloat %40 to float, !dbg !22
+  %45 = fpext bfloat %41 to float, !dbg !22
+  %46 = fpext bfloat %42 to float, !dbg !22
+  %47 = or disjoint i32 %27, %21, !dbg !23
+  %48 = add i32 %47, %29, !dbg !24
+  %49 = sext i32 %48 to i64, !dbg !25
+  %50 = getelementptr bfloat, ptr addrspace(1) %2, i64 %49, !dbg !25
+  %51 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !26
+  %52 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %50, i64 %51, i1 true) #6, !dbg !26
+  %53 = extractvalue { i32, i32 } %52, 0, !dbg !26
+  %54 = bitcast i32 %53 to <2 x bfloat>, !dbg !26
+  %55 = extractvalue { i32, i32 } %52, 1, !dbg !26
+  %56 = bitcast i32 %55 to <2 x bfloat>, !dbg !26
+  %57 = extractelement <2 x bfloat> %54, i64 0, !dbg !26
+  %58 = extractelement <2 x bfloat> %54, i64 1, !dbg !26
+  %59 = extractelement <2 x bfloat> %56, i64 0, !dbg !26
+  %60 = extractelement <2 x bfloat> %56, i64 1, !dbg !26
+  %61 = fpext bfloat %57 to float, !dbg !27
+  %62 = fpext bfloat %58 to float, !dbg !27
+  %63 = fpext bfloat %59 to float, !dbg !27
+  %64 = fpext bfloat %60 to float, !dbg !27
+  %65 = fmul float %43, %43, !dbg !28
+  %66 = fmul float %44, %44, !dbg !28
+  %67 = fmul float %45, %45, !dbg !28
+  %68 = fmul float %46, %46, !dbg !28
+  %69 = fmul float %61, %61, !dbg !29
+  %70 = fmul float %62, %62, !dbg !29
+  %71 = fmul float %63, %63, !dbg !29
+  %72 = fmul float %64, %64, !dbg !29
+  %73 = fadd float %65, %66, !dbg !30
+  %74 = fadd float %67, %73, !dbg !30
+  %75 = fadd float %68, %74, !dbg !30
+  %76 = bitcast float %75 to i32, !dbg !33
+  %77 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %76, i32 16, i32 31), !dbg !33
+  %78 = bitcast i32 %77 to float, !dbg !33
+  %79 = fadd float %75, %78, !dbg !30
+  %80 = bitcast float %79 to i32, !dbg !33
+  %81 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %80, i32 8, i32 31), !dbg !33
+  %82 = bitcast i32 %81 to float, !dbg !33
+  %83 = fadd float %79, %82, !dbg !30
+  %84 = bitcast float %83 to i32, !dbg !33
+  %85 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %84, i32 4, i32 31), !dbg !33
+  %86 = bitcast i32 %85 to float, !dbg !33
+  %87 = fadd float %83, %86, !dbg !30
+  %88 = bitcast float %87 to i32, !dbg !33
+  %89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 2, i32 31), !dbg !33
+  %90 = bitcast i32 %89 to float, !dbg !33
+  %91 = fadd float %87, %90, !dbg !30
+  %92 = bitcast float %91 to i32, !dbg !33
+  %93 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %92, i32 1, i32 31), !dbg !33
+  %94 = bitcast i32 %93 to float, !dbg !33
+  %95 = fadd float %91, %94, !dbg !30
+  %96 = fadd float %69, %70, !dbg !36
+  %97 = fadd float %71, %96, !dbg !36
+  %98 = fadd float %72, %97, !dbg !36
+  %99 = bitcast float %98 to i32, !dbg !37
+  %100 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %99, i32 16, i32 31), !dbg !37
+  %101 = bitcast i32 %100 to float, !dbg !37
+  %102 = fadd float %98, %101, !dbg !36
+  %103 = bitcast float %102 to i32, !dbg !37
+  %104 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %103, i32 8, i32 31), !dbg !37
+  %105 = bitcast i32 %104 to float, !dbg !37
+  %106 = fadd float %102, %105, !dbg !36
+  %107 = bitcast float %106 to i32, !dbg !37
+  %108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 4, i32 31), !dbg !37
+  %109 = bitcast i32 %108 to float, !dbg !37
+  %110 = fadd float %106, %109, !dbg !36
+  %111 = bitcast float %110 to i32, !dbg !37
+  %112 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %111, i32 2, i32 31), !dbg !37
+  %113 = bitcast i32 %112 to float, !dbg !37
+  %114 = fadd float %110, %113, !dbg !36
+  %115 = bitcast float %114 to i32, !dbg !37
+  %116 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %115, i32 1, i32 31), !dbg !37
+  %117 = bitcast i32 %116 to float, !dbg !37
+  %118 = fadd float %114, %117, !dbg !36
+  %119 = and i32 %22, 1, !dbg !39
+  %120 = zext nneg i32 %21 to i64, !dbg !40
+  %121 = getelementptr bfloat, ptr addrspace(1) %3, i64 %120, !dbg !40
+  %122 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !41
+  %123 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %121, i64 %122, i1 true) #6, !dbg !41
+  %124 = extractvalue { i32, i32 } %123, 0, !dbg !41
+  %125 = bitcast i32 %124 to <2 x bfloat>, !dbg !41
+  %126 = extractvalue { i32, i32 } %123, 1, !dbg !41
+  %127 = bitcast i32 %126 to <2 x bfloat>, !dbg !41
+  %128 = extractelement <2 x bfloat> %125, i64 0, !dbg !41
+  %129 = extractelement <2 x bfloat> %125, i64 1, !dbg !41
+  %130 = extractelement <2 x bfloat> %127, i64 0, !dbg !41
+  %131 = extractelement <2 x bfloat> %127, i64 1, !dbg !41
+  %132 = fpext bfloat %128 to float, !dbg !42
+  %133 = fpext bfloat %129 to float, !dbg !42
+  %134 = fpext bfloat %130 to float, !dbg !42
+  %135 = fpext bfloat %131 to float, !dbg !42
+  %136 = shl i32 %23, 7, !dbg !43
+  %137 = or disjoint i32 %136, %21, !dbg !44
+  %138 = sext i32 %137 to i64, !dbg !45
+  %139 = getelementptr float, ptr addrspace(1) %4, i64 %138, !dbg !45
+  %140 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !46
+  %141 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %139, i64 %140, i1 true) #6, !dbg !46
+  %142 = extractvalue { i32, i32, i32, i32 } %141, 0, !dbg !46
+  %143 = extractvalue { i32, i32, i32, i32 } %141, 1, !dbg !46
+  %144 = extractvalue { i32, i32, i32, i32 } %141, 2, !dbg !46
+  %145 = extractvalue { i32, i32, i32, i32 } %141, 3, !dbg !46
+  %146 = bitcast i32 %142 to float, !dbg !46
+  %147 = bitcast i32 %143 to float, !dbg !46
+  %148 = bitcast i32 %144 to float, !dbg !46
+  %149 = bitcast i32 %145 to float, !dbg !46
+  %150 = getelementptr float, ptr addrspace(1) %5, i64 %138, !dbg !47
+  %151 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !48
+  %152 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %150, i64 %151, i1 true) #6, !dbg !48
+  %153 = extractvalue { i32, i32, i32, i32 } %152, 0, !dbg !48
+  %154 = extractvalue { i32, i32, i32, i32 } %152, 1, !dbg !48
+  %155 = extractvalue { i32, i32, i32, i32 } %152, 2, !dbg !48
+  %156 = extractvalue { i32, i32, i32, i32 } %152, 3, !dbg !48
+  %157 = shl nuw nsw i32 %17, 4, !dbg !48
+  %158 = shl nuw nsw i32 %15, 2, !dbg !48
+  %159 = lshr i32 %14, 1, !dbg !48
+  %160 = and i32 %159, 124, !dbg !48
+  %161 = or disjoint i32 %157, %158, !dbg !48
+  %162 = xor i32 %161, %160, !dbg !48
+  %163 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %162, !dbg !48
+  %164 = insertelement <1 x i32> poison, i32 %153, i64 0, !dbg !48
+  store <1 x i32> %164, ptr addrspace(3) %163, align 4, !dbg !48
+  %165 = xor i32 %162, 1028, !dbg !48
+  %166 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %165, !dbg !48
+  %167 = insertelement <1 x i32> poison, i32 %154, i64 0, !dbg !48
+  store <1 x i32> %167, ptr addrspace(3) %166, align 4, !dbg !48
+  %168 = xor i32 %162, 2056, !dbg !48
+  %169 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %168, !dbg !48
+  %170 = insertelement <1 x i32> poison, i32 %155, i64 0, !dbg !48
+  store <1 x i32> %170, ptr addrspace(3) %169, align 4, !dbg !48
+  %171 = xor i32 %162, 3084, !dbg !48
+  %172 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %171, !dbg !48
+  %173 = insertelement <1 x i32> poison, i32 %156, i64 0, !dbg !48
+  store <1 x i32> %173, ptr addrspace(3) %172, align 4, !dbg !48
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48
+  %174 = shl nuw nsw i32 %20, 7, !dbg !48
+  %175 = xor i32 %157, %160, !dbg !48
+  %176 = or disjoint i32 %175, %174, !dbg !48
+  %177 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %176, !dbg !48
+  %178 = load float, ptr addrspace(3) %177, align 4, !dbg !48
+  %179 = xor i32 %176, 4, !dbg !48
+  %180 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %179, !dbg !48
+  %181 = load float, ptr addrspace(3) %180, align 4, !dbg !48
+  %182 = xor i32 %176, 8, !dbg !48
+  %183 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %182, !dbg !48
+  %184 = load float, ptr addrspace(3) %183, align 4, !dbg !48
+  %185 = xor i32 %176, 12, !dbg !48
+  %186 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %185, !dbg !48
+  %187 = load float, ptr addrspace(3) %186, align 4, !dbg !48
+  %188 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !49
+  %189 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %32, i64 %188, i1 true) #6, !dbg !49
+  %190 = getelementptr bfloat, ptr addrspace(1) %6, i64 %120, !dbg !50
+  %191 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !51
+  %192 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %190, i64 %191, i1 true) #6, !dbg !51
+  %193 = icmp eq i32 %119, 0, !dbg !52
+  %194 = and i32 %22, 30, !dbg !53
+  %195 = or disjoint i32 %194, 32, !dbg !53
+  %196 = or disjoint i32 %194, 64, !dbg !53
+  %197 = or disjoint i32 %194, 96, !dbg !53
+  %198 = or disjoint i32 %194, 1, !dbg !54
+  %199 = or disjoint i32 %194, 33, !dbg !54
+  %200 = or disjoint i32 %194, 65, !dbg !54
+  %201 = or i32 %22, 97, !dbg !54
+  %202 = shl i32 %19, 7, !dbg !55
+  %203 = shl i32 %25, 15, !dbg !55
+  %204 = add i32 %203, %202, !dbg !55
+  %205 = or disjoint i32 %204, %198, !dbg !56
+  %206 = or disjoint i32 %204, %199, !dbg !56
+  %207 = or disjoint i32 %204, %200, !dbg !56
+  %208 = or disjoint i32 %204, %201, !dbg !56
+  %209 = sext i32 %205 to i64, !dbg !57
+  %210 = getelementptr bfloat, ptr addrspace(1) %2, i64 %209, !dbg !57
+  %211 = sext i32 %206 to i64, !dbg !57
+  %212 = getelementptr bfloat, ptr addrspace(1) %2, i64 %211, !dbg !57
+  %213 = sext i32 %207 to i64, !dbg !57
+  %214 = getelementptr bfloat, ptr addrspace(1) %2, i64 %213, !dbg !57
+  %215 = sext i32 %208 to i64, !dbg !57
+  %216 = getelementptr bfloat, ptr addrspace(1) %2, i64 %215, !dbg !57
+  %217 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58
+  %218 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %210, i64 %217, i1 %193) #6, !dbg !58
+  %219 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58
+  %220 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %212, i64 %219, i1 %193) #6, !dbg !58
+  %221 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58
+  %222 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %214, i64 %221, i1 %193) #6, !dbg !58
+  %223 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58
+  %224 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %216, i64 %223, i1 %193) #6, !dbg !58
+  %225 = tail call float @llvm.nvvm.div.full(float %118, float 1.280000e+02), !dbg !59
+  %226 = fadd float %225, 0x3EB0C6F7A0000000, !dbg !60
+  %227 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61
+  %.not.i = icmp eq i32 %227, 0, !dbg !61
+  br i1 %.not.i, label %230, label %228, !dbg !61
+
+228:                                              ; preds = %11
+  %229 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %226), !dbg !61
+  br label %__nv_rsqrtf.exit, !dbg !61
+
+230:                                              ; preds = %11
+  %231 = tail call float @llvm.nvvm.rsqrt.approx.f(float %226), !dbg !61
+  br label %__nv_rsqrtf.exit, !dbg !61
+
+__nv_rsqrtf.exit:                                 ; preds = %228, %230
+  %.0.i = phi float [ %229, %228 ], [ %231, %230 ], !dbg !61
+  %232 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61
+  %233 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61
+  %234 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61
+  %.not.i7 = icmp eq i32 %234, 0, !dbg !61
+  br i1 %.not.i7, label %237, label %235, !dbg !61
+
+235:                                              ; preds = %__nv_rsqrtf.exit
+  %236 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %226), !dbg !61
+  br label %__nv_rsqrtf.exit9, !dbg !61
+
+237:                                              ; preds = %__nv_rsqrtf.exit
+  %238 = tail call float @llvm.nvvm.rsqrt.approx.f(float %226), !dbg !61
+  br label %__nv_rsqrtf.exit9, !dbg !61
+
+__nv_rsqrtf.exit9:                                ; preds = %235, %237
+  %.0.i8 = phi float [ %236, %235 ], [ %238, %237 ], !dbg !61
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !62
+  %239 = lshr exact i32 %15, 3, !dbg !62
+  %240 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %239, !dbg !62
+  store float %.0.i, ptr addrspace(3) %240, align 4, !dbg !62
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !62
+  %241 = shl nuw nsw i32 %17, 2, !dbg !62
+  %242 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %241, !dbg !62
+  %243 = load float, ptr addrspace(3) %242, align 4, !dbg !62
+  %244 = zext nneg i32 %198 to i64, !dbg !63
+  %245 = getelementptr bfloat, ptr addrspace(1) %3, i64 %244, !dbg !63
+  %246 = zext nneg i32 %199 to i64, !dbg !63
+  %247 = getelementptr bfloat, ptr addrspace(1) %3, i64 %246, !dbg !63
+  %248 = zext nneg i32 %200 to i64, !dbg !63
+  %249 = getelementptr bfloat, ptr addrspace(1) %3, i64 %248, !dbg !63
+  %250 = zext nneg i32 %201 to i64, !dbg !63
+  %251 = getelementptr bfloat, ptr addrspace(1) %3, i64 %250, !dbg !63
+  %252 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !64
+  %253 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %245, i64 %252, i1 %193) #6, !dbg !64
+  %254 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !64
+  %255 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %247, i64 %254, i1 %193) #6, !dbg !64
+  %256 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !64
+  %257 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %249, i64 %256, i1 %193) #6, !dbg !64
+  %258 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !64
+  %259 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %251, i64 %258, i1 %193) #6, !dbg !64
+  %260 = icmp ne i32 %119, 0, !dbg !65
+  %261 = or disjoint i32 %204, %194, !dbg !66
+  %262 = or disjoint i32 %204, %195, !dbg !66
+  %263 = or disjoint i32 %204, %196, !dbg !66
+  %264 = or disjoint i32 %204, %197, !dbg !66
+  %265 = sext i32 %261 to i64, !dbg !67
+  %266 = getelementptr bfloat, ptr addrspace(1) %2, i64 %265, !dbg !67
+  %267 = sext i32 %262 to i64, !dbg !67
+  %268 = getelementptr bfloat, ptr addrspace(1) %2, i64 %267, !dbg !67
+  %269 = sext i32 %263 to i64, !dbg !67
+  %270 = getelementptr bfloat, ptr addrspace(1) %2, i64 %269, !dbg !67
+  %271 = sext i32 %264 to i64, !dbg !67
+  %272 = getelementptr bfloat, ptr addrspace(1) %2, i64 %271, !dbg !67
+  %273 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !68
+  %274 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %266, i64 %273, i1 %260) #6, !dbg !68
+  %275 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !68
+  %276 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %268, i64 %275, i1 %260) #6, !dbg !68
+  %277 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !68
+  %278 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %270, i64 %277, i1 %260) #6, !dbg !68
+  %279 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !68
+  %280 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %272, i64 %279, i1 %260) #6, !dbg !68
+  %281 = zext nneg i32 %194 to i64, !dbg !69
+  %282 = getelementptr bfloat, ptr addrspace(1) %3, i64 %281, !dbg !69
+  %283 = zext nneg i32 %195 to i64, !dbg !69
+  %284 = getelementptr bfloat, ptr addrspace(1) %3, i64 %283, !dbg !69
+  %285 = zext nneg i32 %196 to i64, !dbg !69
+  %286 = getelementptr bfloat, ptr addrspace(1) %3, i64 %285, !dbg !69
+  %287 = zext nneg i32 %197 to i64, !dbg !69
+  %288 = getelementptr bfloat, ptr addrspace(1) %3, i64 %287, !dbg !69
+  %289 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !70
+  %290 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %282, i64 %289, i1 %260) #6, !dbg !70
+  %291 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !70
+  %292 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %284, i64 %291, i1 %260) #6, !dbg !70
+  %293 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !70
+  %294 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %286, i64 %293, i1 %260) #6, !dbg !70
+  %295 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !70
+  %296 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %288, i64 %295, i1 %260) #6, !dbg !70
+  %297 = fmul float %.0.i8, %61, !dbg !71
+  %298 = fmul float %.0.i8, %62, !dbg !71
+  %299 = fmul float %.0.i8, %63, !dbg !71
+  %300 = fmul float %.0.i8, %64, !dbg !71
+  %301 = fmul float %297, %132, !dbg !72
+  %302 = fmul float %298, %133, !dbg !72
+  %303 = fmul float %299, %134, !dbg !72
+  %304 = fmul float %300, %135, !dbg !72
+  %305 = fmul float %301, %146, !dbg !73
+  %306 = fmul float %302, %147, !dbg !73
+  %307 = fmul float %303, %148, !dbg !73
+  %308 = fmul float %304, %149, !dbg !73
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !73
+  store float %305, ptr addrspace(3) %163, align 4, !dbg !73
+  store float %306, ptr addrspace(3) %166, align 4, !dbg !73
+  store float %307, ptr addrspace(3) %169, align 4, !dbg !73
+  store float %308, ptr addrspace(3) %172, align 4, !dbg !73
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !73
+  %309 = load float, ptr addrspace(3) %177, align 4, !dbg !73
+  %310 = load float, ptr addrspace(3) %180, align 4, !dbg !73
+  %311 = load float, ptr addrspace(3) %183, align 4, !dbg !73
+  %312 = load float, ptr addrspace(3) %186, align 4, !dbg !73
+  %313 = or i32 %22, 4193, !dbg !74
+  %314 = add i32 %204, 4097, !dbg !75
+  %315 = or disjoint i32 %314, %194, !dbg !76
+  %316 = add i32 %204, 4129, !dbg !75
+  %317 = or disjoint i32 %316, %194, !dbg !76
+  %318 = add i32 %204, 4161, !dbg !75
+  %319 = or disjoint i32 %318, %194, !dbg !76
+  %320 = add i32 %204, %313, !dbg !76
+  %321 = sext i32 %315 to i64, !dbg !77
+  %322 = getelementptr bfloat, ptr addrspace(1) %2, i64 %321, !dbg !77
+  %323 = sext i32 %317 to i64, !dbg !77
+  %324 = getelementptr bfloat, ptr addrspace(1) %2, i64 %323, !dbg !77
+  %325 = sext i32 %319 to i64, !dbg !77
+  %326 = getelementptr bfloat, ptr addrspace(1) %2, i64 %325, !dbg !77
+  %327 = sext i32 %320 to i64, !dbg !77
+  %328 = getelementptr bfloat, ptr addrspace(1) %2, i64 %327, !dbg !77
+  %329 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !78
+  %330 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %322, i64 %329, i1 %193) #6, !dbg !78
+  %331 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !78
+  %332 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %324, i64 %331, i1 %193) #6, !dbg !78
+  %333 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !78
+  %334 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %326, i64 %333, i1 %193) #6, !dbg !78
+  %335 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !78
+  %336 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %328, i64 %335, i1 %193) #6, !dbg !78
+  %337 = tail call float @llvm.nvvm.div.full(float %95, float 1.280000e+02), !dbg !79
+  %338 = fadd float %337, 0x3EB0C6F7A0000000, !dbg !80
+  %339 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !81
+  %.not.i10 = icmp eq i32 %339, 0, !dbg !81
+  br i1 %.not.i10, label %342, label %340, !dbg !81
+
+340:                                              ; preds = %__nv_rsqrtf.exit9
+  %341 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %338), !dbg !81
+  br label %__nv_rsqrtf.exit12, !dbg !81
+
+342:                                              ; preds = %__nv_rsqrtf.exit9
+  %343 = tail call float @llvm.nvvm.rsqrt.approx.f(float %338), !dbg !81
+  br label %__nv_rsqrtf.exit12, !dbg !81
+
+__nv_rsqrtf.exit12:                               ; preds = %340, %342
+  %.0.i11 = phi float [ %341, %340 ], [ %343, %342 ], !dbg !81
+  %344 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !81
+  %345 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !81
+  %346 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !81
+  %.not.i19 = icmp eq i32 %346, 0, !dbg !81
+  br i1 %.not.i19, label %349, label %347, !dbg !81
+
+347:                                              ; preds = %__nv_rsqrtf.exit12
+  %348 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %338), !dbg !81
+  br label %__nv_rsqrtf.exit21, !dbg !81
+
+349:                                              ; preds = %__nv_rsqrtf.exit12
+  %350 = tail call float @llvm.nvvm.rsqrt.approx.f(float %338), !dbg !81
+  br label %__nv_rsqrtf.exit21, !dbg !81
+
+__nv_rsqrtf.exit21:                               ; preds = %347, %349
+  %.0.i20 = phi float [ %348, %347 ], [ %350, %349 ], !dbg !81
+  %351 = bitcast i16 %336 to bfloat, !dbg !78
+  %352 = fpext bfloat %351 to float, !dbg !82
+  %353 = bitcast i16 %334 to bfloat, !dbg !78
+  %354 = fpext bfloat %353 to float, !dbg !82
+  %355 = bitcast i16 %332 to bfloat, !dbg !78
+  %356 = fpext bfloat %355 to float, !dbg !82
+  %357 = bitcast i16 %330 to bfloat, !dbg !78
+  %358 = fpext bfloat %357 to float, !dbg !82
+  %359 = bitcast i16 %224 to bfloat, !dbg !58
+  %360 = fpext bfloat %359 to float, !dbg !83
+  %361 = fmul float %243, %360, !dbg !62
+  %362 = bitcast i16 %259 to bfloat, !dbg !64
+  %363 = fpext bfloat %362 to float, !dbg !84
+  %364 = fmul float %361, %363, !dbg !85
+  %365 = fsub float 0.000000e+00, %364, !dbg !86
+  %366 = bitcast i16 %280 to bfloat, !dbg !68
+  %367 = fpext bfloat %366 to float, !dbg !87
+  %368 = fmul float %243, %367, !dbg !88
+  %369 = bitcast i16 %296 to bfloat, !dbg !70
+  %370 = fpext bfloat %369 to float, !dbg !89
+  %371 = fmul float %368, %370, !dbg !90
+  %372 = select i1 %193, float %365, float %371, !dbg !91
+  %373 = fmul float %187, %372, !dbg !92
+  %374 = fadd float %373, %312, !dbg !93
+  %375 = bitcast i16 %222 to bfloat, !dbg !58
+  %376 = fpext bfloat %375 to float, !dbg !83
+  %377 = fmul float %243, %376, !dbg !62
+  %378 = bitcast i16 %257 to bfloat, !dbg !64
+  %379 = fpext bfloat %378 to float, !dbg !84
+  %380 = fmul float %377, %379, !dbg !85
+  %381 = fsub float 0.000000e+00, %380, !dbg !86
+  %382 = bitcast i16 %278 to bfloat, !dbg !68
+  %383 = fpext bfloat %382 to float, !dbg !87
+  %384 = fmul float %243, %383, !dbg !88
+  %385 = bitcast i16 %294 to bfloat, !dbg !70
+  %386 = fpext bfloat %385 to float, !dbg !89
+  %387 = fmul float %384, %386, !dbg !90
+  %388 = select i1 %193, float %381, float %387, !dbg !91
+  %389 = fmul float %184, %388, !dbg !92
+  %390 = fadd float %389, %311, !dbg !93
+  %391 = bitcast i16 %220 to bfloat, !dbg !58
+  %392 = fpext bfloat %391 to float, !dbg !83
+  %393 = fmul float %243, %392, !dbg !62
+  %394 = bitcast i16 %255 to bfloat, !dbg !64
+  %395 = fpext bfloat %394 to float, !dbg !84
+  %396 = fmul float %393, %395, !dbg !85
+  %397 = fsub float 0.000000e+00, %396, !dbg !86
+  %398 = bitcast i16 %276 to bfloat, !dbg !68
+  %399 = fpext bfloat %398 to float, !dbg !87
+  %400 = fmul float %243, %399, !dbg !88
+  %401 = bitcast i16 %292 to bfloat, !dbg !70
+  %402 = fpext bfloat %401 to float, !dbg !89
+  %403 = fmul float %400, %402, !dbg !90
+  %404 = select i1 %193, float %397, float %403, !dbg !91
+  %405 = fmul float %181, %404, !dbg !92
+  %406 = fadd float %405, %310, !dbg !93
+  %407 = bitcast i16 %218 to bfloat, !dbg !58
+  %408 = fpext bfloat %407 to float, !dbg !83
+  %409 = fmul float %243, %408, !dbg !62
+  %410 = bitcast i16 %253 to bfloat, !dbg !64
+  %411 = fpext bfloat %410 to float, !dbg !84
+  %412 = fmul float %409, %411, !dbg !85
+  %413 = fsub float 0.000000e+00, %412, !dbg !86
+  %414 = bitcast i16 %274 to bfloat, !dbg !68
+  %415 = fpext bfloat %414 to float, !dbg !87
+  %416 = fmul float %243, %415, !dbg !88
+  %417 = bitcast i16 %290 to bfloat, !dbg !70
+  %418 = fpext bfloat %417 to float, !dbg !89
+  %419 = fmul float %416, %418, !dbg !90
+  %420 = select i1 %193, float %413, float %419, !dbg !91
+  %421 = fmul float %178, %420, !dbg !92
+  %422 = fadd float %421, %309, !dbg !93
+  %423 = extractvalue { i32, i32 } %192, 1, !dbg !51
+  %424 = bitcast i32 %423 to <2 x bfloat>, !dbg !51
+  %425 = extractelement <2 x bfloat> %424, i64 1, !dbg !51
+  %426 = fpext bfloat %425 to float, !dbg !94
+  %427 = extractelement <2 x bfloat> %424, i64 0, !dbg !51
+  %428 = fpext bfloat %427 to float, !dbg !94
+  %429 = extractvalue { i32, i32 } %192, 0, !dbg !51
+  %430 = bitcast i32 %429 to <2 x bfloat>, !dbg !51
+  %431 = extractelement <2 x bfloat> %430, i64 1, !dbg !51
+  %432 = fpext bfloat %431 to float, !dbg !94
+  %433 = extractelement <2 x bfloat> %430, i64 0, !dbg !51
+  %434 = fpext bfloat %433 to float, !dbg !94
+  %435 = extractvalue { i32, i32 } %189, 1, !dbg !49
+  %436 = bitcast i32 %435 to <2 x bfloat>, !dbg !49
+  %437 = extractelement <2 x bfloat> %436, i64 1, !dbg !49
+  %438 = fpext bfloat %437 to float, !dbg !95
+  %439 = extractelement <2 x bfloat> %436, i64 0, !dbg !49
+  %440 = fpext bfloat %439 to float, !dbg !95
+  %441 = extractvalue { i32, i32 } %189, 0, !dbg !49
+  %442 = bitcast i32 %441 to <2 x bfloat>, !dbg !49
+  %443 = extractelement <2 x bfloat> %442, i64 1, !dbg !49
+  %444 = fpext bfloat %443 to float, !dbg !95
+  %445 = extractelement <2 x bfloat> %442, i64 0, !dbg !49
+  %446 = fpext bfloat %445 to float, !dbg !95
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !96
+  store float %.0.i11, ptr addrspace(3) %240, align 4, !dbg !96
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !96
+  %447 = load float, ptr addrspace(3) %242, align 4, !dbg !96
+  %448 = fmul float %447, %358, !dbg !96
+  %449 = fmul float %447, %356, !dbg !96
+  %450 = fmul float %447, %354, !dbg !96
+  %451 = fmul float %447, %352, !dbg !96
+  %452 = getelementptr bfloat, ptr addrspace(1) %6, i64 %244, !dbg !97
+  %453 = getelementptr bfloat, ptr addrspace(1) %6, i64 %246, !dbg !97
+  %454 = getelementptr bfloat, ptr addrspace(1) %6, i64 %248, !dbg !97
+  %455 = getelementptr bfloat, ptr addrspace(1) %6, i64 %250, !dbg !97
+  %456 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !98
+  %457 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %452, i64 %456, i1 %193) #6, !dbg !98
+  %458 = bitcast i16 %457 to bfloat, !dbg !98
+  %459 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !98
+  %460 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %453, i64 %459, i1 %193) #6, !dbg !98
+  %461 = bitcast i16 %460 to bfloat, !dbg !98
+  %462 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !98
+  %463 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %454, i64 %462, i1 %193) #6, !dbg !98
+  %464 = bitcast i16 %463 to bfloat, !dbg !98
+  %465 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !98
+  %466 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %455, i64 %465, i1 %193) #6, !dbg !98
+  %467 = bitcast i16 %466 to bfloat, !dbg !98
+  %468 = fpext bfloat %458 to float, !dbg !99
+  %469 = fpext bfloat %461 to float, !dbg !99
+  %470 = fpext bfloat %464 to float, !dbg !99
+  %471 = fpext bfloat %467 to float, !dbg !99
+  %472 = fmul float %448, %468, !dbg !100
+  %473 = fmul float %449, %469, !dbg !100
+  %474 = fmul float %450, %470, !dbg !100
+  %475 = fmul float %451, %471, !dbg !100
+  %476 = fsub float 0.000000e+00, %472, !dbg !101
+  %477 = fsub float 0.000000e+00, %473, !dbg !101
+  %478 = fsub float 0.000000e+00, %474, !dbg !101
+  %479 = fsub float 0.000000e+00, %475, !dbg !101
+  %480 = add i32 %204, 4096, !dbg !102
+  %481 = or disjoint i32 %480, %194, !dbg !103
+  %482 = add i32 %204, 4128, !dbg !102
+  %483 = or disjoint i32 %482, %194, !dbg !103
+  %484 = add i32 %204, 4160, !dbg !102
+  %485 = or disjoint i32 %484, %194, !dbg !103
+  %486 = add i32 %204, 4192, !dbg !102
+  %487 = or disjoint i32 %486, %194, !dbg !103
+  %488 = sext i32 %481 to i64, !dbg !104
+  %489 = getelementptr bfloat, ptr addrspace(1) %2, i64 %488, !dbg !104
+  %490 = sext i32 %483 to i64, !dbg !104
+  %491 = getelementptr bfloat, ptr addrspace(1) %2, i64 %490, !dbg !104
+  %492 = sext i32 %485 to i64, !dbg !104
+  %493 = getelementptr bfloat, ptr addrspace(1) %2, i64 %492, !dbg !104
+  %494 = sext i32 %487 to i64, !dbg !104
+  %495 = getelementptr bfloat, ptr addrspace(1) %2, i64 %494, !dbg !104
+  %496 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105
+  %497 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %489, i64 %496, i1 %260) #6, !dbg !105
+  %498 = bitcast i16 %497 to bfloat, !dbg !105
+  %499 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105
+  %500 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %491, i64 %499, i1 %260) #6, !dbg !105
+  %501 = bitcast i16 %500 to bfloat, !dbg !105
+  %502 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105
+  %503 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %493, i64 %502, i1 %260) #6, !dbg !105
+  %504 = bitcast i16 %503 to bfloat, !dbg !105
+  %505 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105
+  %506 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %495, i64 %505, i1 %260) #6, !dbg !105
+  %507 = bitcast i16 %506 to bfloat, !dbg !105
+  %508 = fpext bfloat %498 to float, !dbg !106
+  %509 = fpext bfloat %501 to float, !dbg !106
+  %510 = fpext bfloat %504 to float, !dbg !106
+  %511 = fpext bfloat %507 to float, !dbg !106
+  %512 = fmul float %447, %508, !dbg !107
+  %513 = fmul float %447, %509, !dbg !107
+  %514 = fmul float %447, %510, !dbg !107
+  %515 = fmul float %447, %511, !dbg !107
+  %516 = getelementptr bfloat, ptr addrspace(1) %6, i64 %281, !dbg !108
+  %517 = getelementptr bfloat, ptr addrspace(1) %6, i64 %283, !dbg !108
+  %518 = getelementptr bfloat, ptr addrspace(1) %6, i64 %285, !dbg !108
+  %519 = getelementptr bfloat, ptr addrspace(1) %6, i64 %287, !dbg !108
+  %520 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109
+  %521 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %516, i64 %520, i1 %260) #6, !dbg !109
+  %522 = bitcast i16 %521 to bfloat, !dbg !109
+  %523 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109
+  %524 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %517, i64 %523, i1 %260) #6, !dbg !109
+  %525 = bitcast i16 %524 to bfloat, !dbg !109
+  %526 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109
+  %527 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %518, i64 %526, i1 %260) #6, !dbg !109
+  %528 = bitcast i16 %527 to bfloat, !dbg !109
+  %529 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109
+  %530 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %519, i64 %529, i1 %260) #6, !dbg !109
+  %531 = bitcast i16 %530 to bfloat, !dbg !109
+  %532 = fpext bfloat %522 to float, !dbg !110
+  %533 = fpext bfloat %525 to float, !dbg !110
+  %534 = fpext bfloat %528 to float, !dbg !110
+  %535 = fpext bfloat %531 to float, !dbg !110
+  %536 = fmul float %512, %532, !dbg !111
+  %537 = fmul float %513, %533, !dbg !111
+  %538 = fmul float %514, %534, !dbg !111
+  %539 = fmul float %515, %535, !dbg !111
+  %540 = select i1 %193, float %476, float %536, !dbg !91
+  %541 = select i1 %193, float %477, float %537, !dbg !91
+  %542 = select i1 %193, float %478, float %538, !dbg !91
+  %543 = select i1 %193, float %479, float %539, !dbg !91
+  %544 = fmul float %.0.i20, %446, !dbg !112
+  %545 = fmul float %.0.i20, %444, !dbg !112
+  %546 = fmul float %.0.i20, %440, !dbg !112
+  %547 = fmul float %.0.i20, %438, !dbg !112
+  %548 = fmul float %544, %434, !dbg !113
+  %549 = fmul float %545, %432, !dbg !113
+  %550 = fmul float %546, %428, !dbg !113
+  %551 = fmul float %547, %426, !dbg !113
+  %552 = fmul float %548, %146, !dbg !114
+  %553 = fmul float %549, %147, !dbg !114
+  %554 = fmul float %550, %148, !dbg !114
+  %555 = fmul float %551, %149, !dbg !114
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !114
+  store float %552, ptr addrspace(3) %163, align 4, !dbg !114
+  store float %553, ptr addrspace(3) %166, align 4, !dbg !114
+  store float %554, ptr addrspace(3) %169, align 4, !dbg !114
+  store float %555, ptr addrspace(3) %172, align 4, !dbg !114
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !114
+  %556 = load float, ptr addrspace(3) %177, align 4, !dbg !114
+  %557 = load float, ptr addrspace(3) %180, align 4, !dbg !114
+  %558 = load float, ptr addrspace(3) %183, align 4, !dbg !114
+  %559 = load float, ptr addrspace(3) %186, align 4, !dbg !114
+  %560 = fmul float %178, %540, !dbg !115
+  %561 = fmul float %181, %541, !dbg !115
+  %562 = fmul float %184, %542, !dbg !115
+  %563 = fmul float %187, %543, !dbg !115
+  %564 = fadd float %560, %556, !dbg !116
+  %565 = fadd float %561, %557, !dbg !116
+  %566 = fadd float %562, %558, !dbg !116
+  %567 = fadd float %563, %559, !dbg !116
+  %568 = shl i32 %18, 7, !dbg !117
+  %569 = or disjoint i32 %568, %21, !dbg !118
+  %570 = sext i32 %569 to i64, !dbg !119
+  %571 = getelementptr bfloat, ptr addrspace(1) %0, i64 %570, !dbg !119
+  %572 = fptrunc float %422 to bfloat, !dbg !120
+  %573 = fptrunc float %406 to bfloat, !dbg !120
+  %574 = fptrunc float %390 to bfloat, !dbg !120
+  %575 = fptrunc float %374 to bfloat, !dbg !120
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !120
+  %576 = and i32 %14, 15, !dbg !120
+  %577 = shl nuw nsw i32 %576, 7, !dbg !120
+  %578 = shl nuw nsw i32 %576, 3, !dbg !120
+  %579 = and i32 %22, 24, !dbg !120
+  %580 = lshr i32 %14, 2, !dbg !120
+  %581 = and i32 %580, 4, !dbg !120
+  %582 = lshr i32 %14, 4, !dbg !120
+  %583 = and i32 %582, 2, !dbg !120
+  %584 = or disjoint i32 %577, %581, !dbg !120
+  %585 = or disjoint i32 %584, %583, !dbg !120
+  %586 = xor i32 %578, %579, !dbg !120
+  %587 = or disjoint i32 %585, %586, !dbg !120
+  %588 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %587, !dbg !120
+  store bfloat %572, ptr addrspace(3) %588, align 2, !dbg !120
+  %589 = xor i32 %587, 32, !dbg !120
+  %590 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %589, !dbg !120
+  store bfloat %573, ptr addrspace(3) %590, align 2, !dbg !120
+  %591 = xor i32 %587, 64, !dbg !120
+  %592 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %591, !dbg !120
+  store bfloat %574, ptr addrspace(3) %592, align 2, !dbg !120
+  %593 = xor i32 %587, 96, !dbg !120
+  %594 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %593, !dbg !120
+  store bfloat %575, ptr addrspace(3) %594, align 2, !dbg !120
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !120
+  %595 = shl nuw nsw i32 %14, 2, !dbg !120
+  %596 = and i32 %595, 1016, !dbg !120
+  %597 = lshr exact i32 %15, 2, !dbg !120
+  %598 = shl nuw nsw i32 %14, 1, !dbg !120
+  %599 = and i32 %598, 2, !dbg !120
+  %600 = xor i32 %596, %597, !dbg !120
+  %601 = or disjoint i32 %600, %599, !dbg !120
+  %602 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %601, !dbg !120
+  %603 = load bfloat, ptr addrspace(3) %602, align 2, !dbg !120
+  %604 = getelementptr inbounds nuw i8, ptr addrspace(3) %602, i32 4, !dbg !120
+  %605 = load bfloat, ptr addrspace(3) %604, align 2, !dbg !120
+  %606 = xor i32 %601, 1088, !dbg !120
+  %607 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %606, !dbg !120
+  %608 = load bfloat, ptr addrspace(3) %607, align 2, !dbg !120
+  %609 = getelementptr inbounds nuw i8, ptr addrspace(3) %607, i32 4, !dbg !120
+  %610 = load bfloat, ptr addrspace(3) %609, align 2, !dbg !120
+  %611 = insertelement <2 x bfloat> poison, bfloat %603, i64 0, !dbg !120
+  %612 = insertelement <2 x bfloat> %611, bfloat %608, i64 1, !dbg !120
+  %613 = bitcast <2 x bfloat> %612 to i32, !dbg !120
+  %614 = insertelement <2 x bfloat> poison, bfloat %605, i64 0, !dbg !120
+  %615 = insertelement <2 x bfloat> %614, bfloat %610, i64 1, !dbg !120
+  %616 = bitcast <2 x bfloat> %615 to i32, !dbg !120
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %613, i32 %616, ptr addrspace(1) %571, i1 true) #6, !dbg !120
+  %617 = getelementptr bfloat, ptr addrspace(1) %1, i64 %570, !dbg !121
+  %618 = fptrunc float %564 to bfloat, !dbg !122
+  %619 = fptrunc float %565 to bfloat, !dbg !122
+  %620 = fptrunc float %566 to bfloat, !dbg !122
+  %621 = fptrunc float %567 to bfloat, !dbg !122
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !122
+  store bfloat %618, ptr addrspace(3) %588, align 2, !dbg !122
+  store bfloat %619, ptr addrspace(3) %590, align 2, !dbg !122
+  store bfloat %620, ptr addrspace(3) %592, align 2, !dbg !122
+  store bfloat %621, ptr addrspace(3) %594, align 2, !dbg !122
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !122
+  %622 = load bfloat, ptr addrspace(3) %602, align 2, !dbg !122
+  %623 = load bfloat, ptr addrspace(3) %604, align 2, !dbg !122
+  %624 = load bfloat, ptr addrspace(3) %607, align 2, !dbg !122
+  %625 = load bfloat, ptr addrspace(3) %609, align 2, !dbg !122
+  %626 = insertelement <2 x bfloat> poison, bfloat %622, i64 0, !dbg !122
+  %627 = insertelement <2 x bfloat> %626, bfloat %624, i64 1, !dbg !122
+  %628 = bitcast <2 x bfloat> %627 to i32, !dbg !122
+  %629 = insertelement <2 x bfloat> poison, bfloat %623, i64 0, !dbg !122
+  %630 = insertelement <2 x bfloat> %629, bfloat %625, i64 1, !dbg !122
+  %631 = bitcast <2 x bfloat> %630 to i32, !dbg !122
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %628, i32 %631, ptr addrspace(1) %617, i1 true) #6, !dbg !122
+  ret void, !dbg !123
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #4
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #4
+
+attributes #0 = { nounwind "nvvm.reqntid"="256" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { convergent nocallback nounwind }
+attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!5 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", linkageName: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 23, column: 28, scope: !5)
+!9 = !DILocation(line: 23, column: 33, scope: !5)
+!10 = !DILocation(line: 24, column: 44, scope: !5)
+!11 = !DILocation(line: 24, column: 23, scope: !5)
+!12 = !DILocation(line: 26, column: 37, scope: !5)
+!13 = !DILocation(line: 29, column: 19, scope: !5)
+!14 = !DILocation(line: 28, column: 19, scope: !5)
+!15 = !DILocation(line: 39, column: 41, scope: !5)
+!16 = !DILocation(line: 39, column: 52, scope: !5)
+!17 = !DILocation(line: 39, column: 48, scope: !5)
+!18 = !DILocation(line: 39, column: 63, scope: !5)
+!19 = !DILocation(line: 39, column: 57, scope: !5)
+!20 = !DILocation(line: 39, column: 34, scope: !5)
+!21 = !DILocation(line: 39, column: 68, scope: !5)
+!22 = !DILocation(line: 39, column: 121, scope: !5)
+!23 = !DILocation(line: 40, column: 41, scope: !5)
+!24 = !DILocation(line: 40, column: 50, scope: !5)
+!25 = !DILocation(line: 40, column: 34, scope: !5)
+!26 = !DILocation(line: 40, column: 61, scope: !5)
+!27 = !DILocation(line: 40, column: 114, scope: !5)
+!28 = !DILocation(line: 42, column: 22, scope: !5)
+!29 = !DILocation(line: 47, column: 22, scope: !5)
+!30 = !DILocation(line: 263, column: 15, scope: !31, inlinedAt: !33)
+!31 = distinct !DILexicalBlockFile(scope: !5, file: !32, discriminator: 0)
+!32 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!33 = !DILocation(line: 293, column: 36, scope: !31, inlinedAt: !34)
+!34 = !DILocation(line: 51, column: 25, scope: !35)
+!35 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0)
+!36 = !DILocation(line: 263, column: 15, scope: !31, inlinedAt: !37)
+!37 = !DILocation(line: 293, column: 36, scope: !31, inlinedAt: !38)
+!38 = !DILocation(line: 52, column: 27, scope: !35)
+!39 = !DILocation(line: 58, column: 27, scope: !5)
+!40 = !DILocation(line: 62, column: 35, scope: !5)
+!41 = !DILocation(line: 62, column: 42, scope: !5)
+!42 = !DILocation(line: 62, column: 95, scope: !5)
+!43 = !DILocation(line: 63, column: 46, scope: !5)
+!44 = !DILocation(line: 63, column: 42, scope: !5)
+!45 = !DILocation(line: 63, column: 35, scope: !5)
+!46 = !DILocation(line: 63, column: 51, scope: !5)
+!47 = !DILocation(line: 64, column: 35, scope: !5)
+!48 = !DILocation(line: 64, column: 51, scope: !5)
+!49 = !DILocation(line: 65, column: 69, scope: !5)
+!50 = !DILocation(line: 66, column: 36, scope: !5)
+!51 = !DILocation(line: 66, column: 43, scope: !5)
+!52 = !DILocation(line: 71, column: 24, scope: !5)
+!53 = !DILocation(line: 72, column: 41, scope: !5)
+!54 = !DILocation(line: 72, column: 39, scope: !5)
+!55 = !DILocation(line: 72, column: 48, scope: !5)
+!56 = !DILocation(line: 72, column: 57, scope: !5)
+!57 = !DILocation(line: 72, column: 35, scope: !5)
+!58 = !DILocation(line: 72, column: 68, scope: !5)
+!59 = !DILocation(line: 75, column: 25, scope: !5)
+!60 = !DILocation(line: 77, column: 24, scope: !5)
+!61 = !DILocation(line: 78, column: 32, scope: !5)
+!62 = !DILocation(line: 79, column: 24, scope: !5)
+!63 = !DILocation(line: 80, column: 35, scope: !5)
+!64 = !DILocation(line: 80, column: 85, scope: !5)
+!65 = !DILocation(line: 87, column: 25, scope: !5)
+!66 = !DILocation(line: 90, column: 53, scope: !5)
+!67 = !DILocation(line: 90, column: 35, scope: !5)
+!68 = !DILocation(line: 90, column: 64, scope: !5)
+!69 = !DILocation(line: 98, column: 35, scope: !5)
+!70 = !DILocation(line: 98, column: 81, scope: !5)
+!71 = !DILocation(line: 111, column: 24, scope: !5)
+!72 = !DILocation(line: 113, column: 24, scope: !5)
+!73 = !DILocation(line: 116, column: 24, scope: !5)
+!74 = !DILocation(line: 121, column: 42, scope: !5)
+!75 = !DILocation(line: 121, column: 51, scope: !5)
+!76 = !DILocation(line: 121, column: 60, scope: !5)
+!77 = !DILocation(line: 121, column: 35, scope: !5)
+!78 = !DILocation(line: 121, column: 71, scope: !5)
+!79 = !DILocation(line: 123, column: 24, scope: !5)
+!80 = !DILocation(line: 124, column: 24, scope: !5)
+!81 = !DILocation(line: 125, column: 32, scope: !5)
+!82 = !DILocation(line: 121, column: 132, scope: !5)
+!83 = !DILocation(line: 72, column: 129, scope: !5)
+!84 = !DILocation(line: 80, column: 146, scope: !5)
+!85 = !DILocation(line: 82, column: 24, scope: !5)
+!86 = !DILocation(line: 84, column: 17, scope: !5)
+!87 = !DILocation(line: 90, column: 125, scope: !5)
+!88 = !DILocation(line: 97, column: 24, scope: !5)
+!89 = !DILocation(line: 98, column: 142, scope: !5)
+!90 = !DILocation(line: 100, column: 24, scope: !5)
+!91 = !DILocation(line: 0, scope: !5)
+!92 = !DILocation(line: 118, column: 24, scope: !5)
+!93 = !DILocation(line: 119, column: 24, scope: !5)
+!94 = !DILocation(line: 66, column: 96, scope: !5)
+!95 = !DILocation(line: 65, column: 123, scope: !5)
+!96 = !DILocation(line: 126, column: 24, scope: !5)
+!97 = !DILocation(line: 127, column: 35, scope: !5)
+!98 = !DILocation(line: 127, column: 85, scope: !5)
+!99 = !DILocation(line: 127, column: 146, scope: !5)
+!100 = !DILocation(line: 129, column: 24, scope: !5)
+!101 = !DILocation(line: 131, column: 17, scope: !5)
+!102 = !DILocation(line: 134, column: 51, scope: !5)
+!103 = !DILocation(line: 134, column: 60, scope: !5)
+!104 = !DILocation(line: 134, column: 35, scope: !5)
+!105 = !DILocation(line: 134, column: 71, scope: !5)
+!106 = !DILocation(line: 134, column: 132, scope: !5)
+!107 = !DILocation(line: 139, column: 24, scope: !5)
+!108 = !DILocation(line: 140, column: 35, scope: !5)
+!109 = !DILocation(line: 140, column: 81, scope: !5)
+!110 = !DILocation(line: 140, column: 142, scope: !5)
+!111 = !DILocation(line: 142, column: 24, scope: !5)
+!112 = !DILocation(line: 151, column: 25, scope: !5)
+!113 = !DILocation(line: 153, column: 26, scope: !5)
+!114 = !DILocation(line: 156, column: 26, scope: !5)
+!115 = !DILocation(line: 158, column: 26, scope: !5)
+!116 = !DILocation(line: 159, column: 26, scope: !5)
+!117 = !DILocation(line: 161, column: 43, scope: !5)
+!118 = !DILocation(line: 161, column: 39, scope: !5)
+!119 = !DILocation(line: 161, column: 32, scope: !5)
+!120 = !DILocation(line: 161, column: 55, scope: !5)
+!121 = !DILocation(line: 162, column: 32, scope: !5)
+!122 = !DILocation(line: 162, column: 56, scope: !5)
+!123 = !DILocation(line: 53, column: 4, scope: !5)
diff --git a/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..fdd894cfc5a02fcb56e2bd11319824abe46ba5ea
--- /dev/null
+++ b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx
@@ -0,0 +1,1404 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 // -- Begin function triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0
+.extern .shared .align 16 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
+                                        // @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0
+.visible .entry triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6,
+	.param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_7,
+	.param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_8,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_9,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_10
+)
+.reqntid 256
+{
+	.reg .pred 	%p<4>;
+	.reg .b16 	%rs<70>;
+	.reg .b32 	%r<306>;
+	.reg .b64 	%rd<97>;
+	.loc	1 18 0                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0
+
+// %bb.0:                               // %__nv_rsqrtf.exit
+	ld.param.b64 	%rd80, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0];
+	ld.param.b64 	%rd81, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1];
+$L__tmp0:
+	.loc	1 23 28                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:28
+	mov.u32 	%r24, %ctaid.x;
+	.loc	1 23 33                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:33
+	shl.b32 	%r25, %r24, 3;
+	ld.param.b64 	%rd82, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2];
+	ld.param.b64 	%rd83, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3];
+	.loc	1 24 44                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44
+	mov.u32 	%r26, %tid.x;
+	and.b32 	%r27, %r26, 224;
+	ld.param.b64 	%rd84, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4];
+	bfe.u32 	%r28, %r26, 5, 3;
+	ld.param.b64 	%rd85, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5];
+	and.b32 	%r29, %r26, 7;
+	ld.param.b64 	%rd86, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6];
+	.loc	1 24 23                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23
+	or.b32 	%r30, %r28, %r25;
+	or.b32 	%r31, %r25, %r29;
+	.loc	1 26 37                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37
+	and.b32 	%r32, %r26, 31;
+	shl.b32 	%r33, %r32, 2;
+	shr.u32 	%r34, %r26, 3;
+	.loc	1 29 19                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19
+	bfe.s32 	%r35, %r24, 28, 1;
+	shr.u32 	%r36, %r35, 27;
+	add.s32 	%r37, %r30, %r36;
+	shr.s32 	%r38, %r37, 5;
+	.loc	1 28 19                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:28:19
+	and.b32 	%r39, %r37, 33554400;
+	sub.s32 	%r40, %r30, %r39;
+	.loc	1 29 19                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19
+	add.s32 	%r41, %r31, %r36;
+	.loc	1 39 52                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:52
+	shl.b32 	%r42, %r40, 7;
+	.loc	1 39 48                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:48
+	or.b32 	%r43, %r42, %r33;
+	mad.lo.s32 	%r44, %r38, 36864, %r43;
+	.loc	1 39 57                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:57
+	add.s32 	%r45, %r44, 4096;
+	.loc	1 39 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34
+	mad.wide.s32 	%rd1, %r45, 2, %rd82;
+	.loc	1 39 68                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r3, 0;
+	mov.pred 	%p1, -1;
+	// begin inline asm
+	mov.u32 %r1, %r3;
+	mov.u32 %r2, %r3;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	mov.b32 	{%rs34, %rs35}, %r1;
+	mov.b32 	{%rs36, %rs37}, %r2;
+	.loc	1 39 121                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:121
+	cvt.f32.bf16 	%r46, %rs34;
+	cvt.f32.bf16 	%r47, %rs35;
+	cvt.f32.bf16 	%r48, %rs36;
+	cvt.f32.bf16 	%r49, %rs37;
+	.loc	1 40 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34
+	mad.wide.s32 	%rd3, %r44, 2, %rd82;
+	.loc	1 40 61                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r4, %r3;
+	mov.u32 %r5, %r3;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r4, %r5 }, [ %rd3 + 0 ], %rd4;
+	// end inline asm
+	mov.b32 	{%rs38, %rs39}, %r4;
+	mov.b32 	{%rs40, %rs41}, %r5;
+	.loc	1 40 114                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114
+	cvt.f32.bf16 	%r50, %rs38;
+	cvt.f32.bf16 	%r51, %rs39;
+	cvt.f32.bf16 	%r52, %rs40;
+	cvt.f32.bf16 	%r53, %rs41;
+	.loc	1 42 22                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:42:22
+	mul.f32 	%r54, %r47, %r47;
+	.loc	1 47 22                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:47:22
+	mul.f32 	%r55, %r51, %r51;
+$L__tmp1:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	fma.rn.f32 	%r56, %r46, %r46, %r54;
+	fma.rn.f32 	%r57, %r48, %r48, %r56;
+	fma.rn.f32 	%r58, %r49, %r49, %r57;
+$L__tmp2:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r59, %r58, 16, 31, -1;
+$L__tmp3:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r60, %r58, %r59;
+$L__tmp4:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r61, %r60, 8, 31, -1;
+$L__tmp5:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r62, %r60, %r61;
+$L__tmp6:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r63, %r62, 4, 31, -1;
+$L__tmp7:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r64, %r62, %r63;
+$L__tmp8:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r65, %r64, 2, 31, -1;
+$L__tmp9:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r66, %r64, %r65;
+$L__tmp10:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r67, %r66, 1, 31, -1;
+$L__tmp11:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r68, %r66, %r67;
+$L__tmp12:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	fma.rn.f32 	%r69, %r50, %r50, %r55;
+	fma.rn.f32 	%r70, %r52, %r52, %r69;
+	fma.rn.f32 	%r71, %r53, %r53, %r70;
+$L__tmp13:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r72, %r71, 16, 31, -1;
+$L__tmp14:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r73, %r71, %r72;
+$L__tmp15:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r74, %r73, 8, 31, -1;
+$L__tmp16:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r75, %r73, %r74;
+$L__tmp17:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r76, %r75, 4, 31, -1;
+$L__tmp18:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r77, %r75, %r76;
+$L__tmp19:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r78, %r77, 2, 31, -1;
+$L__tmp20:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r79, %r77, %r78;
+$L__tmp21:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r80, %r79, 1, 31, -1;
+$L__tmp22:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r81, %r79, %r80;
+$L__tmp23:
+	.loc	1 62 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:35
+	mul.wide.u32 	%rd87, %r33, 2;
+	add.s64 	%rd5, %rd83, %rd87;
+	.loc	1 62 42                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:42
+	// begin inline asm
+	mov.u64 %rd6, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r6, %r3;
+	mov.u32 %r7, %r3;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r6, %r7 }, [ %rd5 + 0 ], %rd6;
+	// end inline asm
+	mov.b32 	{%rs42, %rs43}, %r6;
+	mov.b32 	{%rs44, %rs45}, %r7;
+	.loc	1 62 95                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:95
+	cvt.f32.bf16 	%r82, %rs42;
+	cvt.f32.bf16 	%r83, %rs43;
+	cvt.f32.bf16 	%r84, %rs44;
+	cvt.f32.bf16 	%r85, %rs45;
+	.loc	1 63 46                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:46
+	shl.b32 	%r86, %r38, 7;
+	.loc	1 63 42                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:42
+	or.b32 	%r87, %r86, %r33;
+	.loc	1 63 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:35
+	mul.wide.s32 	%rd88, %r87, 4;
+	add.s64 	%rd7, %rd84, %rd88;
+	.loc	1 63 51                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:51
+	// begin inline asm
+	mov.u64 %rd8, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd8, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r8, %r3;
+	mov.u32 %r9, %r3;
+	mov.u32 %r10, %r3;
+	mov.u32 %r11, %r3;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r8, %r9, %r10, %r11 }, [ %rd7 + 0 ], %rd8;
+	// end inline asm
+	.loc	1 64 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:35
+	add.s64 	%rd9, %rd85, %rd88;
+	.loc	1 64 51                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:51
+	// begin inline asm
+	mov.u64 %rd10, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd10, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r12, %r3;
+	mov.u32 %r13, %r3;
+	mov.u32 %r14, %r3;
+	mov.u32 %r15, %r3;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r12, %r13, %r14, %r15 }, [ %rd9 + 0 ], %rd10;
+	// end inline asm
+	shl.b32 	%r88, %r29, 4;
+	shl.b32 	%r89, %r27, 2;
+	shr.u32 	%r90, %r26, 1;
+	and.b32 	%r91, %r90, 124;
+	or.b32 	%r92, %r88, %r89;
+	xor.b32 	%r93, %r92, %r91;
+	mov.b32 	%r94, global_smem;
+	add.s32 	%r95, %r94, %r93;
+	st.shared.b32 	[%r95], %r12;
+	xor.b32 	%r96, %r93, 4;
+	add.s32 	%r97, %r94, %r96;
+	st.shared.b32 	[%r97+1024], %r13;
+	xor.b32 	%r98, %r93, 8;
+	add.s32 	%r99, %r94, %r98;
+	st.shared.b32 	[%r99+2048], %r14;
+	xor.b32 	%r100, %r93, 12;
+	add.s32 	%r101, %r94, %r100;
+	st.shared.b32 	[%r101+3072], %r15;
+	bar.sync 	0;
+	shl.b32 	%r102, %r32, 7;
+	xor.b32 	%r103, %r88, %r91;
+	or.b32 	%r104, %r103, %r102;
+	add.s32 	%r105, %r94, %r104;
+	ld.shared.b32 	%r106, [%r105];
+	xor.b32 	%r107, %r104, 4;
+	add.s32 	%r108, %r94, %r107;
+	ld.shared.b32 	%r109, [%r108];
+	xor.b32 	%r110, %r104, 8;
+	add.s32 	%r111, %r94, %r110;
+	ld.shared.b32 	%r112, [%r111];
+	xor.b32 	%r113, %r104, 12;
+	add.s32 	%r114, %r94, %r113;
+	ld.shared.b32 	%r115, [%r114];
+	.loc	1 65 69                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69
+	// begin inline asm
+	mov.u64 %rd11, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd11, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r16, %r3;
+	mov.u32 %r17, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r16, %r17 }, [ %rd1 + 0 ], %rd11;
+	// end inline asm
+	.loc	1 66 36                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:36
+	add.s64 	%rd12, %rd86, %rd87;
+	.loc	1 66 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:43
+	// begin inline asm
+	mov.u64 %rd13, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd13, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r18, %r3;
+	mov.u32 %r19, %r3;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r18, %r19 }, [ %rd12 + 0 ], %rd13;
+	// end inline asm
+	.loc	1 71 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:71:24
+	and.b32 	%r116, %r34, 1;
+	setp.ne.b32 	%p3, %r116, 0;
+	not.pred 	%p2, %p3;
+	.loc	1 72 41                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:41
+	and.b32 	%r117, %r34, 30;
+	.loc	1 72 39                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:39
+	or.b32 	%r118, %r34, 97;
+	.loc	1 72 48                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:48
+	shl.b32 	%r119, %r31, 7;
+	shl.b32 	%r120, %r41, 10;
+	and.b32 	%r121, %r120, -32768;
+	add.s32 	%r122, %r121, %r119;
+	.loc	1 72 57                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:57
+	or.b32 	%r123, %r122, %r118;
+	.loc	1 72 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35
+	cvt.s64.s32 	%rd89, %r122;
+	cvt.u64.u32 	%rd90, %r117;
+	or.b64 	%rd91, %rd89, %rd90;
+	shl.b64 	%rd92, %rd91, 1;
+	add.s64 	%rd93, %rd82, %rd92;
+	add.s64 	%rd14, %rd93, 2;
+	add.s64 	%rd16, %rd93, 66;
+	add.s64 	%rd18, %rd93, 130;
+	mad.wide.s32 	%rd20, %r123, 2, %rd82;
+	.loc	1 72 68                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:68
+	// begin inline asm
+	mov.u64 %rd15, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd15, 1.0;
+	// end inline asm
+	mov.b16 	%rs2, 0;
+	// begin inline asm
+	mov.u16 %rs1, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs1 }, [ %rd14 + 0 ], %rd15;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd17, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs3, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs3 }, [ %rd16 + 0 ], %rd17;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd19, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd19, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs4, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs4 }, [ %rd18 + 0 ], %rd19;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd21, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd21, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs5, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs5 }, [ %rd20 + 0 ], %rd21;
+	// end inline asm
+	mov.b32 	%r124, 0f43000000;
+	.loc	1 75 25                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:75:25
+	div.full.f32 	%r125, %r81, %r124;
+	.loc	1 77 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:77:24
+	add.f32 	%r126, %r125, 0f358637BD;
+	.loc	1 78 32                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:78:32
+	rsqrt.approx.ftz.f32 	%r127, %r126;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	bar.sync 	0;
+	shr.u32 	%r128, %r27, 3;
+	add.s32 	%r129, %r94, %r128;
+	st.shared.b32 	[%r129], %r127;
+	bar.sync 	0;
+	shl.b32 	%r130, %r29, 2;
+	add.s32 	%r131, %r94, %r130;
+	ld.shared.b32 	%r132, [%r131];
+	.loc	1 80 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:35
+	mul.wide.u32 	%rd94, %r117, 2;
+	add.s64 	%rd38, %rd83, %rd94;
+	add.s64 	%rd22, %rd38, 2;
+	add.s64 	%rd24, %rd38, 66;
+	add.s64 	%rd26, %rd38, 130;
+	mul.wide.u32 	%rd95, %r118, 2;
+	add.s64 	%rd28, %rd83, %rd95;
+	.loc	1 80 85                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:85
+	// begin inline asm
+	mov.u64 %rd23, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd23, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs6, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs6 }, [ %rd22 + 0 ], %rd23;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd25, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd25, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs7, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs7 }, [ %rd24 + 0 ], %rd25;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd27, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd27, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs8, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs8 }, [ %rd26 + 0 ], %rd27;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd29, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd29, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs9, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd28 + 0 ], %rd29;
+	// end inline asm
+	.loc	1 90 53                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:53
+	or.b32 	%r133, %r122, %r117;
+	.loc	1 90 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:35
+	mad.wide.s32 	%rd30, %r133, 2, %rd82;
+	add.s64 	%rd32, %rd93, 64;
+	add.s64 	%rd34, %rd93, 128;
+	add.s64 	%rd36, %rd93, 192;
+	.loc	1 90 64                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:64
+	// begin inline asm
+	mov.u64 %rd31, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd31, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs10, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs10 }, [ %rd30 + 0 ], %rd31;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd33, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd33, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs11, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd32 + 0 ], %rd33;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd35, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd35, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs12, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd34 + 0 ], %rd35;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd37, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd37, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs13, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd36 + 0 ], %rd37;
+	// end inline asm
+	.loc	1 98 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:35
+	add.s64 	%rd40, %rd38, 64;
+	add.s64 	%rd42, %rd38, 128;
+	add.s64 	%rd44, %rd38, 192;
+	.loc	1 98 81                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:81
+	// begin inline asm
+	mov.u64 %rd39, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd39, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs14, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs14 }, [ %rd38 + 0 ], %rd39;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd41, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd41, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs15, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd40 + 0 ], %rd41;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd43, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd43, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs16, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs16 }, [ %rd42 + 0 ], %rd43;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd45, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd45, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs17, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd44 + 0 ], %rd45;
+	// end inline asm
+	.loc	1 111 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:111:24
+	mul.f32 	%r134, %r127, %r50;
+	mul.f32 	%r135, %r127, %r51;
+	mul.f32 	%r136, %r127, %r52;
+	mul.f32 	%r137, %r127, %r53;
+	.loc	1 113 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:113:24
+	mul.f32 	%r138, %r134, %r82;
+	mul.f32 	%r139, %r135, %r83;
+	mul.f32 	%r140, %r136, %r84;
+	mul.f32 	%r141, %r137, %r85;
+	.loc	1 116 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:116:24
+	mul.f32 	%r142, %r138, %r8;
+	mul.f32 	%r143, %r139, %r9;
+	mul.f32 	%r144, %r140, %r10;
+	mul.f32 	%r145, %r141, %r11;
+	bar.sync 	0;
+	st.shared.b32 	[%r95], %r142;
+	st.shared.b32 	[%r97+1024], %r143;
+	st.shared.b32 	[%r99+2048], %r144;
+	st.shared.b32 	[%r101+3072], %r145;
+	bar.sync 	0;
+	ld.shared.b32 	%r146, [%r105];
+	ld.shared.b32 	%r147, [%r108];
+	ld.shared.b32 	%r148, [%r111];
+	ld.shared.b32 	%r149, [%r114];
+	.loc	1 121 42                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:42
+	or.b32 	%r150, %r34, 4193;
+	.loc	1 121 60                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:60
+	add.s32 	%r151, %r133, 4097;
+	add.s32 	%r152, %r133, 4129;
+	add.s32 	%r153, %r133, 4161;
+	add.s32 	%r154, %r122, %r150;
+	.loc	1 121 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:35
+	mad.wide.s32 	%rd46, %r151, 2, %rd82;
+	mad.wide.s32 	%rd48, %r152, 2, %rd82;
+	mad.wide.s32 	%rd50, %r153, 2, %rd82;
+	mad.wide.s32 	%rd52, %r154, 2, %rd82;
+	.loc	1 121 71                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:71
+	// begin inline asm
+	mov.u64 %rd47, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd47, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs18, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs18 }, [ %rd46 + 0 ], %rd47;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd49, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd49, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs19, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd48 + 0 ], %rd49;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd51, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd51, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs20, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs20 }, [ %rd50 + 0 ], %rd51;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd53, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd53, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs21, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs21 }, [ %rd52 + 0 ], %rd53;
+	// end inline asm
+	.loc	1 123 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:123:24
+	div.full.f32 	%r155, %r68, %r124;
+	.loc	1 124 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:124:24
+	add.f32 	%r156, %r155, 0f358637BD;
+	.loc	1 125 32                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:125:32
+	rsqrt.approx.ftz.f32 	%r157, %r156;
+	.loc	1 121 132                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:132
+	cvt.f32.bf16 	%r158, %rs21;
+	cvt.f32.bf16 	%r159, %rs20;
+	cvt.f32.bf16 	%r160, %rs19;
+	cvt.f32.bf16 	%r161, %rs18;
+	.loc	1 72 129                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129
+	cvt.f32.bf16 	%r162, %rs5;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	mul.f32 	%r163, %r132, %r162;
+	.loc	1 80 146                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146
+	cvt.f32.bf16 	%r164, %rs9;
+	.loc	1 84 17                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17
+	neg.f32 	%r165, %r163;
+	fma.rn.f32 	%r166, %r165, %r164, 0f00000000;
+	.loc	1 90 125                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125
+	cvt.f32.bf16 	%r167, %rs13;
+	.loc	1 97 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24
+	mul.f32 	%r168, %r132, %r167;
+	.loc	1 98 142                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142
+	cvt.f32.bf16 	%r169, %rs17;
+	.loc	1 100 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24
+	mul.f32 	%r170, %r168, %r169;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r171, %r170, %r166, %p3;
+	.loc	1 119 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24
+	fma.rn.f32 	%r172, %r115, %r171, %r149;
+	.loc	1 72 129                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129
+	cvt.f32.bf16 	%r173, %rs4;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	mul.f32 	%r174, %r132, %r173;
+	.loc	1 80 146                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146
+	cvt.f32.bf16 	%r175, %rs8;
+	.loc	1 84 17                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17
+	neg.f32 	%r176, %r174;
+	fma.rn.f32 	%r177, %r176, %r175, 0f00000000;
+	.loc	1 90 125                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125
+	cvt.f32.bf16 	%r178, %rs12;
+	.loc	1 97 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24
+	mul.f32 	%r179, %r132, %r178;
+	.loc	1 98 142                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142
+	cvt.f32.bf16 	%r180, %rs16;
+	.loc	1 100 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24
+	mul.f32 	%r181, %r179, %r180;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r182, %r181, %r177, %p3;
+	.loc	1 119 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24
+	fma.rn.f32 	%r183, %r112, %r182, %r148;
+	.loc	1 72 129                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129
+	cvt.f32.bf16 	%r184, %rs3;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	mul.f32 	%r185, %r132, %r184;
+	.loc	1 80 146                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146
+	cvt.f32.bf16 	%r186, %rs7;
+	.loc	1 84 17                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17
+	neg.f32 	%r187, %r185;
+	fma.rn.f32 	%r188, %r187, %r186, 0f00000000;
+	.loc	1 90 125                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125
+	cvt.f32.bf16 	%r189, %rs11;
+	.loc	1 97 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24
+	mul.f32 	%r190, %r132, %r189;
+	.loc	1 98 142                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142
+	cvt.f32.bf16 	%r191, %rs15;
+	.loc	1 100 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24
+	mul.f32 	%r192, %r190, %r191;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r193, %r192, %r188, %p3;
+	.loc	1 119 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24
+	fma.rn.f32 	%r194, %r109, %r193, %r147;
+	.loc	1 72 129                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129
+	cvt.f32.bf16 	%r195, %rs1;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	mul.f32 	%r196, %r132, %r195;
+	.loc	1 80 146                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146
+	cvt.f32.bf16 	%r197, %rs6;
+	.loc	1 84 17                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17
+	neg.f32 	%r198, %r196;
+	fma.rn.f32 	%r199, %r198, %r197, 0f00000000;
+	.loc	1 90 125                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125
+	cvt.f32.bf16 	%r200, %rs10;
+	.loc	1 97 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24
+	mul.f32 	%r201, %r132, %r200;
+	.loc	1 98 142                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142
+	cvt.f32.bf16 	%r202, %rs14;
+	.loc	1 100 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24
+	mul.f32 	%r203, %r201, %r202;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r204, %r203, %r199, %p3;
+	.loc	1 119 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24
+	fma.rn.f32 	%r205, %r106, %r204, %r146;
+	.loc	1 66 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:43
+	mov.b32 	{%rs46, %rs47}, %r19;
+	.loc	1 66 96                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:96
+	cvt.f32.bf16 	%r206, %rs47;
+	cvt.f32.bf16 	%r207, %rs46;
+	.loc	1 66 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:43
+	mov.b32 	{%rs48, %rs49}, %r18;
+	.loc	1 66 96                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:96
+	cvt.f32.bf16 	%r208, %rs49;
+	cvt.f32.bf16 	%r209, %rs48;
+	.loc	1 65 69                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69
+	mov.b32 	{%rs50, %rs51}, %r17;
+	.loc	1 65 123                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:123
+	cvt.f32.bf16 	%r210, %rs51;
+	cvt.f32.bf16 	%r211, %rs50;
+	.loc	1 65 69                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69
+	mov.b32 	{%rs52, %rs53}, %r16;
+	.loc	1 65 123                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:123
+	cvt.f32.bf16 	%r212, %rs53;
+	cvt.f32.bf16 	%r213, %rs52;
+	.loc	1 126 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24
+	bar.sync 	0;
+	st.shared.b32 	[%r129], %r157;
+	bar.sync 	0;
+	ld.shared.b32 	%r214, [%r131];
+	mul.f32 	%r215, %r214, %r161;
+	mul.f32 	%r216, %r214, %r160;
+	mul.f32 	%r217, %r214, %r159;
+	mul.f32 	%r218, %r214, %r158;
+	.loc	1 127 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:35
+	add.s64 	%rd70, %rd86, %rd94;
+	add.s64 	%rd54, %rd70, 2;
+	add.s64 	%rd56, %rd70, 66;
+	add.s64 	%rd58, %rd70, 130;
+	add.s64 	%rd60, %rd86, %rd95;
+	.loc	1 127 85                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:85
+	// begin inline asm
+	mov.u64 %rd55, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd55, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs22, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs22 }, [ %rd54 + 0 ], %rd55;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd57, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd57, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs23, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs23 }, [ %rd56 + 0 ], %rd57;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd59, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd59, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs24, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs24 }, [ %rd58 + 0 ], %rd59;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd61, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd61, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs25, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs25 }, [ %rd60 + 0 ], %rd61;
+	// end inline asm
+	.loc	1 127 146                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:146
+	cvt.f32.bf16 	%r219, %rs22;
+	cvt.f32.bf16 	%r220, %rs23;
+	cvt.f32.bf16 	%r221, %rs24;
+	cvt.f32.bf16 	%r222, %rs25;
+	.loc	1 131 17                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:131:17
+	neg.f32 	%r223, %r215;
+	fma.rn.f32 	%r224, %r223, %r219, 0f00000000;
+	neg.f32 	%r225, %r216;
+	fma.rn.f32 	%r226, %r225, %r220, 0f00000000;
+	neg.f32 	%r227, %r217;
+	fma.rn.f32 	%r228, %r227, %r221, 0f00000000;
+	neg.f32 	%r229, %r218;
+	fma.rn.f32 	%r230, %r229, %r222, 0f00000000;
+	.loc	1 134 60                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:60
+	add.s32 	%r231, %r133, 4096;
+	add.s32 	%r232, %r133, 4128;
+	add.s32 	%r233, %r133, 4160;
+	add.s32 	%r234, %r133, 4192;
+	.loc	1 134 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:35
+	mad.wide.s32 	%rd62, %r231, 2, %rd82;
+	mad.wide.s32 	%rd64, %r232, 2, %rd82;
+	mad.wide.s32 	%rd66, %r233, 2, %rd82;
+	mad.wide.s32 	%rd68, %r234, 2, %rd82;
+	.loc	1 134 71                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:71
+	// begin inline asm
+	mov.u64 %rd63, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd63, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs26, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs26 }, [ %rd62 + 0 ], %rd63;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd65, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd65, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs27, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs27 }, [ %rd64 + 0 ], %rd65;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd67, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd67, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs28, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs28 }, [ %rd66 + 0 ], %rd67;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd69, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd69, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs29, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs29 }, [ %rd68 + 0 ], %rd69;
+	// end inline asm
+	.loc	1 134 132                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:132
+	cvt.f32.bf16 	%r235, %rs26;
+	cvt.f32.bf16 	%r236, %rs27;
+	cvt.f32.bf16 	%r237, %rs28;
+	cvt.f32.bf16 	%r238, %rs29;
+	.loc	1 139 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:139:24
+	mul.f32 	%r239, %r214, %r235;
+	mul.f32 	%r240, %r214, %r236;
+	mul.f32 	%r241, %r214, %r237;
+	mul.f32 	%r242, %r214, %r238;
+	.loc	1 140 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:35
+	add.s64 	%rd72, %rd70, 64;
+	add.s64 	%rd74, %rd70, 128;
+	add.s64 	%rd76, %rd70, 192;
+	.loc	1 140 81                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:81
+	// begin inline asm
+	mov.u64 %rd71, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd71, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs30, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs30 }, [ %rd70 + 0 ], %rd71;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd73, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd73, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs31, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs31 }, [ %rd72 + 0 ], %rd73;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd75, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd75, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs32, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs32 }, [ %rd74 + 0 ], %rd75;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd77, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd77, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs33, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs33 }, [ %rd76 + 0 ], %rd77;
+	// end inline asm
+	.loc	1 140 142                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:142
+	cvt.f32.bf16 	%r243, %rs30;
+	cvt.f32.bf16 	%r244, %rs31;
+	cvt.f32.bf16 	%r245, %rs32;
+	cvt.f32.bf16 	%r246, %rs33;
+	.loc	1 142 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:142:24
+	mul.f32 	%r247, %r239, %r243;
+	mul.f32 	%r248, %r240, %r244;
+	mul.f32 	%r249, %r241, %r245;
+	mul.f32 	%r250, %r242, %r246;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r251, %r247, %r224, %p3;
+	selp.f32 	%r252, %r248, %r226, %p3;
+	selp.f32 	%r253, %r249, %r228, %p3;
+	selp.f32 	%r254, %r250, %r230, %p3;
+	.loc	1 151 25                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:151:25
+	mul.f32 	%r255, %r157, %r213;
+	mul.f32 	%r256, %r157, %r212;
+	mul.f32 	%r257, %r157, %r211;
+	mul.f32 	%r258, %r157, %r210;
+	.loc	1 153 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:153:26
+	mul.f32 	%r259, %r255, %r209;
+	mul.f32 	%r260, %r256, %r208;
+	mul.f32 	%r261, %r257, %r207;
+	mul.f32 	%r262, %r258, %r206;
+	.loc	1 156 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26
+	mul.f32 	%r263, %r259, %r8;
+	mul.f32 	%r264, %r260, %r9;
+	mul.f32 	%r265, %r261, %r10;
+	mul.f32 	%r266, %r262, %r11;
+	bar.sync 	0;
+	st.shared.b32 	[%r95], %r263;
+	st.shared.b32 	[%r97+1024], %r264;
+	st.shared.b32 	[%r99+2048], %r265;
+	st.shared.b32 	[%r101+3072], %r266;
+	bar.sync 	0;
+	ld.shared.b32 	%r267, [%r105];
+	ld.shared.b32 	%r268, [%r108];
+	ld.shared.b32 	%r269, [%r111];
+	ld.shared.b32 	%r270, [%r114];
+	.loc	1 159 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:159:26
+	fma.rn.f32 	%r271, %r106, %r251, %r267;
+	fma.rn.f32 	%r272, %r109, %r252, %r268;
+	fma.rn.f32 	%r273, %r112, %r253, %r269;
+	fma.rn.f32 	%r274, %r115, %r254, %r270;
+	.loc	1 161 43                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:43
+	shl.b32 	%r275, %r30, 7;
+	.loc	1 161 39                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:39
+	or.b32 	%r276, %r275, %r33;
+	.loc	1 161 32                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:32
+	mul.wide.s32 	%rd96, %r276, 2;
+	add.s64 	%rd78, %rd80, %rd96;
+	.loc	1 161 55                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:55
+	cvt.rn.bf16.f32 	%rs54, %r205;
+	cvt.rn.bf16.f32 	%rs55, %r194;
+	cvt.rn.bf16.f32 	%rs56, %r183;
+	cvt.rn.bf16.f32 	%rs57, %r172;
+	bar.sync 	0;
+	and.b32 	%r277, %r26, 15;
+	shl.b32 	%r278, %r277, 7;
+	shl.b32 	%r279, %r277, 3;
+	and.b32 	%r280, %r34, 24;
+	shr.u32 	%r281, %r26, 2;
+	and.b32 	%r282, %r281, 4;
+	shr.u32 	%r283, %r26, 4;
+	and.b32 	%r284, %r283, 2;
+	or.b32 	%r285, %r278, %r282;
+	or.b32 	%r286, %r285, %r284;
+	xor.b32 	%r287, %r279, %r280;
+	or.b32 	%r288, %r286, %r287;
+	add.s32 	%r289, %r94, %r288;
+	st.shared.b16 	[%r289], %rs54;
+	xor.b32 	%r290, %r288, 32;
+	add.s32 	%r291, %r94, %r290;
+	st.shared.b16 	[%r291], %rs55;
+	xor.b32 	%r292, %r288, 64;
+	add.s32 	%r293, %r94, %r292;
+	st.shared.b16 	[%r293], %rs56;
+	xor.b32 	%r294, %r288, 96;
+	add.s32 	%r295, %r94, %r294;
+	st.shared.b16 	[%r295], %rs57;
+	bar.sync 	0;
+	shl.b32 	%r296, %r26, 2;
+	and.b32 	%r297, %r296, 1016;
+	shr.u32 	%r298, %r27, 2;
+	shl.b32 	%r299, %r26, 1;
+	and.b32 	%r300, %r299, 2;
+	xor.b32 	%r301, %r297, %r298;
+	or.b32 	%r302, %r301, %r300;
+	add.s32 	%r303, %r94, %r302;
+	ld.shared.b16 	%rs58, [%r303];
+	ld.shared.b16 	%rs59, [%r303+4];
+	xor.b32 	%r304, %r302, 64;
+	add.s32 	%r305, %r94, %r304;
+	ld.shared.b16 	%rs60, [%r305+1024];
+	ld.shared.b16 	%rs61, [%r305+1028];
+	mov.b32 	%r20, {%rs58, %rs60};
+	mov.b32 	%r21, {%rs59, %rs61};
+	// begin inline asm
+	@%p1 st.global.v2.b32 [ %rd78 + 0 ], { %r20, %r21 };
+	// end inline asm
+	.loc	1 162 32                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:32
+	add.s64 	%rd79, %rd81, %rd96;
+	.loc	1 162 56                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:56
+	cvt.rn.bf16.f32 	%rs62, %r271;
+	cvt.rn.bf16.f32 	%rs63, %r272;
+	cvt.rn.bf16.f32 	%rs64, %r273;
+	cvt.rn.bf16.f32 	%rs65, %r274;
+	bar.sync 	0;
+	st.shared.b16 	[%r289], %rs62;
+	st.shared.b16 	[%r291], %rs63;
+	st.shared.b16 	[%r293], %rs64;
+	st.shared.b16 	[%r295], %rs65;
+	bar.sync 	0;
+	ld.shared.b16 	%rs66, [%r303];
+	ld.shared.b16 	%rs67, [%r303+4];
+	ld.shared.b16 	%rs68, [%r305+1024];
+	ld.shared.b16 	%rs69, [%r305+1028];
+	mov.b32 	%r22, {%rs66, %rs68};
+	mov.b32 	%r23, {%rs67, %rs69};
+	// begin inline asm
+	@%p1 st.global.v2.b32 [ %rd79 + 0 ], { %r22, %r23 };
+	// end inline asm
+	.loc	1 53 4                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:4
+	ret;
+$L__tmp24:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 5                                   // DW_FORM_data2
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 456                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x1c1 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 98
+.b8 118
+.b8 113
+.b8 104
+.b8 106
+.b8 116
+.b8 121
+.b8 103
+.b8 55
+.b8 102
+.b8 118
+.b8 120
+.b8 122
+.b8 119
+.b8 116
+.b8 98
+.b8 116
+.b8 116
+.b8 52
+.b8 118
+.b8 114
+.b8 100
+.b8 107
+.b8 98
+.b8 110
+.b8 98
+.b8 54
+.b8 110
+.b8 51
+.b8 50
+.b8 102
+.b8 110
+.b8 114
+.b8 105
+.b8 106
+.b8 106
+.b8 112
+.b8 108
+.b8 51
+.b8 118
+.b8 118
+.b8 52
+.b8 99
+.b8 102
+.b8 113
+.b8 100
+.b8 52
+.b8 109
+.b8 122
+.b8 110
+.b8 114
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 98
+.b8 118
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x6d DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 114
+.b8 109
+.b8 115
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 95
+.b8 116
+.b8 111
+.b8 95
+.b8 99
+.b8 111
+.b8 112
+.b8 121
+.b8 95
+.b8 97
+.b8 100
+.b8 100
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 110
+.b8 101
+.b8 103
+.b8 95
+.b8 115
+.b8 112
+.b8 108
+.b8 105
+.b8 116
+.b8 95
+.b8 115
+.b8 112
+.b8 108
+.b8 105
+.b8 116
+.b8 95
+.b8 119
+.b8 105
+.b8 116
+.b8 104
+.b8 95
+.b8 115
+.b8 105
+.b8 122
+.b8 101
+.b8 115
+.b8 95
+.b8 115
+.b8 116
+.b8 97
+.b8 99
+.b8 107
+.b8 95
+.b8 117
+.b8 110
+.b8 98
+.b8 105
+.b8 110
+.b8 100
+.b8 95
+.b8 117
+.b8 110
+.b8 115
+.b8 113
+.b8 117
+.b8 101
+.b8 101
+.b8 122
+.b8 101
+.b8 95
+.b8 118
+.b8 105
+.b8 101
+.b8 119
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x151:0x7a DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x166:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp12                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 51                                  // DW_AT_call_line
+.b8 25                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x17e:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp12                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 4                                   // Abbrev [4] 0x198:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp12                          // DW_AT_low_pc
+.b64 $L__tmp23                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 52                                  // DW_AT_call_line
+.b8 27                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x1b0:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp12                          // DW_AT_low_pc
+.b64 $L__tmp23                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..35b704f94f17e6e09792453584cc30d2e765f42b
--- /dev/null
+++ b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source
@@ -0,0 +1,972 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc213 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0)
+#loc215 = loc(unknown)
+#loc218 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0)
+#loc222 = loc("in_out_ptr0"(#loc))
+#loc223 = loc("in_out_ptr1"(#loc))
+#loc224 = loc("in_ptr0"(#loc))
+#loc225 = loc("in_ptr1"(#loc))
+#loc226 = loc("in_ptr2"(#loc))
+#loc227 = loc("in_ptr3"(#loc))
+#loc228 = loc("in_ptr4"(#loc))
+#loc229 = loc("xnumel"(#loc))
+#loc230 = loc("r0_numel"(#loc))
+#loc432 = loc("input"(#loc213))
+#loc433 = loc("a"(#loc218))
+#loc434 = loc("b"(#loc218))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 73728 : i32 loc(#loc231)
+    %r0_numel_1 = arith.constant 128 : i32 loc(#loc232)
+    %xoffset = tt.get_program_id x : i32 loc(#loc233)
+    %xoffset_2 = arith.constant 8 : i32 loc(#loc234)
+    %xoffset_3 = arith.constant 8 : i32 loc(#loc234)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc234)
+    %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc235)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc236)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<8x1xi32> loc(#loc237)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<8x1xi32> loc(#loc237)
+    %xmask = arith.constant true loc(#loc238)
+    %xmask_8 = arith.constant dense<true> : tensor<8x128xi1> loc(#loc238)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc239)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc240)
+    %x0 = arith.constant 32 : i32 loc(#loc241)
+    %x0_10 = arith.constant 32 : i32 loc(#loc241)
+    %x0_11 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc241)
+    %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<8x1xi32> loc(#loc241)
+    %x1 = arith.constant 32 : i32 loc(#loc242)
+    %x1_13 = arith.constant 32 : i32 loc(#loc242)
+    %x1_14 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc242)
+    %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<8x1xi32> loc(#loc242)
+    %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc243)
+    %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc243)
+    %_tmp10 = arith.constant 0.000000e+00 : f32 loc(#loc244)
+    %_tmp10_17 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc244)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc15)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc15)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15)
+    %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc15)
+    %3 = ub.poison : i32 loc(#loc15)
+    %_tmp10_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_16, %_tmp10_24 = %_tmp10_17) -> (tensor<8x128xf32>, tensor<8x128xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc246)
+      %r0_index_25 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc246)
+      %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc247)
+      %r0_mask_26 = arith.cmpi slt, %r0_index_25, %r0_mask : tensor<1x128xi32> loc(#loc247)
+      %tmp0 = arith.constant 4096 : i32 loc(#loc248)
+      %tmp0_27 = arith.constant 4096 : i32 loc(#loc248)
+      %tmp0_28 = arith.constant dense<4096> : tensor<1x128xi32> loc(#loc248)
+      %tmp0_29 = arith.addi %tmp0_28, %r0_index_25 : tensor<1x128xi32> loc(#loc248)
+      %tmp0_30 = arith.constant 128 : i32 loc(#loc249)
+      %tmp0_31 = arith.constant 128 : i32 loc(#loc249)
+      %tmp0_32 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc249)
+      %tmp0_33 = arith.muli %tmp0_32, %x0_12 : tensor<8x1xi32> loc(#loc249)
+      %tmp0_34 = tt.broadcast %tmp0_29 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc250)
+      %tmp0_35 = tt.broadcast %tmp0_33 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc250)
+      %tmp0_36 = arith.addi %tmp0_34, %tmp0_35 : tensor<8x128xi32> loc(#loc250)
+      %tmp0_37 = arith.constant 36864 : i32 loc(#loc251)
+      %tmp0_38 = arith.constant 36864 : i32 loc(#loc251)
+      %tmp0_39 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc251)
+      %tmp0_40 = arith.muli %tmp0_39, %x1_15 : tensor<8x1xi32> loc(#loc251)
+      %tmp0_41 = tt.broadcast %tmp0_40 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc252)
+      %tmp0_42 = arith.addi %tmp0_36, %tmp0_41 : tensor<8x128xi32> loc(#loc252)
+      %tmp0_43 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc253)
+      %tmp0_44 = tt.addptr %tmp0_43, %tmp0_42 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc253)
+      %tmp0_45 = arith.constant 0.000000e+00 : f32 loc(#loc254)
+      %tmp0_46 = tt.broadcast %r0_mask_26 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc254)
+      %tmp0_47 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc254)
+      %tmp0_48 = arith.truncf %tmp0_47 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc254)
+      %tmp0_49 = tt.load %tmp0_44, %tmp0_46, %tmp0_48 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc254)
+      %tmp0_50 = arith.extf %tmp0_49 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc255)
+      %tmp6 = arith.constant 128 : i32 loc(#loc256)
+      %tmp6_51 = arith.constant 128 : i32 loc(#loc256)
+      %tmp6_52 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc256)
+      %tmp6_53 = arith.muli %tmp6_52, %x0_12 : tensor<8x1xi32> loc(#loc256)
+      %tmp6_54 = tt.broadcast %r0_index_25 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc257)
+      %tmp6_55 = tt.broadcast %tmp6_53 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc257)
+      %tmp6_56 = arith.addi %tmp6_54, %tmp6_55 : tensor<8x128xi32> loc(#loc257)
+      %tmp6_57 = arith.constant 36864 : i32 loc(#loc258)
+      %tmp6_58 = arith.constant 36864 : i32 loc(#loc258)
+      %tmp6_59 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc258)
+      %tmp6_60 = arith.muli %tmp6_59, %x1_15 : tensor<8x1xi32> loc(#loc258)
+      %tmp6_61 = tt.broadcast %tmp6_60 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc259)
+      %tmp6_62 = arith.addi %tmp6_56, %tmp6_61 : tensor<8x128xi32> loc(#loc259)
+      %tmp6_63 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc260)
+      %tmp6_64 = tt.addptr %tmp6_63, %tmp6_62 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc260)
+      %tmp6_65 = arith.constant 0.000000e+00 : f32 loc(#loc261)
+      %tmp6_66 = tt.broadcast %r0_mask_26 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc261)
+      %tmp6_67 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc261)
+      %tmp6_68 = arith.truncf %tmp6_67 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc261)
+      %tmp6_69 = tt.load %tmp6_64, %tmp6_66, %tmp6_68 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc261)
+      %tmp6_70 = arith.extf %tmp6_69 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc262)
+      %tmp2 = arith.mulf %tmp0_50, %tmp0_50 : tensor<8x128xf32> loc(#loc263)
+      %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<8x128xf32> loc(#loc264)
+      %_tmp4_71 = tt.broadcast %r0_mask_26 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc265)
+      %_tmp4_72 = arith.select %_tmp4_71, %tmp5, %_tmp4_23 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc265)
+      %tmp8 = arith.mulf %tmp6_70, %tmp6_70 : tensor<8x128xf32> loc(#loc266)
+      %tmp11 = arith.addf %_tmp10_24, %tmp8 : tensor<8x128xf32> loc(#loc267)
+      %_tmp10_73 = tt.broadcast %r0_mask_26 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc268)
+      %_tmp10_74 = arith.select %_tmp10_73, %tmp11, %_tmp10_24 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc268)
+      scf.yield %_tmp4_72, %_tmp10_74 : tensor<8x128xf32>, tensor<8x128xf32> loc(#loc39)
+    } loc(#loc435)
+    %tmp4 = tt.call @"triton.language.standard.sum__fp32S8_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#0) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc269)
+    %tmp4_19 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc270)
+    %tmp10 = tt.call @"triton.language.standard.sum__fp32S8_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#1) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc271)
+    %tmp10_20 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc272)
+    %c0_i32_21 = arith.constant 0 : i32 loc(#loc44)
+    %c128_i32_22 = arith.constant 128 : i32 loc(#loc44)
+    %4 = arith.bitcast %c0_i32_21 : i32 to i32 loc(#loc44)
+    %5 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc44)
+    %6 = arith.bitcast %c128_i32_22 : i32 to i32 loc(#loc44)
+    %7 = ub.poison : i32 loc(#loc44)
+    scf.for %r0_offset = %4 to %5 step %6  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc273)
+      %r0_index_23 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc273)
+      %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc274)
+      %r0_mask_24 = arith.cmpi slt, %r0_index_23, %r0_mask : tensor<1x128xi32> loc(#loc274)
+      %r0_3 = arith.constant 2 : i32 loc(#loc275)
+      %r0_3_25 = arith.constant 2 : i32 loc(#loc275)
+      %r0_3_26 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc275)
+      %r0_3_27 = arith.remsi %r0_index_23, %r0_3_26 : tensor<1x128xi32> loc(#loc275)
+      %r0_4 = arith.constant 2 : i32 loc(#loc276)
+      %r0_4_28 = arith.constant 2 : i32 loc(#loc276)
+      %r0_4_29 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc276)
+      %r0_4_30 = arith.divsi %r0_index_23, %r0_4_29 : tensor<1x128xi32> loc(#loc276)
+      %tmp50 = arith.constant 128 : i32 loc(#loc277)
+      %tmp50_31 = arith.constant 128 : i32 loc(#loc277)
+      %tmp50_32 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc277)
+      %tmp50_33 = arith.muli %tmp50_32, %x0_12 : tensor<8x1xi32> loc(#loc277)
+      %tmp50_34 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc278)
+      %tmp50_35 = tt.broadcast %tmp50_33 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc278)
+      %tmp50_36 = arith.addi %tmp50_34, %tmp50_35 : tensor<8x128xi32> loc(#loc278)
+      %tmp50_37 = arith.constant 36864 : i32 loc(#loc279)
+      %tmp50_38 = arith.constant 36864 : i32 loc(#loc279)
+      %tmp50_39 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc279)
+      %tmp50_40 = arith.muli %tmp50_39, %x1_15 : tensor<8x1xi32> loc(#loc279)
+      %tmp50_41 = tt.broadcast %tmp50_40 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc280)
+      %tmp50_42 = arith.addi %tmp50_36, %tmp50_41 : tensor<8x128xi32> loc(#loc280)
+      %tmp50_43 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc281)
+      %tmp50_44 = tt.addptr %tmp50_43, %tmp50_42 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc281)
+      %tmp50_45 = arith.constant 0.000000e+00 : f32 loc(#loc282)
+      %tmp50_46 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc282)
+      %tmp50_47 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc282)
+      %tmp50_48 = arith.truncf %tmp50_47 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc282)
+      %tmp50_49 = tt.load %tmp50_44, %tmp50_46, %tmp50_48 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc282)
+      %tmp50_50 = arith.extf %tmp50_49 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc283)
+      %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>> loc(#loc284)
+      %tmp58_51 = tt.addptr %tmp58, %r0_index_23 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc284)
+      %tmp58_52 = arith.constant 0.000000e+00 : f32 loc(#loc285)
+      %tmp58_53 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> loc(#loc285)
+      %tmp58_54 = arith.truncf %tmp58_53 : tensor<1x128xf32> to tensor<1x128xbf16> loc(#loc285)
+      %tmp58_55 = tt.load %tmp58_51, %r0_mask_24, %tmp58_54 evictionPolicy = evict_last : tensor<1x128x!tt.ptr<bf16>> loc(#loc285)
+      %tmp58_56 = arith.extf %tmp58_55 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc286)
+      %tmp63 = arith.constant 128 : i32 loc(#loc287)
+      %tmp63_57 = arith.constant 128 : i32 loc(#loc287)
+      %tmp63_58 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc287)
+      %tmp63_59 = arith.muli %tmp63_58, %x1_15 : tensor<8x1xi32> loc(#loc287)
+      %tmp63_60 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc288)
+      %tmp63_61 = tt.broadcast %tmp63_59 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc288)
+      %tmp63_62 = arith.addi %tmp63_60, %tmp63_61 : tensor<8x128xi32> loc(#loc288)
+      %tmp63_63 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<8x128x!tt.ptr<f32>> loc(#loc289)
+      %tmp63_64 = tt.addptr %tmp63_63, %tmp63_62 : tensor<8x128x!tt.ptr<f32>>, tensor<8x128xi32> loc(#loc289)
+      %tmp63_65 = arith.constant 0.000000e+00 : f32 loc(#loc290)
+      %tmp63_66 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc290)
+      %tmp63_67 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc290)
+      %tmp63_68 = tt.load %tmp63_64, %tmp63_66, %tmp63_67 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<f32>> loc(#loc290)
+      %tmp66 = arith.constant 128 : i32 loc(#loc291)
+      %tmp66_69 = arith.constant 128 : i32 loc(#loc291)
+      %tmp66_70 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc291)
+      %tmp66_71 = arith.muli %tmp66_70, %x1_15 : tensor<8x1xi32> loc(#loc291)
+      %tmp66_72 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc292)
+      %tmp66_73 = tt.broadcast %tmp66_71 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc292)
+      %tmp66_74 = arith.addi %tmp66_72, %tmp66_73 : tensor<8x128xi32> loc(#loc292)
+      %tmp66_75 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<8x128x!tt.ptr<f32>> loc(#loc293)
+      %tmp66_76 = tt.addptr %tmp66_75, %tmp66_74 : tensor<8x128x!tt.ptr<f32>>, tensor<8x128xi32> loc(#loc293)
+      %tmp66_77 = arith.constant 0.000000e+00 : f32 loc(#loc294)
+      %tmp66_78 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc294)
+      %tmp66_79 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc294)
+      %tmp66_80 = tt.load %tmp66_76, %tmp66_78, %tmp66_79 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<f32>> loc(#loc294)
+      %tmp96 = arith.constant 4096 : i32 loc(#loc295)
+      %tmp96_81 = arith.constant 4096 : i32 loc(#loc295)
+      %tmp96_82 = arith.constant dense<4096> : tensor<1x128xi32> loc(#loc295)
+      %tmp96_83 = arith.addi %tmp96_82, %r0_index_23 : tensor<1x128xi32> loc(#loc295)
+      %tmp96_84 = arith.constant 128 : i32 loc(#loc296)
+      %tmp96_85 = arith.constant 128 : i32 loc(#loc296)
+      %tmp96_86 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc296)
+      %tmp96_87 = arith.muli %tmp96_86, %x0_12 : tensor<8x1xi32> loc(#loc296)
+      %tmp96_88 = tt.broadcast %tmp96_83 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc297)
+      %tmp96_89 = tt.broadcast %tmp96_87 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc297)
+      %tmp96_90 = arith.addi %tmp96_88, %tmp96_89 : tensor<8x128xi32> loc(#loc297)
+      %tmp96_91 = arith.constant 36864 : i32 loc(#loc298)
+      %tmp96_92 = arith.constant 36864 : i32 loc(#loc298)
+      %tmp96_93 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc298)
+      %tmp96_94 = arith.muli %tmp96_93, %x1_15 : tensor<8x1xi32> loc(#loc298)
+      %tmp96_95 = tt.broadcast %tmp96_94 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc299)
+      %tmp96_96 = arith.addi %tmp96_90, %tmp96_95 : tensor<8x128xi32> loc(#loc299)
+      %tmp96_97 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc300)
+      %tmp96_98 = tt.addptr %tmp96_97, %tmp96_96 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc300)
+      %tmp96_99 = arith.constant 0.000000e+00 : f32 loc(#loc301)
+      %tmp96_100 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc301)
+      %tmp96_101 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc301)
+      %tmp96_102 = arith.truncf %tmp96_101 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc301)
+      %tmp96_103 = tt.load %tmp96_98, %tmp96_100, %tmp96_102 evictionPolicy = evict_first : tensor<8x128x!tt.ptr<bf16>> loc(#loc301)
+      %tmp96_104 = arith.extf %tmp96_103 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc302)
+      %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>> loc(#loc303)
+      %tmp102_105 = tt.addptr %tmp102, %r0_index_23 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc303)
+      %tmp102_106 = arith.constant 0.000000e+00 : f32 loc(#loc304)
+      %tmp102_107 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> loc(#loc304)
+      %tmp102_108 = arith.truncf %tmp102_107 : tensor<1x128xf32> to tensor<1x128xbf16> loc(#loc304)
+      %tmp102_109 = tt.load %tmp102_105, %r0_mask_24, %tmp102_108 evictionPolicy = evict_last : tensor<1x128x!tt.ptr<bf16>> loc(#loc304)
+      %tmp102_110 = arith.extf %tmp102_109 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc305)
+      %tmp13 = arith.constant 0 : i64 loc(#loc306)
+      %tmp13_111 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc306)
+      %tmp14 = arith.extsi %r0_3_27 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc307)
+      %tmp14_112 = arith.constant dense<0> : tensor<1x128xi64> loc(#loc307)
+      %tmp14_113 = arith.cmpi sge, %tmp14, %tmp14_112 : tensor<1x128xi64> loc(#loc307)
+      %tmp15 = arith.constant 1 : i64 loc(#loc308)
+      %tmp15_114 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc308)
+      %tmp16 = arith.extsi %r0_3_27 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc309)
+      %tmp16_115 = arith.constant dense<1> : tensor<1x128xi64> loc(#loc309)
+      %tmp16_116 = arith.cmpi slt, %tmp16, %tmp16_115 : tensor<1x128xi64> loc(#loc309)
+      %tmp17 = arith.constant 2 : i32 loc(#loc310)
+      %tmp17_117 = arith.constant 2 : i32 loc(#loc310)
+      %tmp17_118 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc310)
+      %tmp17_119 = arith.muli %tmp17_118, %r0_4_30 : tensor<1x128xi32> loc(#loc310)
+      %tmp17_120 = arith.constant 1 : i32 loc(#loc311)
+      %tmp17_121 = arith.constant 1 : i32 loc(#loc311)
+      %tmp17_122 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc311)
+      %tmp17_123 = arith.addi %tmp17_122, %tmp17_119 : tensor<1x128xi32> loc(#loc311)
+      %tmp17_124 = arith.constant 128 : i32 loc(#loc312)
+      %tmp17_125 = arith.constant 128 : i32 loc(#loc312)
+      %tmp17_126 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc312)
+      %tmp17_127 = arith.muli %tmp17_126, %x0_12 : tensor<8x1xi32> loc(#loc312)
+      %tmp17_128 = tt.broadcast %tmp17_123 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc313)
+      %tmp17_129 = tt.broadcast %tmp17_127 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc313)
+      %tmp17_130 = arith.addi %tmp17_128, %tmp17_129 : tensor<8x128xi32> loc(#loc313)
+      %tmp17_131 = arith.constant 36864 : i32 loc(#loc314)
+      %tmp17_132 = arith.constant 36864 : i32 loc(#loc314)
+      %tmp17_133 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc314)
+      %tmp17_134 = arith.muli %tmp17_133, %x1_15 : tensor<8x1xi32> loc(#loc314)
+      %tmp17_135 = tt.broadcast %tmp17_134 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc315)
+      %tmp17_136 = arith.addi %tmp17_130, %tmp17_135 : tensor<8x128xi32> loc(#loc315)
+      %tmp17_137 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc316)
+      %tmp17_138 = tt.addptr %tmp17_137, %tmp17_136 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc316)
+      %tmp17_139 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x128xi1> loc(#loc317)
+      %tmp17_140 = arith.constant 0.000000e+00 : f32 loc(#loc318)
+      %tmp17_141 = tt.broadcast %tmp17_139 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc318)
+      %tmp17_142 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc318)
+      %tmp17_143 = arith.truncf %tmp17_142 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc318)
+      %tmp17_144 = tt.load %tmp17_138, %tmp17_141, %tmp17_143 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc318)
+      %tmp17_145 = arith.extf %tmp17_144 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc319)
+      %tmp19 = arith.constant 1.280000e+02 : f32 loc(#loc320)
+      %tmp20 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc321)
+      %tmp20_146 = arith.divf %tmp10_20, %tmp20 : tensor<8x1xf32> loc(#loc321)
+      %tmp21 = arith.constant 9.99999997E-7 : f32 loc(#loc322)
+      %tmp22 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc323)
+      %tmp22_147 = arith.addf %tmp20_146, %tmp22 : tensor<8x1xf32> loc(#loc323)
+      %tmp23 = tt.extern_elementwise %tmp22_147 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc324)
+      %tmp24 = tt.broadcast %tmp23 : tensor<8x1xf32> -> tensor<8x128xf32> loc(#loc325)
+      %tmp24_148 = arith.mulf %tmp17_145, %tmp24 : tensor<8x128xf32> loc(#loc325)
+      %tmp25 = arith.constant 2 : i32 loc(#loc326)
+      %tmp25_149 = arith.constant 2 : i32 loc(#loc326)
+      %tmp25_150 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc326)
+      %tmp25_151 = arith.muli %tmp25_150, %r0_4_30 : tensor<1x128xi32> loc(#loc326)
+      %tmp25_152 = arith.constant 1 : i32 loc(#loc327)
+      %tmp25_153 = arith.constant 1 : i32 loc(#loc327)
+      %tmp25_154 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc327)
+      %tmp25_155 = arith.addi %tmp25_154, %tmp25_151 : tensor<1x128xi32> loc(#loc327)
+      %tmp25_156 = tt.broadcast %tmp25_155 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc328)
+      %tmp25_157 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc329)
+      %tmp25_158 = tt.addptr %tmp25_157, %tmp25_156 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc329)
+      %tmp25_159 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x128xi1> loc(#loc330)
+      %tmp25_160 = arith.constant 0.000000e+00 : f32 loc(#loc331)
+      %tmp25_161 = tt.broadcast %tmp25_159 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc331)
+      %tmp25_162 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc331)
+      %tmp25_163 = arith.truncf %tmp25_162 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc331)
+      %tmp25_164 = tt.load %tmp25_158, %tmp25_161, %tmp25_163 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc331)
+      %tmp25_165 = arith.extf %tmp25_164 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc332)
+      %tmp27 = arith.mulf %tmp24_148, %tmp25_165 : tensor<8x128xf32> loc(#loc333)
+      %tmp29 = arith.constant 0.000000e+00 : f32 loc(#loc334)
+      %tmp29_166 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc334)
+      %tmp29_167 = arith.subf %tmp29_166, %tmp27 : tensor<8x128xf32> loc(#loc334)
+      %tmp30 = arith.constant 0.000000e+00 : f32 loc(#loc335)
+      %tmp30_168 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc335)
+      %tmp31 = tt.broadcast %tmp16_116 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc336)
+      %tmp31_169 = arith.select %tmp31, %tmp29_167, %tmp30_168 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc336)
+      %tmp32 = arith.extsi %r0_3_27 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc337)
+      %tmp32_170 = arith.constant dense<1> : tensor<1x128xi64> loc(#loc337)
+      %tmp32_171 = arith.cmpi sge, %tmp32, %tmp32_170 : tensor<1x128xi64> loc(#loc337)
+      %tmp33 = arith.constant 2 : i64 loc(#loc338)
+      %tmp33_172 = arith.constant dense<2> : tensor<1x1xi64> loc(#loc338)
+      %tmp34 = arith.extsi %r0_3_27 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc339)
+      %tmp34_173 = arith.constant dense<2> : tensor<1x128xi64> loc(#loc339)
+      %tmp34_174 = arith.cmpi slt, %tmp34, %tmp34_173 : tensor<1x128xi64> loc(#loc339)
+      %tmp35 = arith.constant 2 : i32 loc(#loc340)
+      %tmp35_175 = arith.constant 2 : i32 loc(#loc340)
+      %tmp35_176 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc340)
+      %tmp35_177 = arith.muli %tmp35_176, %r0_4_30 : tensor<1x128xi32> loc(#loc340)
+      %tmp35_178 = arith.constant 128 : i32 loc(#loc341)
+      %tmp35_179 = arith.constant 128 : i32 loc(#loc341)
+      %tmp35_180 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc341)
+      %tmp35_181 = arith.muli %tmp35_180, %x0_12 : tensor<8x1xi32> loc(#loc341)
+      %tmp35_182 = tt.broadcast %tmp35_177 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc342)
+      %tmp35_183 = tt.broadcast %tmp35_181 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc342)
+      %tmp35_184 = arith.addi %tmp35_182, %tmp35_183 : tensor<8x128xi32> loc(#loc342)
+      %tmp35_185 = arith.constant 36864 : i32 loc(#loc343)
+      %tmp35_186 = arith.constant 36864 : i32 loc(#loc343)
+      %tmp35_187 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc343)
+      %tmp35_188 = arith.muli %tmp35_187, %x1_15 : tensor<8x1xi32> loc(#loc343)
+      %tmp35_189 = tt.broadcast %tmp35_188 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc344)
+      %tmp35_190 = arith.addi %tmp35_184, %tmp35_189 : tensor<8x128xi32> loc(#loc344)
+      %tmp35_191 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc345)
+      %tmp35_192 = tt.addptr %tmp35_191, %tmp35_190 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc345)
+      %tmp35_193 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x128xi1> loc(#loc346)
+      %tmp35_194 = arith.constant 0.000000e+00 : f32 loc(#loc347)
+      %tmp35_195 = tt.broadcast %tmp35_193 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc347)
+      %tmp35_196 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc347)
+      %tmp35_197 = arith.truncf %tmp35_196 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc347)
+      %tmp35_198 = tt.load %tmp35_192, %tmp35_195, %tmp35_197 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc347)
+      %tmp35_199 = arith.extf %tmp35_198 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc348)
+      %tmp37 = arith.constant 1.280000e+02 : f32 loc(#loc349)
+      %tmp38 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc350)
+      %tmp38_200 = arith.divf %tmp10_20, %tmp38 : tensor<8x1xf32> loc(#loc350)
+      %tmp39 = arith.constant 9.99999997E-7 : f32 loc(#loc351)
+      %tmp40 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc352)
+      %tmp40_201 = arith.addf %tmp38_200, %tmp40 : tensor<8x1xf32> loc(#loc352)
+      %tmp41 = tt.extern_elementwise %tmp40_201 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc353)
+      %tmp42 = tt.broadcast %tmp41 : tensor<8x1xf32> -> tensor<8x128xf32> loc(#loc354)
+      %tmp42_202 = arith.mulf %tmp35_199, %tmp42 : tensor<8x128xf32> loc(#loc354)
+      %tmp43 = arith.constant 2 : i32 loc(#loc355)
+      %tmp43_203 = arith.constant 2 : i32 loc(#loc355)
+      %tmp43_204 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc355)
+      %tmp43_205 = arith.muli %tmp43_204, %r0_4_30 : tensor<1x128xi32> loc(#loc355)
+      %tmp43_206 = tt.broadcast %tmp43_205 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc356)
+      %tmp43_207 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc357)
+      %tmp43_208 = tt.addptr %tmp43_207, %tmp43_206 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc357)
+      %tmp43_209 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x128xi1> loc(#loc358)
+      %tmp43_210 = arith.constant 0.000000e+00 : f32 loc(#loc359)
+      %tmp43_211 = tt.broadcast %tmp43_209 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc359)
+      %tmp43_212 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc359)
+      %tmp43_213 = arith.truncf %tmp43_212 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc359)
+      %tmp43_214 = tt.load %tmp43_208, %tmp43_211, %tmp43_213 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc359)
+      %tmp43_215 = arith.extf %tmp43_214 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc360)
+      %tmp45 = arith.mulf %tmp42_202, %tmp43_215 : tensor<8x128xf32> loc(#loc361)
+      %tmp47 = arith.constant 0.000000e+00 : f32 loc(#loc362)
+      %tmp47_216 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc362)
+      %tmp48 = tt.broadcast %tmp32_171 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc363)
+      %tmp48_217 = arith.select %tmp48, %tmp45, %tmp47_216 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc363)
+      %tmp49 = tt.broadcast %tmp16_116 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc364)
+      %tmp49_218 = arith.select %tmp49, %tmp31_169, %tmp48_217 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc364)
+      %tmp52 = arith.constant 1.280000e+02 : f32 loc(#loc365)
+      %tmp53 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc366)
+      %tmp53_219 = arith.divf %tmp10_20, %tmp53 : tensor<8x1xf32> loc(#loc366)
+      %tmp54 = arith.constant 9.99999997E-7 : f32 loc(#loc367)
+      %tmp55 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc368)
+      %tmp55_220 = arith.addf %tmp53_219, %tmp55 : tensor<8x1xf32> loc(#loc368)
+      %tmp56 = tt.extern_elementwise %tmp55_220 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc369)
+      %tmp57 = tt.broadcast %tmp56 : tensor<8x1xf32> -> tensor<8x128xf32> loc(#loc370)
+      %tmp57_221 = arith.mulf %tmp50_50, %tmp57 : tensor<8x128xf32> loc(#loc370)
+      %tmp60 = tt.broadcast %tmp58_56 : tensor<1x128xf32> -> tensor<8x128xf32> loc(#loc371)
+      %tmp60_222 = arith.mulf %tmp57_221, %tmp60 : tensor<8x128xf32> loc(#loc371)
+      %tmp64 = arith.mulf %tmp60_222, %tmp63_68 : tensor<8x128xf32> loc(#loc372)
+      %tmp67 = arith.mulf %tmp49_218, %tmp66_80 : tensor<8x128xf32> loc(#loc373)
+      %tmp68 = arith.addf %tmp64, %tmp67 : tensor<8x128xf32> loc(#loc374)
+      %tmp70 = arith.constant 2 : i32 loc(#loc375)
+      %tmp70_223 = arith.constant 2 : i32 loc(#loc375)
+      %tmp70_224 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc375)
+      %tmp70_225 = arith.muli %tmp70_224, %r0_4_30 : tensor<1x128xi32> loc(#loc375)
+      %tmp70_226 = arith.constant 4097 : i32 loc(#loc376)
+      %tmp70_227 = arith.constant 4097 : i32 loc(#loc376)
+      %tmp70_228 = arith.constant dense<4097> : tensor<1x128xi32> loc(#loc376)
+      %tmp70_229 = arith.addi %tmp70_228, %tmp70_225 : tensor<1x128xi32> loc(#loc376)
+      %tmp70_230 = arith.constant 128 : i32 loc(#loc377)
+      %tmp70_231 = arith.constant 128 : i32 loc(#loc377)
+      %tmp70_232 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc377)
+      %tmp70_233 = arith.muli %tmp70_232, %x0_12 : tensor<8x1xi32> loc(#loc377)
+      %tmp70_234 = tt.broadcast %tmp70_229 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc378)
+      %tmp70_235 = tt.broadcast %tmp70_233 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc378)
+      %tmp70_236 = arith.addi %tmp70_234, %tmp70_235 : tensor<8x128xi32> loc(#loc378)
+      %tmp70_237 = arith.constant 36864 : i32 loc(#loc379)
+      %tmp70_238 = arith.constant 36864 : i32 loc(#loc379)
+      %tmp70_239 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc379)
+      %tmp70_240 = arith.muli %tmp70_239, %x1_15 : tensor<8x1xi32> loc(#loc379)
+      %tmp70_241 = tt.broadcast %tmp70_240 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc380)
+      %tmp70_242 = arith.addi %tmp70_236, %tmp70_241 : tensor<8x128xi32> loc(#loc380)
+      %tmp70_243 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc381)
+      %tmp70_244 = tt.addptr %tmp70_243, %tmp70_242 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc381)
+      %tmp70_245 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x128xi1> loc(#loc382)
+      %tmp70_246 = arith.constant 0.000000e+00 : f32 loc(#loc383)
+      %tmp70_247 = tt.broadcast %tmp70_245 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc383)
+      %tmp70_248 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc383)
+      %tmp70_249 = arith.truncf %tmp70_248 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc383)
+      %tmp70_250 = tt.load %tmp70_244, %tmp70_247, %tmp70_249 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc383)
+      %tmp70_251 = arith.extf %tmp70_250 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc384)
+      %tmp72 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc385)
+      %tmp72_252 = arith.divf %tmp4_19, %tmp72 : tensor<8x1xf32> loc(#loc385)
+      %tmp73 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc386)
+      %tmp73_253 = arith.addf %tmp72_252, %tmp73 : tensor<8x1xf32> loc(#loc386)
+      %tmp74 = tt.extern_elementwise %tmp73_253 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc387)
+      %tmp75 = tt.broadcast %tmp74 : tensor<8x1xf32> -> tensor<8x128xf32> loc(#loc388)
+      %tmp75_254 = arith.mulf %tmp70_251, %tmp75 : tensor<8x128xf32> loc(#loc388)
+      %tmp76 = arith.constant 2 : i32 loc(#loc389)
+      %tmp76_255 = arith.constant 2 : i32 loc(#loc389)
+      %tmp76_256 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc389)
+      %tmp76_257 = arith.muli %tmp76_256, %r0_4_30 : tensor<1x128xi32> loc(#loc389)
+      %tmp76_258 = arith.constant 1 : i32 loc(#loc390)
+      %tmp76_259 = arith.constant 1 : i32 loc(#loc390)
+      %tmp76_260 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc390)
+      %tmp76_261 = arith.addi %tmp76_260, %tmp76_257 : tensor<1x128xi32> loc(#loc390)
+      %tmp76_262 = tt.broadcast %tmp76_261 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc391)
+      %tmp76_263 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc392)
+      %tmp76_264 = tt.addptr %tmp76_263, %tmp76_262 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc392)
+      %tmp76_265 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x128xi1> loc(#loc393)
+      %tmp76_266 = arith.constant 0.000000e+00 : f32 loc(#loc394)
+      %tmp76_267 = tt.broadcast %tmp76_265 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc394)
+      %tmp76_268 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc394)
+      %tmp76_269 = arith.truncf %tmp76_268 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc394)
+      %tmp76_270 = tt.load %tmp76_264, %tmp76_267, %tmp76_269 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc394)
+      %tmp76_271 = arith.extf %tmp76_270 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc395)
+      %tmp78 = arith.mulf %tmp75_254, %tmp76_271 : tensor<8x128xf32> loc(#loc396)
+      %tmp80 = arith.constant 0.000000e+00 : f32 loc(#loc397)
+      %tmp80_272 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc397)
+      %tmp80_273 = arith.subf %tmp80_272, %tmp78 : tensor<8x128xf32> loc(#loc397)
+      %tmp81 = arith.constant 0.000000e+00 : f32 loc(#loc398)
+      %tmp81_274 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc398)
+      %tmp82 = tt.broadcast %tmp16_116 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc399)
+      %tmp82_275 = arith.select %tmp82, %tmp80_273, %tmp81_274 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc399)
+      %tmp83 = arith.constant 2 : i32 loc(#loc400)
+      %tmp83_276 = arith.constant 2 : i32 loc(#loc400)
+      %tmp83_277 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc400)
+      %tmp83_278 = arith.muli %tmp83_277, %r0_4_30 : tensor<1x128xi32> loc(#loc400)
+      %tmp83_279 = arith.constant 4096 : i32 loc(#loc401)
+      %tmp83_280 = arith.constant 4096 : i32 loc(#loc401)
+      %tmp83_281 = arith.constant dense<4096> : tensor<1x128xi32> loc(#loc401)
+      %tmp83_282 = arith.addi %tmp83_281, %tmp83_278 : tensor<1x128xi32> loc(#loc401)
+      %tmp83_283 = arith.constant 128 : i32 loc(#loc402)
+      %tmp83_284 = arith.constant 128 : i32 loc(#loc402)
+      %tmp83_285 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc402)
+      %tmp83_286 = arith.muli %tmp83_285, %x0_12 : tensor<8x1xi32> loc(#loc402)
+      %tmp83_287 = tt.broadcast %tmp83_282 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc403)
+      %tmp83_288 = tt.broadcast %tmp83_286 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc403)
+      %tmp83_289 = arith.addi %tmp83_287, %tmp83_288 : tensor<8x128xi32> loc(#loc403)
+      %tmp83_290 = arith.constant 36864 : i32 loc(#loc404)
+      %tmp83_291 = arith.constant 36864 : i32 loc(#loc404)
+      %tmp83_292 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc404)
+      %tmp83_293 = arith.muli %tmp83_292, %x1_15 : tensor<8x1xi32> loc(#loc404)
+      %tmp83_294 = tt.broadcast %tmp83_293 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc405)
+      %tmp83_295 = arith.addi %tmp83_289, %tmp83_294 : tensor<8x128xi32> loc(#loc405)
+      %tmp83_296 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc406)
+      %tmp83_297 = tt.addptr %tmp83_296, %tmp83_295 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc406)
+      %tmp83_298 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x128xi1> loc(#loc407)
+      %tmp83_299 = arith.constant 0.000000e+00 : f32 loc(#loc408)
+      %tmp83_300 = tt.broadcast %tmp83_298 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc408)
+      %tmp83_301 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc408)
+      %tmp83_302 = arith.truncf %tmp83_301 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc408)
+      %tmp83_303 = tt.load %tmp83_297, %tmp83_300, %tmp83_302 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc408)
+      %tmp83_304 = arith.extf %tmp83_303 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc409)
+      %tmp85 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc410)
+      %tmp85_305 = arith.divf %tmp4_19, %tmp85 : tensor<8x1xf32> loc(#loc410)
+      %tmp86 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc411)
+      %tmp86_306 = arith.addf %tmp85_305, %tmp86 : tensor<8x1xf32> loc(#loc411)
+      %tmp87 = tt.extern_elementwise %tmp86_306 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc412)
+      %tmp88 = tt.broadcast %tmp87 : tensor<8x1xf32> -> tensor<8x128xf32> loc(#loc413)
+      %tmp88_307 = arith.mulf %tmp83_304, %tmp88 : tensor<8x128xf32> loc(#loc413)
+      %tmp89 = arith.constant 2 : i32 loc(#loc414)
+      %tmp89_308 = arith.constant 2 : i32 loc(#loc414)
+      %tmp89_309 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc414)
+      %tmp89_310 = arith.muli %tmp89_309, %r0_4_30 : tensor<1x128xi32> loc(#loc414)
+      %tmp89_311 = tt.broadcast %tmp89_310 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc415)
+      %tmp89_312 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc416)
+      %tmp89_313 = tt.addptr %tmp89_312, %tmp89_311 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc416)
+      %tmp89_314 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x128xi1> loc(#loc417)
+      %tmp89_315 = arith.constant 0.000000e+00 : f32 loc(#loc418)
+      %tmp89_316 = tt.broadcast %tmp89_314 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc418)
+      %tmp89_317 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc418)
+      %tmp89_318 = arith.truncf %tmp89_317 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc418)
+      %tmp89_319 = tt.load %tmp89_313, %tmp89_316, %tmp89_318 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc418)
+      %tmp89_320 = arith.extf %tmp89_319 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc419)
+      %tmp91 = arith.mulf %tmp88_307, %tmp89_320 : tensor<8x128xf32> loc(#loc420)
+      %tmp93 = arith.constant 0.000000e+00 : f32 loc(#loc421)
+      %tmp93_321 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc421)
+      %tmp94 = tt.broadcast %tmp32_171 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc422)
+      %tmp94_322 = arith.select %tmp94, %tmp91, %tmp93_321 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc422)
+      %tmp95 = tt.broadcast %tmp16_116 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc423)
+      %tmp95_323 = arith.select %tmp95, %tmp82_275, %tmp94_322 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc423)
+      %tmp98 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc424)
+      %tmp98_324 = arith.divf %tmp4_19, %tmp98 : tensor<8x1xf32> loc(#loc424)
+      %tmp99 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc425)
+      %tmp99_325 = arith.addf %tmp98_324, %tmp99 : tensor<8x1xf32> loc(#loc425)
+      %tmp100 = tt.extern_elementwise %tmp99_325 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc426)
+      %tmp101 = tt.broadcast %tmp100 : tensor<8x1xf32> -> tensor<8x128xf32> loc(#loc427)
+      %tmp101_326 = arith.mulf %tmp96_104, %tmp101 : tensor<8x128xf32> loc(#loc427)
+      %tmp104 = tt.broadcast %tmp102_110 : tensor<1x128xf32> -> tensor<8x128xf32> loc(#loc428)
+      %tmp104_327 = arith.mulf %tmp101_326, %tmp104 : tensor<8x128xf32> loc(#loc428)
+      %tmp107 = arith.mulf %tmp104_327, %tmp63_68 : tensor<8x128xf32> loc(#loc429)
+      %tmp109 = arith.mulf %tmp95_323, %tmp66_80 : tensor<8x128xf32> loc(#loc430)
+      %tmp110 = arith.addf %tmp107, %tmp109 : tensor<8x128xf32> loc(#loc431)
+      %c128_i32_328 = arith.constant 128 : i32 loc(#loc204)
+      %c128_i32_329 = arith.constant 128 : i32 loc(#loc204)
+      %cst = arith.constant dense<128> : tensor<8x1xi32> loc(#loc204)
+      %8 = arith.muli %cst, %xindex_7 : tensor<8x1xi32> loc(#loc204)
+      %9 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc205)
+      %10 = tt.broadcast %8 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc205)
+      %11 = arith.addi %9, %10 : tensor<8x128xi32> loc(#loc205)
+      %12 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc206)
+      %13 = tt.addptr %12, %11 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc206)
+      %14 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc207)
+      %15 = arith.truncf %tmp68 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc207)
+      tt.store %13, %15, %14 : tensor<8x128x!tt.ptr<bf16>> loc(#loc207)
+      %c128_i32_330 = arith.constant 128 : i32 loc(#loc208)
+      %c128_i32_331 = arith.constant 128 : i32 loc(#loc208)
+      %cst_332 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc208)
+      %16 = arith.muli %cst_332, %xindex_7 : tensor<8x1xi32> loc(#loc208)
+      %17 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc209)
+      %18 = tt.broadcast %16 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc209)
+      %19 = arith.addi %17, %18 : tensor<8x128xi32> loc(#loc209)
+      %20 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc210)
+      %21 = tt.addptr %20, %19 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc210)
+      %22 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc211)
+      %23 = arith.truncf %tmp110 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc211)
+      tt.store %21, %23, %22 : tensor<8x128x!tt.ptr<bf16>> loc(#loc211)
+    } loc(#loc44)
+    tt.return loc(#loc212)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.sum__fp32S8_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x128xf32> loc("input"(#loc213))) -> tensor<8xf32> attributes {noinline = false} {
+    %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc214)
+      tt.reduce.return %2 : f32 loc(#loc214)
+    }) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc214)
+    tt.return %0 : tensor<8xf32> loc(#loc216)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<8xf32> loc(#loc217)
+    tt.return %1 : tensor<8xf32> loc(#loc217)
+  } loc(#loc213)
+  tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc218)), %b: f32 loc("b"(#loc218))) -> f32 attributes {noinline = false} {
+    %0 = arith.addf %a, %b : f32 loc(#loc219)
+    tt.return %0 : f32 loc(#loc220)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : f32 loc(#loc221)
+    tt.return %1 : f32 loc(#loc221)
+  } loc(#loc218)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":25:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":30:43)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":32:44)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:45)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:56)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:46)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:42)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:53)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:64)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":68:35)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":69:25)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":70:35)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:52)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:63)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":74:16)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":76:16)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:57)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:55)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:63)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:95)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":85:42)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":88:35)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":89:24)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:37)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:48)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:59)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":92:16)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":93:25)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":94:16)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":95:24)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":96:32)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:53)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:59)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:91)
+#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":102:42)
+#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":106:16)
+#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":107:25)
+#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":108:16)
+#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":109:24)
+#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":110:32)
+#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:44)
+#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc149 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:55)
+#loc150 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc151 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:66)
+#loc152 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc153 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc154 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:81)
+#loc155 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc156 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc157 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc158 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc159 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc160 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc161 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:57)
+#loc162 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:55)
+#loc163 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:63)
+#loc164 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc165 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:95)
+#loc166 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc167 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc168 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc169 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc170 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":132:42)
+#loc171 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc172 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:44)
+#loc173 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc174 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:55)
+#loc175 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc176 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:66)
+#loc177 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc178 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc179 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:81)
+#loc180 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc181 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc182 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":136:24)
+#loc183 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":137:24)
+#loc184 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":138:32)
+#loc185 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc186 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:53)
+#loc187 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:59)
+#loc188 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc189 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:91)
+#loc190 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc191 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc192 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc193 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":144:42)
+#loc194 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc195 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc196 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":148:24)
+#loc197 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":149:24)
+#loc198 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":150:33)
+#loc199 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc200 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc201 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc202 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc203 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc204 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc205 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc206 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc207 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc208 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:43)
+#loc209 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:39)
+#loc210 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc211 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc212 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc214 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc216 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11)
+#loc217 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4)
+#loc219 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc220 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11)
+#loc221 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4)
+#loc231 = loc("xnumel"(#loc1))
+#loc232 = loc("r0_numel"(#loc2))
+#loc233 = loc("xoffset"(#loc3))
+#loc234 = loc("xoffset"(#loc4))
+#loc235 = loc("xindex"(#loc5))
+#loc236 = loc("xindex"(#loc6))
+#loc237 = loc("xindex"(#loc7))
+#loc238 = loc("xmask"(#loc8))
+#loc239 = loc("r0_base"(#loc9))
+#loc240 = loc("r0_base"(#loc10))
+#loc241 = loc("x0"(#loc11))
+#loc242 = loc("x1"(#loc12))
+#loc243 = loc("_tmp4"(#loc13))
+#loc244 = loc("_tmp10"(#loc14))
+#loc245 = loc("_tmp4"(#loc15))
+#loc246 = loc("r0_index"(#loc16))
+#loc247 = loc("r0_mask"(#loc17))
+#loc248 = loc("tmp0"(#loc18))
+#loc249 = loc("tmp0"(#loc19))
+#loc250 = loc("tmp0"(#loc20))
+#loc251 = loc("tmp0"(#loc21))
+#loc252 = loc("tmp0"(#loc22))
+#loc253 = loc("tmp0"(#loc23))
+#loc254 = loc("tmp0"(#loc24))
+#loc255 = loc("tmp0"(#loc25))
+#loc256 = loc("tmp6"(#loc26))
+#loc257 = loc("tmp6"(#loc27))
+#loc258 = loc("tmp6"(#loc28))
+#loc259 = loc("tmp6"(#loc29))
+#loc260 = loc("tmp6"(#loc30))
+#loc261 = loc("tmp6"(#loc31))
+#loc262 = loc("tmp6"(#loc32))
+#loc263 = loc("tmp2"(#loc33))
+#loc264 = loc("tmp5"(#loc34))
+#loc265 = loc("_tmp4"(#loc35))
+#loc266 = loc("tmp8"(#loc36))
+#loc267 = loc("tmp11"(#loc37))
+#loc268 = loc("_tmp10"(#loc38))
+#loc269 = loc("tmp4"(#loc40))
+#loc270 = loc("tmp4"(#loc41))
+#loc271 = loc("tmp10"(#loc42))
+#loc272 = loc("tmp10"(#loc43))
+#loc273 = loc("r0_index"(#loc45))
+#loc274 = loc("r0_mask"(#loc46))
+#loc275 = loc("r0_3"(#loc47))
+#loc276 = loc("r0_4"(#loc48))
+#loc277 = loc("tmp50"(#loc49))
+#loc278 = loc("tmp50"(#loc50))
+#loc279 = loc("tmp50"(#loc51))
+#loc280 = loc("tmp50"(#loc52))
+#loc281 = loc("tmp50"(#loc53))
+#loc282 = loc("tmp50"(#loc54))
+#loc283 = loc("tmp50"(#loc55))
+#loc284 = loc("tmp58"(#loc56))
+#loc285 = loc("tmp58"(#loc57))
+#loc286 = loc("tmp58"(#loc58))
+#loc287 = loc("tmp63"(#loc59))
+#loc288 = loc("tmp63"(#loc60))
+#loc289 = loc("tmp63"(#loc61))
+#loc290 = loc("tmp63"(#loc62))
+#loc291 = loc("tmp66"(#loc63))
+#loc292 = loc("tmp66"(#loc64))
+#loc293 = loc("tmp66"(#loc65))
+#loc294 = loc("tmp66"(#loc66))
+#loc295 = loc("tmp96"(#loc67))
+#loc296 = loc("tmp96"(#loc68))
+#loc297 = loc("tmp96"(#loc69))
+#loc298 = loc("tmp96"(#loc70))
+#loc299 = loc("tmp96"(#loc71))
+#loc300 = loc("tmp96"(#loc72))
+#loc301 = loc("tmp96"(#loc73))
+#loc302 = loc("tmp96"(#loc74))
+#loc303 = loc("tmp102"(#loc75))
+#loc304 = loc("tmp102"(#loc76))
+#loc305 = loc("tmp102"(#loc77))
+#loc306 = loc("tmp13"(#loc78))
+#loc307 = loc("tmp14"(#loc79))
+#loc308 = loc("tmp15"(#loc80))
+#loc309 = loc("tmp16"(#loc81))
+#loc310 = loc("tmp17"(#loc82))
+#loc311 = loc("tmp17"(#loc83))
+#loc312 = loc("tmp17"(#loc84))
+#loc313 = loc("tmp17"(#loc85))
+#loc314 = loc("tmp17"(#loc86))
+#loc315 = loc("tmp17"(#loc87))
+#loc316 = loc("tmp17"(#loc88))
+#loc317 = loc("tmp17"(#loc89))
+#loc318 = loc("tmp17"(#loc90))
+#loc319 = loc("tmp17"(#loc91))
+#loc320 = loc("tmp19"(#loc92))
+#loc321 = loc("tmp20"(#loc93))
+#loc322 = loc("tmp21"(#loc94))
+#loc323 = loc("tmp22"(#loc95))
+#loc324 = loc("tmp23"(#loc96))
+#loc325 = loc("tmp24"(#loc97))
+#loc326 = loc("tmp25"(#loc98))
+#loc327 = loc("tmp25"(#loc99))
+#loc328 = loc("tmp25"(#loc100))
+#loc329 = loc("tmp25"(#loc101))
+#loc330 = loc("tmp25"(#loc102))
+#loc331 = loc("tmp25"(#loc103))
+#loc332 = loc("tmp25"(#loc104))
+#loc333 = loc("tmp27"(#loc105))
+#loc334 = loc("tmp29"(#loc106))
+#loc335 = loc("tmp30"(#loc107))
+#loc336 = loc("tmp31"(#loc108))
+#loc337 = loc("tmp32"(#loc109))
+#loc338 = loc("tmp33"(#loc110))
+#loc339 = loc("tmp34"(#loc111))
+#loc340 = loc("tmp35"(#loc112))
+#loc341 = loc("tmp35"(#loc113))
+#loc342 = loc("tmp35"(#loc114))
+#loc343 = loc("tmp35"(#loc115))
+#loc344 = loc("tmp35"(#loc116))
+#loc345 = loc("tmp35"(#loc117))
+#loc346 = loc("tmp35"(#loc118))
+#loc347 = loc("tmp35"(#loc119))
+#loc348 = loc("tmp35"(#loc120))
+#loc349 = loc("tmp37"(#loc121))
+#loc350 = loc("tmp38"(#loc122))
+#loc351 = loc("tmp39"(#loc123))
+#loc352 = loc("tmp40"(#loc124))
+#loc353 = loc("tmp41"(#loc125))
+#loc354 = loc("tmp42"(#loc126))
+#loc355 = loc("tmp43"(#loc127))
+#loc356 = loc("tmp43"(#loc128))
+#loc357 = loc("tmp43"(#loc129))
+#loc358 = loc("tmp43"(#loc130))
+#loc359 = loc("tmp43"(#loc131))
+#loc360 = loc("tmp43"(#loc132))
+#loc361 = loc("tmp45"(#loc133))
+#loc362 = loc("tmp47"(#loc134))
+#loc363 = loc("tmp48"(#loc135))
+#loc364 = loc("tmp49"(#loc136))
+#loc365 = loc("tmp52"(#loc137))
+#loc366 = loc("tmp53"(#loc138))
+#loc367 = loc("tmp54"(#loc139))
+#loc368 = loc("tmp55"(#loc140))
+#loc369 = loc("tmp56"(#loc141))
+#loc370 = loc("tmp57"(#loc142))
+#loc371 = loc("tmp60"(#loc143))
+#loc372 = loc("tmp64"(#loc144))
+#loc373 = loc("tmp67"(#loc145))
+#loc374 = loc("tmp68"(#loc146))
+#loc375 = loc("tmp70"(#loc147))
+#loc376 = loc("tmp70"(#loc148))
+#loc377 = loc("tmp70"(#loc149))
+#loc378 = loc("tmp70"(#loc150))
+#loc379 = loc("tmp70"(#loc151))
+#loc380 = loc("tmp70"(#loc152))
+#loc381 = loc("tmp70"(#loc153))
+#loc382 = loc("tmp70"(#loc154))
+#loc383 = loc("tmp70"(#loc155))
+#loc384 = loc("tmp70"(#loc156))
+#loc385 = loc("tmp72"(#loc157))
+#loc386 = loc("tmp73"(#loc158))
+#loc387 = loc("tmp74"(#loc159))
+#loc388 = loc("tmp75"(#loc160))
+#loc389 = loc("tmp76"(#loc161))
+#loc390 = loc("tmp76"(#loc162))
+#loc391 = loc("tmp76"(#loc163))
+#loc392 = loc("tmp76"(#loc164))
+#loc393 = loc("tmp76"(#loc165))
+#loc394 = loc("tmp76"(#loc166))
+#loc395 = loc("tmp76"(#loc167))
+#loc396 = loc("tmp78"(#loc168))
+#loc397 = loc("tmp80"(#loc169))
+#loc398 = loc("tmp81"(#loc170))
+#loc399 = loc("tmp82"(#loc171))
+#loc400 = loc("tmp83"(#loc172))
+#loc401 = loc("tmp83"(#loc173))
+#loc402 = loc("tmp83"(#loc174))
+#loc403 = loc("tmp83"(#loc175))
+#loc404 = loc("tmp83"(#loc176))
+#loc405 = loc("tmp83"(#loc177))
+#loc406 = loc("tmp83"(#loc178))
+#loc407 = loc("tmp83"(#loc179))
+#loc408 = loc("tmp83"(#loc180))
+#loc409 = loc("tmp83"(#loc181))
+#loc410 = loc("tmp85"(#loc182))
+#loc411 = loc("tmp86"(#loc183))
+#loc412 = loc("tmp87"(#loc184))
+#loc413 = loc("tmp88"(#loc185))
+#loc414 = loc("tmp89"(#loc186))
+#loc415 = loc("tmp89"(#loc187))
+#loc416 = loc("tmp89"(#loc188))
+#loc417 = loc("tmp89"(#loc189))
+#loc418 = loc("tmp89"(#loc190))
+#loc419 = loc("tmp89"(#loc191))
+#loc420 = loc("tmp91"(#loc192))
+#loc421 = loc("tmp93"(#loc193))
+#loc422 = loc("tmp94"(#loc194))
+#loc423 = loc("tmp95"(#loc195))
+#loc424 = loc("tmp98"(#loc196))
+#loc425 = loc("tmp99"(#loc197))
+#loc426 = loc("tmp100"(#loc198))
+#loc427 = loc("tmp101"(#loc199))
+#loc428 = loc("tmp104"(#loc200))
+#loc429 = loc("tmp107"(#loc201))
+#loc430 = loc("tmp109"(#loc202))
+#loc431 = loc("tmp110"(#loc203))
+#loc435 = loc("_tmp10"(#loc245))
diff --git a/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..0da48da30f25503de803c35d455f5e28fea932dd
--- /dev/null
+++ b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir
@@ -0,0 +1,487 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 8], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc1 = loc(unknown)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc130 = loc("in_out_ptr0"(#loc))
+#loc131 = loc("in_out_ptr1"(#loc))
+#loc132 = loc("in_ptr0"(#loc))
+#loc133 = loc("in_ptr1"(#loc))
+#loc134 = loc("in_ptr2"(#loc))
+#loc135 = loc("in_ptr3"(#loc))
+#loc136 = loc("in_ptr4"(#loc))
+#loc137 = loc("xnumel"(#loc))
+#loc138 = loc("r0_numel"(#loc))
+#loc166 = loc("tmp4"(#loc30))
+#loc168 = loc("tmp10"(#loc33))
+#loc259 = loc(callsite(#loc1 at #loc166))
+#loc261 = loc(callsite(#loc1 at #loc168))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4097> : tensor<1x128xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x128xbf16, #blocked1> loc(#loc1)
+    %cst_1 = arith.constant dense<1> : tensor<1x128xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<1> : tensor<1x128xi64, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<2> : tensor<1x128xi32, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<36864> : tensor<8x1xi32, #blocked> loc(#loc1)
+    %cst_5 = arith.constant dense<36864> : tensor<8x1xi32, #blocked1> loc(#loc1)
+    %cst_6 = arith.constant dense<128> : tensor<8x1xi32, #blocked> loc(#loc1)
+    %cst_7 = arith.constant dense<128> : tensor<8x1xi32, #blocked1> loc(#loc1)
+    %cst_8 = arith.constant dense<4096> : tensor<1x128xi32, #blocked> loc(#loc1)
+    %cst_9 = arith.constant dense<4096> : tensor<1x128xi32, #blocked1> loc(#loc1)
+    %cst_10 = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1)
+    %cst_11 = arith.constant dense<128> : tensor<1x128xi32, #blocked1> loc(#loc1)
+    %cst_12 = arith.constant dense<32> : tensor<8x1xi32, #blocked> loc(#loc1)
+    %cst_13 = arith.constant dense<32> : tensor<8x1xi32, #blocked1> loc(#loc1)
+    %c8_i32 = arith.constant 8 : i32 loc(#loc1)
+    %cst_14 = arith.constant dense<0.000000e+00> : tensor<8x128xbf16, #blocked1> loc(#loc1)
+    %cst_15 = arith.constant dense<0.000000e+00> : tensor<8x128xbf16, #blocked> loc(#loc1)
+    %cst_16 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32, #blocked1> loc(#loc1)
+    %cst_17 = arith.constant dense<1.280000e+02> : tensor<8x1xf32, #blocked1> loc(#loc1)
+    %cst_18 = arith.constant dense<0.000000e+00> : tensor<8x128xf32, #blocked> loc(#loc1)
+    %cst_19 = arith.constant dense<0.000000e+00> : tensor<8x128xf32, #blocked1> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc139)
+    %xoffset_20 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc140)
+    %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc141)
+    %xindex_21 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc141)
+    %xindex_22 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xi32, #blocked1> loc(#loc141)
+    %xindex_23 = tt.expand_dims %xindex_21 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc141)
+    %xindex_24 = tt.splat %xoffset_20 : i32 -> tensor<8x1xi32, #blocked1> loc(#loc142)
+    %xindex_25 = tt.splat %xoffset_20 : i32 -> tensor<8x1xi32, #blocked> loc(#loc142)
+    %xindex_26 = arith.addi %xindex_24, %xindex_22 : tensor<8x1xi32, #blocked1> loc(#loc142)
+    %xindex_27 = arith.addi %xindex_25, %xindex_23 : tensor<8x1xi32, #blocked> loc(#loc142)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc143)
+    %r0_base_28 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc143)
+    %r0_base_29 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x128xi32, #blocked1> loc(#loc143)
+    %r0_base_30 = tt.expand_dims %r0_base_28 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc143)
+    %x0 = arith.remsi %xindex_26, %cst_13 : tensor<8x1xi32, #blocked1> loc(#loc144)
+    %x0_31 = arith.remsi %xindex_27, %cst_12 : tensor<8x1xi32, #blocked> loc(#loc144)
+    %x1 = arith.divsi %xindex_26, %cst_13 : tensor<8x1xi32, #blocked1> loc(#loc145)
+    %x1_32 = arith.divsi %xindex_27, %cst_12 : tensor<8x1xi32, #blocked> loc(#loc145)
+    %r0_mask = arith.cmpi slt, %r0_base_29, %cst_11 : tensor<1x128xi32, #blocked1> loc(#loc146)
+    %r0_mask_33 = arith.cmpi slt, %r0_base_30, %cst_10 : tensor<1x128xi32, #blocked> loc(#loc146)
+    %tmp0 = arith.addi %r0_base_29, %cst_9 : tensor<1x128xi32, #blocked1> loc(#loc147)
+    %tmp0_34 = arith.muli %x0, %cst_7 : tensor<8x1xi32, #blocked1> loc(#loc148)
+    %tmp0_35 = arith.muli %x0_31, %cst_6 : tensor<8x1xi32, #blocked> loc(#loc148)
+    %tmp0_36 = tt.broadcast %tmp0 : tensor<1x128xi32, #blocked1> -> tensor<8x128xi32, #blocked1> loc(#loc149)
+    %tmp0_37 = tt.broadcast %tmp0_34 : tensor<8x1xi32, #blocked1> -> tensor<8x128xi32, #blocked1> loc(#loc149)
+    %tmp0_38 = tt.broadcast %tmp0_35 : tensor<8x1xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc149)
+    %tmp0_39 = arith.addi %tmp0_36, %tmp0_37 : tensor<8x128xi32, #blocked1> loc(#loc149)
+    %tmp0_40 = arith.muli %x1, %cst_5 : tensor<8x1xi32, #blocked1> loc(#loc150)
+    %tmp0_41 = arith.muli %x1_32, %cst_4 : tensor<8x1xi32, #blocked> loc(#loc150)
+    %tmp0_42 = tt.broadcast %tmp0_40 : tensor<8x1xi32, #blocked1> -> tensor<8x128xi32, #blocked1> loc(#loc151)
+    %tmp0_43 = tt.broadcast %tmp0_41 : tensor<8x1xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc151)
+    %tmp0_44 = arith.addi %tmp0_39, %tmp0_42 : tensor<8x128xi32, #blocked1> loc(#loc151)
+    %tmp0_45 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>, #blocked1> loc(#loc152)
+    %tmp0_46 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>, #blocked> loc(#loc152)
+    %tmp0_47 = tt.addptr %tmp0_45, %tmp0_44 : tensor<8x128x!tt.ptr<bf16>, #blocked1>, tensor<8x128xi32, #blocked1> loc(#loc152)
+    %tmp0_48 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked1> -> tensor<8x128xi1, #blocked1> loc(#loc153)
+    %tmp0_49 = tt.load %tmp0_47, %tmp0_48, %cst_14 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>, #blocked1> loc(#loc153)
+    %tmp0_50 = arith.extf %tmp0_49 : tensor<8x128xbf16, #blocked1> to tensor<8x128xf32, #blocked1> loc(#loc154)
+    %tmp6 = tt.broadcast %r0_base_29 : tensor<1x128xi32, #blocked1> -> tensor<8x128xi32, #blocked1> loc(#loc155)
+    %tmp6_51 = arith.addi %tmp6, %tmp0_37 : tensor<8x128xi32, #blocked1> loc(#loc155)
+    %tmp6_52 = arith.addi %tmp6_51, %tmp0_42 : tensor<8x128xi32, #blocked1> loc(#loc156)
+    %tmp6_53 = tt.addptr %tmp0_45, %tmp6_52 : tensor<8x128x!tt.ptr<bf16>, #blocked1>, tensor<8x128xi32, #blocked1> loc(#loc157)
+    %tmp6_54 = tt.load %tmp6_53, %tmp0_48, %cst_14 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>, #blocked1> loc(#loc158)
+    %tmp6_55 = arith.extf %tmp6_54 : tensor<8x128xbf16, #blocked1> to tensor<8x128xf32, #blocked1> loc(#loc159)
+    %tmp2 = arith.mulf %tmp0_50, %tmp0_50 : tensor<8x128xf32, #blocked1> loc(#loc160)
+    %tmp5 = arith.addf %tmp2, %cst_19 : tensor<8x128xf32, #blocked1> loc(#loc161)
+    %_tmp4 = arith.select %tmp0_48, %tmp5, %cst_19 : tensor<8x128xi1, #blocked1>, tensor<8x128xf32, #blocked1> loc(#loc162)
+    %tmp8 = arith.mulf %tmp6_55, %tmp6_55 : tensor<8x128xf32, #blocked1> loc(#loc163)
+    %tmp11 = arith.addf %tmp8, %cst_19 : tensor<8x128xf32, #blocked1> loc(#loc164)
+    %_tmp10 = arith.select %tmp0_48, %tmp11, %cst_19 : tensor<8x128xi1, #blocked1>, tensor<8x128xf32, #blocked1> loc(#loc165)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_127: f32 loc(callsite(#loc1 at #loc166)), %tmp4_128: f32 loc(callsite(#loc1 at #loc166))):
+      %tmp4_129 = arith.addf %tmp4_127, %tmp4_128 : f32 loc(#loc264)
+      tt.reduce.return %tmp4_129 : f32 loc(#loc258)
+    }) : (tensor<8x128xf32, #blocked1>) -> tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc258)
+    %tmp4_56 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xf32, #blocked1> loc(#loc167)
+    %tmp10 = "tt.reduce"(%_tmp10) <{axis = 1 : i32}> ({
+    ^bb0(%tmp10_127: f32 loc(callsite(#loc1 at #loc168)), %tmp10_128: f32 loc(callsite(#loc1 at #loc168))):
+      %tmp10_129 = arith.addf %tmp10_127, %tmp10_128 : f32 loc(#loc265)
+      tt.reduce.return %tmp10_129 : f32 loc(#loc260)
+    }) : (tensor<8x128xf32, #blocked1>) -> tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc260)
+    %tmp10_57 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xf32, #blocked1> loc(#loc169)
+    %r0_3 = arith.remsi %r0_base_30, %cst_3 : tensor<1x128xi32, #blocked> loc(#loc170)
+    %r0_4 = arith.divsi %r0_base_30, %cst_3 : tensor<1x128xi32, #blocked> loc(#loc171)
+    %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>, #blocked> loc(#loc172)
+    %tmp58_58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>, #blocked1> loc(#loc172)
+    %tmp58_59 = tt.addptr %tmp58_58, %r0_base_29 : tensor<1x128x!tt.ptr<bf16>, #blocked1>, tensor<1x128xi32, #blocked1> loc(#loc172)
+    %tmp58_60 = tt.load %tmp58_59, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x128x!tt.ptr<bf16>, #blocked1> loc(#loc173)
+    %tmp58_61 = arith.extf %tmp58_60 : tensor<1x128xbf16, #blocked1> to tensor<1x128xf32, #blocked1> loc(#loc174)
+    %tmp63 = arith.muli %x1, %cst_7 : tensor<8x1xi32, #blocked1> loc(#loc175)
+    %tmp63_62 = tt.broadcast %tmp63 : tensor<8x1xi32, #blocked1> -> tensor<8x128xi32, #blocked1> loc(#loc176)
+    %tmp63_63 = arith.addi %tmp6, %tmp63_62 : tensor<8x128xi32, #blocked1> loc(#loc176)
+    %tmp63_64 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<8x128x!tt.ptr<f32>, #blocked1> loc(#loc177)
+    %tmp63_65 = tt.addptr %tmp63_64, %tmp63_63 : tensor<8x128x!tt.ptr<f32>, #blocked1>, tensor<8x128xi32, #blocked1> loc(#loc177)
+    %tmp63_66 = tt.load %tmp63_65, %tmp0_48, %cst_19 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<f32>, #blocked1> loc(#loc178)
+    %tmp66 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<8x128x!tt.ptr<f32>, #blocked1> loc(#loc179)
+    %tmp66_67 = tt.addptr %tmp66, %tmp63_63 : tensor<8x128x!tt.ptr<f32>, #blocked1>, tensor<8x128xi32, #blocked1> loc(#loc179)
+    %tmp66_68 = tt.load %tmp66_67, %tmp0_48, %cst_19 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<f32>, #blocked1> loc(#loc180)
+    %tmp66_69 = ttg.convert_layout %tmp66_68 : tensor<8x128xf32, #blocked1> -> tensor<8x128xf32, #blocked> loc(#loc180)
+    %tmp96 = tt.load %tmp0_47, %tmp0_48, %cst_14 evictionPolicy = evict_first : tensor<8x128x!tt.ptr<bf16>, #blocked1> loc(#loc181)
+    %tmp96_70 = arith.extf %tmp96 : tensor<8x128xbf16, #blocked1> to tensor<8x128xf32, #blocked1> loc(#loc182)
+    %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>, #blocked> loc(#loc183)
+    %tmp102_71 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>, #blocked1> loc(#loc183)
+    %tmp102_72 = tt.addptr %tmp102_71, %r0_base_29 : tensor<1x128x!tt.ptr<bf16>, #blocked1>, tensor<1x128xi32, #blocked1> loc(#loc183)
+    %tmp102_73 = tt.load %tmp102_72, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x128x!tt.ptr<bf16>, #blocked1> loc(#loc184)
+    %tmp102_74 = arith.extf %tmp102_73 : tensor<1x128xbf16, #blocked1> to tensor<1x128xf32, #blocked1> loc(#loc185)
+    %tmp16 = arith.extsi %r0_3 : tensor<1x128xi32, #blocked> to tensor<1x128xi64, #blocked> loc(#loc186)
+    %tmp16_75 = arith.cmpi slt, %tmp16, %cst_2 : tensor<1x128xi64, #blocked> loc(#loc186)
+    %tmp17 = arith.muli %r0_4, %cst_3 : tensor<1x128xi32, #blocked> loc(#loc187)
+    %tmp17_76 = arith.addi %tmp17, %cst_1 : tensor<1x128xi32, #blocked> loc(#loc188)
+    %tmp17_77 = tt.broadcast %tmp17_76 : tensor<1x128xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc189)
+    %tmp17_78 = arith.addi %tmp17_77, %tmp0_38 : tensor<8x128xi32, #blocked> loc(#loc189)
+    %tmp17_79 = arith.addi %tmp17_78, %tmp0_43 : tensor<8x128xi32, #blocked> loc(#loc190)
+    %tmp17_80 = tt.addptr %tmp0_46, %tmp17_79 : tensor<8x128x!tt.ptr<bf16>, #blocked>, tensor<8x128xi32, #blocked> loc(#loc191)
+    %tmp17_81 = arith.andi %r0_mask_33, %tmp16_75 : tensor<1x128xi1, #blocked> loc(#loc192)
+    %tmp17_82 = tt.broadcast %tmp17_81 : tensor<1x128xi1, #blocked> -> tensor<8x128xi1, #blocked> loc(#loc193)
+    %tmp17_83 = tt.load %tmp17_80, %tmp17_82, %cst_15 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>, #blocked> loc(#loc193)
+    %tmp17_84 = arith.extf %tmp17_83 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc194)
+    %tmp20 = arith.divf %tmp10_57, %cst_17 : tensor<8x1xf32, #blocked1> loc(#loc195)
+    %tmp22 = arith.addf %tmp20, %cst_16 : tensor<8x1xf32, #blocked1> loc(#loc196)
+    %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32, #blocked1>) -> tensor<8x1xf32, #blocked1> loc(#loc197)
+    %tmp24 = ttg.convert_layout %tmp23 : tensor<8x1xf32, #blocked1> -> tensor<8x1xf32, #blocked> loc(#loc198)
+    %tmp24_85 = tt.broadcast %tmp24 : tensor<8x1xf32, #blocked> -> tensor<8x128xf32, #blocked> loc(#loc198)
+    %tmp24_86 = tt.broadcast %tmp23 : tensor<8x1xf32, #blocked1> -> tensor<8x128xf32, #blocked1> loc(#loc198)
+    %tmp24_87 = arith.mulf %tmp17_84, %tmp24_85 : tensor<8x128xf32, #blocked> loc(#loc198)
+    %tmp25 = tt.addptr %tmp58, %tmp17_76 : tensor<1x128x!tt.ptr<bf16>, #blocked>, tensor<1x128xi32, #blocked> loc(#loc199)
+    %tmp25_88 = tt.broadcast %tmp25 : tensor<1x128x!tt.ptr<bf16>, #blocked> -> tensor<8x128x!tt.ptr<bf16>, #blocked> loc(#loc199)
+    %tmp25_89 = tt.load %tmp25_88, %tmp17_82, %cst_15 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>, #blocked> loc(#loc200)
+    %tmp25_90 = arith.extf %tmp25_89 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc201)
+    %tmp27 = arith.mulf %tmp24_87, %tmp25_90 : tensor<8x128xf32, #blocked> loc(#loc202)
+    %tmp29 = arith.subf %cst_18, %tmp27 : tensor<8x128xf32, #blocked> loc(#loc203)
+    %tmp31 = tt.broadcast %tmp16_75 : tensor<1x128xi1, #blocked> -> tensor<8x128xi1, #blocked> loc(#loc204)
+    %tmp32 = arith.cmpi sge, %tmp16, %cst_2 : tensor<1x128xi64, #blocked> loc(#loc205)
+    %tmp35 = tt.broadcast %tmp17 : tensor<1x128xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc206)
+    %tmp35_91 = arith.addi %tmp35, %tmp0_38 : tensor<8x128xi32, #blocked> loc(#loc206)
+    %tmp35_92 = arith.addi %tmp35_91, %tmp0_43 : tensor<8x128xi32, #blocked> loc(#loc207)
+    %tmp35_93 = tt.addptr %tmp0_46, %tmp35_92 : tensor<8x128x!tt.ptr<bf16>, #blocked>, tensor<8x128xi32, #blocked> loc(#loc208)
+    %tmp35_94 = arith.andi %r0_mask_33, %tmp32 : tensor<1x128xi1, #blocked> loc(#loc209)
+    %tmp35_95 = tt.broadcast %tmp35_94 : tensor<1x128xi1, #blocked> -> tensor<8x128xi1, #blocked> loc(#loc210)
+    %tmp35_96 = tt.load %tmp35_93, %tmp35_95, %cst_15 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>, #blocked> loc(#loc210)
+    %tmp35_97 = arith.extf %tmp35_96 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc211)
+    %tmp42 = arith.mulf %tmp35_97, %tmp24_85 : tensor<8x128xf32, #blocked> loc(#loc212)
+    %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x128x!tt.ptr<bf16>, #blocked>, tensor<1x128xi32, #blocked> loc(#loc213)
+    %tmp43_98 = tt.broadcast %tmp43 : tensor<1x128x!tt.ptr<bf16>, #blocked> -> tensor<8x128x!tt.ptr<bf16>, #blocked> loc(#loc213)
+    %tmp43_99 = tt.load %tmp43_98, %tmp35_95, %cst_15 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>, #blocked> loc(#loc214)
+    %tmp43_100 = arith.extf %tmp43_99 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc215)
+    %tmp45 = arith.mulf %tmp42, %tmp43_100 : tensor<8x128xf32, #blocked> loc(#loc216)
+    %tmp48 = tt.broadcast %tmp32 : tensor<1x128xi1, #blocked> -> tensor<8x128xi1, #blocked> loc(#loc217)
+    %tmp48_101 = arith.select %tmp48, %tmp45, %cst_18 : tensor<8x128xi1, #blocked>, tensor<8x128xf32, #blocked> loc(#loc217)
+    %tmp49 = arith.select %tmp31, %tmp29, %tmp48_101 : tensor<8x128xi1, #blocked>, tensor<8x128xf32, #blocked> loc(#loc262)
+    %tmp57 = arith.mulf %tmp6_55, %tmp24_86 : tensor<8x128xf32, #blocked1> loc(#loc219)
+    %tmp60 = tt.broadcast %tmp58_61 : tensor<1x128xf32, #blocked1> -> tensor<8x128xf32, #blocked1> loc(#loc220)
+    %tmp60_102 = arith.mulf %tmp57, %tmp60 : tensor<8x128xf32, #blocked1> loc(#loc220)
+    %tmp64 = arith.mulf %tmp60_102, %tmp63_66 : tensor<8x128xf32, #blocked1> loc(#loc221)
+    %tmp64_103 = ttg.convert_layout %tmp64 : tensor<8x128xf32, #blocked1> -> tensor<8x128xf32, #blocked> loc(#loc221)
+    %tmp67 = arith.mulf %tmp49, %tmp66_69 : tensor<8x128xf32, #blocked> loc(#loc222)
+    %tmp68 = arith.addf %tmp64_103, %tmp67 : tensor<8x128xf32, #blocked> loc(#loc223)
+    %tmp70 = arith.addi %tmp17, %cst : tensor<1x128xi32, #blocked> loc(#loc224)
+    %tmp70_104 = tt.broadcast %tmp70 : tensor<1x128xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc225)
+    %tmp70_105 = arith.addi %tmp70_104, %tmp0_38 : tensor<8x128xi32, #blocked> loc(#loc225)
+    %tmp70_106 = arith.addi %tmp70_105, %tmp0_43 : tensor<8x128xi32, #blocked> loc(#loc226)
+    %tmp70_107 = tt.addptr %tmp0_46, %tmp70_106 : tensor<8x128x!tt.ptr<bf16>, #blocked>, tensor<8x128xi32, #blocked> loc(#loc227)
+    %tmp70_108 = tt.load %tmp70_107, %tmp17_82, %cst_15 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>, #blocked> loc(#loc228)
+    %tmp70_109 = arith.extf %tmp70_108 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc229)
+    %tmp72 = arith.divf %tmp4_56, %cst_17 : tensor<8x1xf32, #blocked1> loc(#loc230)
+    %tmp73 = arith.addf %tmp72, %cst_16 : tensor<8x1xf32, #blocked1> loc(#loc231)
+    %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32, #blocked1>) -> tensor<8x1xf32, #blocked1> loc(#loc232)
+    %tmp75 = ttg.convert_layout %tmp74 : tensor<8x1xf32, #blocked1> -> tensor<8x1xf32, #blocked> loc(#loc233)
+    %tmp75_110 = tt.broadcast %tmp75 : tensor<8x1xf32, #blocked> -> tensor<8x128xf32, #blocked> loc(#loc233)
+    %tmp75_111 = tt.broadcast %tmp74 : tensor<8x1xf32, #blocked1> -> tensor<8x128xf32, #blocked1> loc(#loc233)
+    %tmp75_112 = arith.mulf %tmp70_109, %tmp75_110 : tensor<8x128xf32, #blocked> loc(#loc233)
+    %tmp76 = tt.addptr %tmp102, %tmp17_76 : tensor<1x128x!tt.ptr<bf16>, #blocked>, tensor<1x128xi32, #blocked> loc(#loc234)
+    %tmp76_113 = tt.broadcast %tmp76 : tensor<1x128x!tt.ptr<bf16>, #blocked> -> tensor<8x128x!tt.ptr<bf16>, #blocked> loc(#loc234)
+    %tmp76_114 = tt.load %tmp76_113, %tmp17_82, %cst_15 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>, #blocked> loc(#loc235)
+    %tmp76_115 = arith.extf %tmp76_114 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc236)
+    %tmp78 = arith.mulf %tmp75_112, %tmp76_115 : tensor<8x128xf32, #blocked> loc(#loc237)
+    %tmp80 = arith.subf %cst_18, %tmp78 : tensor<8x128xf32, #blocked> loc(#loc238)
+    %tmp83 = arith.addi %tmp17, %cst_8 : tensor<1x128xi32, #blocked> loc(#loc239)
+    %tmp83_116 = tt.broadcast %tmp83 : tensor<1x128xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc240)
+    %tmp83_117 = arith.addi %tmp83_116, %tmp0_38 : tensor<8x128xi32, #blocked> loc(#loc240)
+    %tmp83_118 = arith.addi %tmp83_117, %tmp0_43 : tensor<8x128xi32, #blocked> loc(#loc241)
+    %tmp83_119 = tt.addptr %tmp0_46, %tmp83_118 : tensor<8x128x!tt.ptr<bf16>, #blocked>, tensor<8x128xi32, #blocked> loc(#loc242)
+    %tmp83_120 = tt.load %tmp83_119, %tmp35_95, %cst_15 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>, #blocked> loc(#loc243)
+    %tmp83_121 = arith.extf %tmp83_120 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc244)
+    %tmp88 = arith.mulf %tmp83_121, %tmp75_110 : tensor<8x128xf32, #blocked> loc(#loc245)
+    %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x128x!tt.ptr<bf16>, #blocked>, tensor<1x128xi32, #blocked> loc(#loc246)
+    %tmp89_122 = tt.broadcast %tmp89 : tensor<1x128x!tt.ptr<bf16>, #blocked> -> tensor<8x128x!tt.ptr<bf16>, #blocked> loc(#loc246)
+    %tmp89_123 = tt.load %tmp89_122, %tmp35_95, %cst_15 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>, #blocked> loc(#loc247)
+    %tmp89_124 = arith.extf %tmp89_123 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc248)
+    %tmp91 = arith.mulf %tmp88, %tmp89_124 : tensor<8x128xf32, #blocked> loc(#loc249)
+    %tmp94 = arith.select %tmp48, %tmp91, %cst_18 : tensor<8x128xi1, #blocked>, tensor<8x128xf32, #blocked> loc(#loc250)
+    %tmp95 = arith.select %tmp31, %tmp80, %tmp94 : tensor<8x128xi1, #blocked>, tensor<8x128xf32, #blocked> loc(#loc263)
+    %tmp101 = arith.mulf %tmp96_70, %tmp75_111 : tensor<8x128xf32, #blocked1> loc(#loc253)
+    %tmp104 = tt.broadcast %tmp102_74 : tensor<1x128xf32, #blocked1> -> tensor<8x128xf32, #blocked1> loc(#loc254)
+    %tmp104_125 = arith.mulf %tmp101, %tmp104 : tensor<8x128xf32, #blocked1> loc(#loc254)
+    %tmp107 = arith.mulf %tmp104_125, %tmp63_66 : tensor<8x128xf32, #blocked1> loc(#loc255)
+    %tmp107_126 = ttg.convert_layout %tmp107 : tensor<8x128xf32, #blocked1> -> tensor<8x128xf32, #blocked> loc(#loc255)
+    %tmp109 = arith.mulf %tmp95, %tmp66_69 : tensor<8x128xf32, #blocked> loc(#loc256)
+    %tmp110 = arith.addf %tmp107_126, %tmp109 : tensor<8x128xf32, #blocked> loc(#loc257)
+    %0 = arith.muli %xindex_26, %cst_7 : tensor<8x1xi32, #blocked1> loc(#loc123)
+    %1 = tt.broadcast %0 : tensor<8x1xi32, #blocked1> -> tensor<8x128xi32, #blocked1> loc(#loc124)
+    %2 = arith.addi %tmp6, %1 : tensor<8x128xi32, #blocked1> loc(#loc124)
+    %3 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>, #blocked1> loc(#loc125)
+    %4 = tt.addptr %3, %2 : tensor<8x128x!tt.ptr<bf16>, #blocked1>, tensor<8x128xi32, #blocked1> loc(#loc125)
+    %5 = arith.truncf %tmp68 : tensor<8x128xf32, #blocked> to tensor<8x128xbf16, #blocked> loc(#loc126)
+    %6 = ttg.convert_layout %5 : tensor<8x128xbf16, #blocked> -> tensor<8x128xbf16, #blocked1> loc(#loc126)
+    tt.store %4, %6, %tmp0_48 : tensor<8x128x!tt.ptr<bf16>, #blocked1> loc(#loc126)
+    %7 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>, #blocked1> loc(#loc127)
+    %8 = tt.addptr %7, %2 : tensor<8x128x!tt.ptr<bf16>, #blocked1>, tensor<8x128xi32, #blocked1> loc(#loc127)
+    %9 = arith.truncf %tmp110 : tensor<8x128xf32, #blocked> to tensor<8x128xbf16, #blocked> loc(#loc128)
+    %10 = ttg.convert_layout %9 : tensor<8x128xbf16, #blocked> -> tensor<8x128xbf16, #blocked1> loc(#loc128)
+    tt.store %8, %10, %tmp0_48 : tensor<8x128x!tt.ptr<bf16>, #blocked1> loc(#loc128)
+    tt.return loc(#loc129)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc29 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc31 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc139 = loc("xoffset"(#loc2))
+#loc140 = loc("xoffset"(#loc3))
+#loc141 = loc("xindex"(#loc4))
+#loc142 = loc("xindex"(#loc5))
+#loc143 = loc("r0_base"(#loc6))
+#loc144 = loc("x0"(#loc7))
+#loc145 = loc("x1"(#loc8))
+#loc146 = loc("r0_mask"(#loc9))
+#loc147 = loc("tmp0"(#loc10))
+#loc148 = loc("tmp0"(#loc11))
+#loc149 = loc("tmp0"(#loc12))
+#loc150 = loc("tmp0"(#loc13))
+#loc151 = loc("tmp0"(#loc14))
+#loc152 = loc("tmp0"(#loc15))
+#loc153 = loc("tmp0"(#loc16))
+#loc154 = loc("tmp0"(#loc17))
+#loc155 = loc("tmp6"(#loc18))
+#loc156 = loc("tmp6"(#loc19))
+#loc157 = loc("tmp6"(#loc20))
+#loc158 = loc("tmp6"(#loc21))
+#loc159 = loc("tmp6"(#loc22))
+#loc160 = loc("tmp2"(#loc23))
+#loc161 = loc("tmp5"(#loc24))
+#loc162 = loc("_tmp4"(#loc25))
+#loc163 = loc("tmp8"(#loc26))
+#loc164 = loc("tmp11"(#loc27))
+#loc165 = loc("_tmp10"(#loc28))
+#loc167 = loc("tmp4"(#loc32))
+#loc169 = loc("tmp10"(#loc34))
+#loc170 = loc("r0_3"(#loc35))
+#loc171 = loc("r0_4"(#loc36))
+#loc172 = loc("tmp58"(#loc37))
+#loc173 = loc("tmp58"(#loc38))
+#loc174 = loc("tmp58"(#loc39))
+#loc175 = loc("tmp63"(#loc40))
+#loc176 = loc("tmp63"(#loc41))
+#loc177 = loc("tmp63"(#loc42))
+#loc178 = loc("tmp63"(#loc43))
+#loc179 = loc("tmp66"(#loc44))
+#loc180 = loc("tmp66"(#loc45))
+#loc181 = loc("tmp96"(#loc46))
+#loc182 = loc("tmp96"(#loc47))
+#loc183 = loc("tmp102"(#loc48))
+#loc184 = loc("tmp102"(#loc49))
+#loc185 = loc("tmp102"(#loc50))
+#loc186 = loc("tmp16"(#loc51))
+#loc187 = loc("tmp17"(#loc52))
+#loc188 = loc("tmp17"(#loc53))
+#loc189 = loc("tmp17"(#loc54))
+#loc190 = loc("tmp17"(#loc55))
+#loc191 = loc("tmp17"(#loc56))
+#loc192 = loc("tmp17"(#loc57))
+#loc193 = loc("tmp17"(#loc58))
+#loc194 = loc("tmp17"(#loc59))
+#loc195 = loc("tmp20"(#loc60))
+#loc196 = loc("tmp22"(#loc61))
+#loc197 = loc("tmp23"(#loc62))
+#loc198 = loc("tmp24"(#loc63))
+#loc199 = loc("tmp25"(#loc64))
+#loc200 = loc("tmp25"(#loc65))
+#loc201 = loc("tmp25"(#loc66))
+#loc202 = loc("tmp27"(#loc67))
+#loc203 = loc("tmp29"(#loc68))
+#loc204 = loc("tmp31"(#loc69))
+#loc205 = loc("tmp32"(#loc70))
+#loc206 = loc("tmp35"(#loc71))
+#loc207 = loc("tmp35"(#loc72))
+#loc208 = loc("tmp35"(#loc73))
+#loc209 = loc("tmp35"(#loc74))
+#loc210 = loc("tmp35"(#loc75))
+#loc211 = loc("tmp35"(#loc76))
+#loc212 = loc("tmp42"(#loc77))
+#loc213 = loc("tmp43"(#loc78))
+#loc214 = loc("tmp43"(#loc79))
+#loc215 = loc("tmp43"(#loc80))
+#loc216 = loc("tmp45"(#loc81))
+#loc217 = loc("tmp48"(#loc82))
+#loc218 = loc("tmp49"(#loc83))
+#loc219 = loc("tmp57"(#loc84))
+#loc220 = loc("tmp60"(#loc85))
+#loc221 = loc("tmp64"(#loc86))
+#loc222 = loc("tmp67"(#loc87))
+#loc223 = loc("tmp68"(#loc88))
+#loc224 = loc("tmp70"(#loc89))
+#loc225 = loc("tmp70"(#loc90))
+#loc226 = loc("tmp70"(#loc91))
+#loc227 = loc("tmp70"(#loc92))
+#loc228 = loc("tmp70"(#loc93))
+#loc229 = loc("tmp70"(#loc94))
+#loc230 = loc("tmp72"(#loc95))
+#loc231 = loc("tmp73"(#loc96))
+#loc232 = loc("tmp74"(#loc97))
+#loc233 = loc("tmp75"(#loc98))
+#loc234 = loc("tmp76"(#loc99))
+#loc235 = loc("tmp76"(#loc100))
+#loc236 = loc("tmp76"(#loc101))
+#loc237 = loc("tmp78"(#loc102))
+#loc238 = loc("tmp80"(#loc103))
+#loc239 = loc("tmp83"(#loc104))
+#loc240 = loc("tmp83"(#loc105))
+#loc241 = loc("tmp83"(#loc106))
+#loc242 = loc("tmp83"(#loc107))
+#loc243 = loc("tmp83"(#loc108))
+#loc244 = loc("tmp83"(#loc109))
+#loc245 = loc("tmp88"(#loc110))
+#loc246 = loc("tmp89"(#loc111))
+#loc247 = loc("tmp89"(#loc112))
+#loc248 = loc("tmp89"(#loc113))
+#loc249 = loc("tmp91"(#loc114))
+#loc250 = loc("tmp94"(#loc115))
+#loc251 = loc("tmp95"(#loc116))
+#loc252 = loc("tmp82"(#loc117))
+#loc253 = loc("tmp101"(#loc118))
+#loc254 = loc("tmp104"(#loc119))
+#loc255 = loc("tmp107"(#loc120))
+#loc256 = loc("tmp109"(#loc121))
+#loc257 = loc("tmp110"(#loc122))
+#loc258 = loc(callsite(#loc29 at #loc166))
+#loc260 = loc(callsite(#loc29 at #loc168))
+#loc262 = loc(fused[#loc218, #loc204])
+#loc263 = loc(fused[#loc251, #loc252])
+#loc264 = loc(callsite(#loc31 at #loc258))
+#loc265 = loc(callsite(#loc31 at #loc260))
diff --git a/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..c72b2af97edc08cdcc9a95b74c00153c29f759d2
--- /dev/null
+++ b/triton/646WRNO7FMJYGKK66FHZ7JPPW6YUT5P5V6RPZAVEK2TTH5B43JCA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir
@@ -0,0 +1,457 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc1 = loc(unknown)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc132 = loc("in_out_ptr0"(#loc))
+#loc133 = loc("in_out_ptr1"(#loc))
+#loc134 = loc("in_ptr0"(#loc))
+#loc135 = loc("in_ptr1"(#loc))
+#loc136 = loc("in_ptr2"(#loc))
+#loc137 = loc("in_ptr3"(#loc))
+#loc138 = loc("in_ptr4"(#loc))
+#loc139 = loc("xnumel"(#loc))
+#loc140 = loc("r0_numel"(#loc))
+#loc170 = loc("tmp4"(#loc32))
+#loc172 = loc("tmp10"(#loc35))
+#loc263 = loc(callsite(#loc1 at #loc170))
+#loc265 = loc(callsite(#loc1 at #loc172))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<1x128xbf16> loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<8x128xbf16> loc(#loc1)
+    %cst_1 = arith.constant dense<4097> : tensor<1x128xi32> loc(#loc1)
+    %cst_2 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc1)
+    %cst_3 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc1)
+    %cst_5 = arith.constant dense<1> : tensor<1x128xi64> loc(#loc1)
+    %cst_6 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc1)
+    %cst_7 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc1)
+    %cst_8 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc1)
+    %cst_9 = arith.constant dense<4096> : tensor<1x128xi32> loc(#loc1)
+    %cst_10 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc1)
+    %cst_11 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc1)
+    %cst_12 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc1)
+    %c8_i32 = arith.constant 8 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc141)
+    %xoffset_13 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc142)
+    %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc143)
+    %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc144)
+    %xindex_15 = tt.splat %xoffset_13 : i32 -> tensor<8x1xi32> loc(#loc145)
+    %xindex_16 = arith.addi %xindex_15, %xindex_14 : tensor<8x1xi32> loc(#loc145)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc146)
+    %r0_base_17 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc147)
+    %x0 = arith.remsi %xindex_16, %cst_12 : tensor<8x1xi32> loc(#loc148)
+    %x1 = arith.divsi %xindex_16, %cst_12 : tensor<8x1xi32> loc(#loc149)
+    %r0_mask = arith.cmpi slt, %r0_base_17, %cst_10 : tensor<1x128xi32> loc(#loc150)
+    %tmp0 = arith.addi %r0_base_17, %cst_9 : tensor<1x128xi32> loc(#loc151)
+    %tmp0_18 = arith.muli %x0, %cst_8 : tensor<8x1xi32> loc(#loc152)
+    %tmp0_19 = tt.broadcast %tmp0 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc153)
+    %tmp0_20 = tt.broadcast %tmp0_18 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc153)
+    %tmp0_21 = arith.addi %tmp0_19, %tmp0_20 : tensor<8x128xi32> loc(#loc153)
+    %tmp0_22 = arith.muli %x1, %cst_7 : tensor<8x1xi32> loc(#loc154)
+    %tmp0_23 = tt.broadcast %tmp0_22 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc155)
+    %tmp0_24 = arith.addi %tmp0_21, %tmp0_23 : tensor<8x128xi32> loc(#loc155)
+    %tmp0_25 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc156)
+    %tmp0_26 = tt.addptr %tmp0_25, %tmp0_24 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc156)
+    %tmp0_27 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc157)
+    %tmp0_28 = tt.load %tmp0_26, %tmp0_27, %cst_0 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc157)
+    %tmp0_29 = arith.extf %tmp0_28 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc158)
+    %tmp6 = tt.broadcast %r0_base_17 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc159)
+    %tmp6_30 = arith.addi %tmp6, %tmp0_20 : tensor<8x128xi32> loc(#loc159)
+    %tmp6_31 = arith.addi %tmp6_30, %tmp0_23 : tensor<8x128xi32> loc(#loc160)
+    %tmp6_32 = tt.addptr %tmp0_25, %tmp6_31 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc161)
+    %tmp6_33 = tt.load %tmp6_32, %tmp0_27, %cst_0 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc162)
+    %tmp6_34 = arith.extf %tmp6_33 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc163)
+    %tmp2 = arith.mulf %tmp0_29, %tmp0_29 : tensor<8x128xf32> loc(#loc164)
+    %tmp5 = arith.addf %tmp2, %cst_11 : tensor<8x128xf32> loc(#loc165)
+    %_tmp4 = arith.select %tmp0_27, %tmp5, %cst_11 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc166)
+    %tmp8 = arith.mulf %tmp6_34, %tmp6_34 : tensor<8x128xf32> loc(#loc167)
+    %tmp11 = arith.addf %tmp8, %cst_11 : tensor<8x128xf32> loc(#loc168)
+    %_tmp10 = arith.select %tmp0_27, %tmp11, %cst_11 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc169)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_98: f32 loc(callsite(#loc1 at #loc170)), %tmp4_99: f32 loc(callsite(#loc1 at #loc170))):
+      %tmp4_100 = arith.addf %tmp4_98, %tmp4_99 : f32 loc(#loc266)
+      tt.reduce.return %tmp4_100 : f32 loc(#loc262)
+    }) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc262)
+    %tmp4_35 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc171)
+    %tmp10 = "tt.reduce"(%_tmp10) <{axis = 1 : i32}> ({
+    ^bb0(%tmp10_98: f32 loc(callsite(#loc1 at #loc172)), %tmp10_99: f32 loc(callsite(#loc1 at #loc172))):
+      %tmp10_100 = arith.addf %tmp10_98, %tmp10_99 : f32 loc(#loc267)
+      tt.reduce.return %tmp10_100 : f32 loc(#loc264)
+    }) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc264)
+    %tmp10_36 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc173)
+    %r0_3 = arith.remsi %r0_base_17, %cst_6 : tensor<1x128xi32> loc(#loc174)
+    %r0_4 = arith.divsi %r0_base_17, %cst_6 : tensor<1x128xi32> loc(#loc175)
+    %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>> loc(#loc176)
+    %tmp58_37 = tt.addptr %tmp58, %r0_base_17 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc176)
+    %tmp58_38 = tt.load %tmp58_37, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x128x!tt.ptr<bf16>> loc(#loc177)
+    %tmp58_39 = arith.extf %tmp58_38 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc178)
+    %tmp63 = arith.muli %x1, %cst_8 : tensor<8x1xi32> loc(#loc179)
+    %tmp63_40 = tt.broadcast %tmp63 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc180)
+    %tmp63_41 = arith.addi %tmp6, %tmp63_40 : tensor<8x128xi32> loc(#loc180)
+    %tmp63_42 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<8x128x!tt.ptr<f32>> loc(#loc181)
+    %tmp63_43 = tt.addptr %tmp63_42, %tmp63_41 : tensor<8x128x!tt.ptr<f32>>, tensor<8x128xi32> loc(#loc181)
+    %tmp63_44 = tt.load %tmp63_43, %tmp0_27, %cst_11 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<f32>> loc(#loc182)
+    %tmp66 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<8x128x!tt.ptr<f32>> loc(#loc183)
+    %tmp66_45 = tt.addptr %tmp66, %tmp63_41 : tensor<8x128x!tt.ptr<f32>>, tensor<8x128xi32> loc(#loc183)
+    %tmp66_46 = tt.load %tmp66_45, %tmp0_27, %cst_11 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<f32>> loc(#loc184)
+    %tmp96 = tt.load %tmp0_26, %tmp0_27, %cst_0 evictionPolicy = evict_first : tensor<8x128x!tt.ptr<bf16>> loc(#loc185)
+    %tmp96_47 = arith.extf %tmp96 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc186)
+    %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>> loc(#loc187)
+    %tmp102_48 = tt.addptr %tmp102, %r0_base_17 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc187)
+    %tmp102_49 = tt.load %tmp102_48, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x128x!tt.ptr<bf16>> loc(#loc188)
+    %tmp102_50 = arith.extf %tmp102_49 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc189)
+    %tmp16 = arith.extsi %r0_3 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc190)
+    %tmp16_51 = arith.cmpi slt, %tmp16, %cst_5 : tensor<1x128xi64> loc(#loc190)
+    %tmp17 = arith.muli %r0_4, %cst_6 : tensor<1x128xi32> loc(#loc191)
+    %tmp17_52 = arith.addi %tmp17, %cst_4 : tensor<1x128xi32> loc(#loc192)
+    %tmp17_53 = tt.broadcast %tmp17_52 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc193)
+    %tmp17_54 = arith.addi %tmp17_53, %tmp0_20 : tensor<8x128xi32> loc(#loc193)
+    %tmp17_55 = arith.addi %tmp17_54, %tmp0_23 : tensor<8x128xi32> loc(#loc194)
+    %tmp17_56 = tt.addptr %tmp0_25, %tmp17_55 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc195)
+    %tmp17_57 = arith.andi %r0_mask, %tmp16_51 : tensor<1x128xi1> loc(#loc196)
+    %tmp17_58 = tt.broadcast %tmp17_57 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc197)
+    %tmp17_59 = tt.load %tmp17_56, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc197)
+    %tmp17_60 = arith.extf %tmp17_59 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc198)
+    %tmp20 = arith.divf %tmp10_36, %cst_3 : tensor<8x1xf32> loc(#loc199)
+    %tmp22 = arith.addf %tmp20, %cst_2 : tensor<8x1xf32> loc(#loc200)
+    %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc201)
+    %tmp24 = tt.broadcast %tmp23 : tensor<8x1xf32> -> tensor<8x128xf32> loc(#loc202)
+    %tmp24_61 = arith.mulf %tmp17_60, %tmp24 : tensor<8x128xf32> loc(#loc202)
+    %tmp25 = tt.addptr %tmp58, %tmp17_52 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc203)
+    %tmp25_62 = tt.broadcast %tmp25 : tensor<1x128x!tt.ptr<bf16>> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc203)
+    %tmp25_63 = tt.load %tmp25_62, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc204)
+    %tmp25_64 = arith.extf %tmp25_63 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc205)
+    %tmp27 = arith.mulf %tmp24_61, %tmp25_64 : tensor<8x128xf32> loc(#loc206)
+    %tmp29 = arith.subf %cst_11, %tmp27 : tensor<8x128xf32> loc(#loc207)
+    %tmp31 = tt.broadcast %tmp16_51 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc208)
+    %tmp31_65 = arith.select %tmp31, %tmp29, %cst_11 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc208)
+    %tmp32 = arith.cmpi sge, %tmp16, %cst_5 : tensor<1x128xi64> loc(#loc209)
+    %tmp35 = tt.broadcast %tmp17 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc210)
+    %tmp35_66 = arith.addi %tmp35, %tmp0_20 : tensor<8x128xi32> loc(#loc210)
+    %tmp35_67 = arith.addi %tmp35_66, %tmp0_23 : tensor<8x128xi32> loc(#loc211)
+    %tmp35_68 = tt.addptr %tmp0_25, %tmp35_67 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc212)
+    %tmp35_69 = arith.andi %r0_mask, %tmp32 : tensor<1x128xi1> loc(#loc213)
+    %tmp35_70 = tt.broadcast %tmp35_69 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc214)
+    %tmp35_71 = tt.load %tmp35_68, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc214)
+    %tmp35_72 = arith.extf %tmp35_71 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc215)
+    %tmp42 = arith.mulf %tmp35_72, %tmp24 : tensor<8x128xf32> loc(#loc216)
+    %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc217)
+    %tmp43_73 = tt.broadcast %tmp43 : tensor<1x128x!tt.ptr<bf16>> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc217)
+    %tmp43_74 = tt.load %tmp43_73, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc218)
+    %tmp43_75 = arith.extf %tmp43_74 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc219)
+    %tmp45 = arith.mulf %tmp42, %tmp43_75 : tensor<8x128xf32> loc(#loc220)
+    %tmp48 = tt.broadcast %tmp32 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc221)
+    %tmp48_76 = arith.select %tmp48, %tmp45, %cst_11 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc221)
+    %tmp49 = arith.select %tmp31, %tmp31_65, %tmp48_76 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc222)
+    %tmp57 = arith.mulf %tmp6_34, %tmp24 : tensor<8x128xf32> loc(#loc223)
+    %tmp60 = tt.broadcast %tmp58_39 : tensor<1x128xf32> -> tensor<8x128xf32> loc(#loc224)
+    %tmp60_77 = arith.mulf %tmp57, %tmp60 : tensor<8x128xf32> loc(#loc224)
+    %tmp64 = arith.mulf %tmp60_77, %tmp63_44 : tensor<8x128xf32> loc(#loc225)
+    %tmp67 = arith.mulf %tmp49, %tmp66_46 : tensor<8x128xf32> loc(#loc226)
+    %tmp68 = arith.addf %tmp64, %tmp67 : tensor<8x128xf32> loc(#loc227)
+    %tmp70 = arith.addi %tmp17, %cst_1 : tensor<1x128xi32> loc(#loc228)
+    %tmp70_78 = tt.broadcast %tmp70 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc229)
+    %tmp70_79 = arith.addi %tmp70_78, %tmp0_20 : tensor<8x128xi32> loc(#loc229)
+    %tmp70_80 = arith.addi %tmp70_79, %tmp0_23 : tensor<8x128xi32> loc(#loc230)
+    %tmp70_81 = tt.addptr %tmp0_25, %tmp70_80 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc231)
+    %tmp70_82 = tt.load %tmp70_81, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc232)
+    %tmp70_83 = arith.extf %tmp70_82 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc233)
+    %tmp72 = arith.divf %tmp4_35, %cst_3 : tensor<8x1xf32> loc(#loc234)
+    %tmp73 = arith.addf %tmp72, %cst_2 : tensor<8x1xf32> loc(#loc235)
+    %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc236)
+    %tmp75 = tt.broadcast %tmp74 : tensor<8x1xf32> -> tensor<8x128xf32> loc(#loc237)
+    %tmp75_84 = arith.mulf %tmp70_83, %tmp75 : tensor<8x128xf32> loc(#loc237)
+    %tmp76 = tt.addptr %tmp102, %tmp17_52 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc238)
+    %tmp76_85 = tt.broadcast %tmp76 : tensor<1x128x!tt.ptr<bf16>> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc238)
+    %tmp76_86 = tt.load %tmp76_85, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc239)
+    %tmp76_87 = arith.extf %tmp76_86 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc240)
+    %tmp78 = arith.mulf %tmp75_84, %tmp76_87 : tensor<8x128xf32> loc(#loc241)
+    %tmp80 = arith.subf %cst_11, %tmp78 : tensor<8x128xf32> loc(#loc242)
+    %tmp82 = arith.select %tmp31, %tmp80, %cst_11 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc243)
+    %tmp83 = arith.addi %tmp17, %cst_9 : tensor<1x128xi32> loc(#loc244)
+    %tmp83_88 = tt.broadcast %tmp83 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc245)
+    %tmp83_89 = arith.addi %tmp83_88, %tmp0_20 : tensor<8x128xi32> loc(#loc245)
+    %tmp83_90 = arith.addi %tmp83_89, %tmp0_23 : tensor<8x128xi32> loc(#loc246)
+    %tmp83_91 = tt.addptr %tmp0_25, %tmp83_90 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc247)
+    %tmp83_92 = tt.load %tmp83_91, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc248)
+    %tmp83_93 = arith.extf %tmp83_92 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc249)
+    %tmp88 = arith.mulf %tmp83_93, %tmp75 : tensor<8x128xf32> loc(#loc250)
+    %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc251)
+    %tmp89_94 = tt.broadcast %tmp89 : tensor<1x128x!tt.ptr<bf16>> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc251)
+    %tmp89_95 = tt.load %tmp89_94, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc252)
+    %tmp89_96 = arith.extf %tmp89_95 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc253)
+    %tmp91 = arith.mulf %tmp88, %tmp89_96 : tensor<8x128xf32> loc(#loc254)
+    %tmp94 = arith.select %tmp48, %tmp91, %cst_11 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc255)
+    %tmp95 = arith.select %tmp31, %tmp82, %tmp94 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc256)
+    %tmp101 = arith.mulf %tmp96_47, %tmp75 : tensor<8x128xf32> loc(#loc257)
+    %tmp104 = tt.broadcast %tmp102_50 : tensor<1x128xf32> -> tensor<8x128xf32> loc(#loc258)
+    %tmp104_97 = arith.mulf %tmp101, %tmp104 : tensor<8x128xf32> loc(#loc258)
+    %tmp107 = arith.mulf %tmp104_97, %tmp63_44 : tensor<8x128xf32> loc(#loc259)
+    %tmp109 = arith.mulf %tmp95, %tmp66_46 : tensor<8x128xf32> loc(#loc260)
+    %tmp110 = arith.addf %tmp107, %tmp109 : tensor<8x128xf32> loc(#loc261)
+    %0 = arith.muli %xindex_16, %cst_8 : tensor<8x1xi32> loc(#loc125)
+    %1 = tt.broadcast %0 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc126)
+    %2 = arith.addi %tmp6, %1 : tensor<8x128xi32> loc(#loc126)
+    %3 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc127)
+    %4 = tt.addptr %3, %2 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc127)
+    %5 = arith.truncf %tmp68 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc128)
+    tt.store %4, %5, %tmp0_27 : tensor<8x128x!tt.ptr<bf16>> loc(#loc128)
+    %6 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc129)
+    %7 = tt.addptr %6, %2 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc129)
+    %8 = arith.truncf %tmp110 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc130)
+    tt.store %7, %8, %tmp0_27 : tensor<8x128x!tt.ptr<bf16>> loc(#loc130)
+    tt.return loc(#loc131)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc31 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc141 = loc("xoffset"(#loc2))
+#loc142 = loc("xoffset"(#loc3))
+#loc143 = loc("xindex"(#loc4))
+#loc144 = loc("xindex"(#loc5))
+#loc145 = loc("xindex"(#loc6))
+#loc146 = loc("r0_base"(#loc7))
+#loc147 = loc("r0_base"(#loc8))
+#loc148 = loc("x0"(#loc9))
+#loc149 = loc("x1"(#loc10))
+#loc150 = loc("r0_mask"(#loc11))
+#loc151 = loc("tmp0"(#loc12))
+#loc152 = loc("tmp0"(#loc13))
+#loc153 = loc("tmp0"(#loc14))
+#loc154 = loc("tmp0"(#loc15))
+#loc155 = loc("tmp0"(#loc16))
+#loc156 = loc("tmp0"(#loc17))
+#loc157 = loc("tmp0"(#loc18))
+#loc158 = loc("tmp0"(#loc19))
+#loc159 = loc("tmp6"(#loc20))
+#loc160 = loc("tmp6"(#loc21))
+#loc161 = loc("tmp6"(#loc22))
+#loc162 = loc("tmp6"(#loc23))
+#loc163 = loc("tmp6"(#loc24))
+#loc164 = loc("tmp2"(#loc25))
+#loc165 = loc("tmp5"(#loc26))
+#loc166 = loc("_tmp4"(#loc27))
+#loc167 = loc("tmp8"(#loc28))
+#loc168 = loc("tmp11"(#loc29))
+#loc169 = loc("_tmp10"(#loc30))
+#loc171 = loc("tmp4"(#loc34))
+#loc173 = loc("tmp10"(#loc36))
+#loc174 = loc("r0_3"(#loc37))
+#loc175 = loc("r0_4"(#loc38))
+#loc176 = loc("tmp58"(#loc39))
+#loc177 = loc("tmp58"(#loc40))
+#loc178 = loc("tmp58"(#loc41))
+#loc179 = loc("tmp63"(#loc42))
+#loc180 = loc("tmp63"(#loc43))
+#loc181 = loc("tmp63"(#loc44))
+#loc182 = loc("tmp63"(#loc45))
+#loc183 = loc("tmp66"(#loc46))
+#loc184 = loc("tmp66"(#loc47))
+#loc185 = loc("tmp96"(#loc48))
+#loc186 = loc("tmp96"(#loc49))
+#loc187 = loc("tmp102"(#loc50))
+#loc188 = loc("tmp102"(#loc51))
+#loc189 = loc("tmp102"(#loc52))
+#loc190 = loc("tmp16"(#loc53))
+#loc191 = loc("tmp17"(#loc54))
+#loc192 = loc("tmp17"(#loc55))
+#loc193 = loc("tmp17"(#loc56))
+#loc194 = loc("tmp17"(#loc57))
+#loc195 = loc("tmp17"(#loc58))
+#loc196 = loc("tmp17"(#loc59))
+#loc197 = loc("tmp17"(#loc60))
+#loc198 = loc("tmp17"(#loc61))
+#loc199 = loc("tmp20"(#loc62))
+#loc200 = loc("tmp22"(#loc63))
+#loc201 = loc("tmp23"(#loc64))
+#loc202 = loc("tmp24"(#loc65))
+#loc203 = loc("tmp25"(#loc66))
+#loc204 = loc("tmp25"(#loc67))
+#loc205 = loc("tmp25"(#loc68))
+#loc206 = loc("tmp27"(#loc69))
+#loc207 = loc("tmp29"(#loc70))
+#loc208 = loc("tmp31"(#loc71))
+#loc209 = loc("tmp32"(#loc72))
+#loc210 = loc("tmp35"(#loc73))
+#loc211 = loc("tmp35"(#loc74))
+#loc212 = loc("tmp35"(#loc75))
+#loc213 = loc("tmp35"(#loc76))
+#loc214 = loc("tmp35"(#loc77))
+#loc215 = loc("tmp35"(#loc78))
+#loc216 = loc("tmp42"(#loc79))
+#loc217 = loc("tmp43"(#loc80))
+#loc218 = loc("tmp43"(#loc81))
+#loc219 = loc("tmp43"(#loc82))
+#loc220 = loc("tmp45"(#loc83))
+#loc221 = loc("tmp48"(#loc84))
+#loc222 = loc("tmp49"(#loc85))
+#loc223 = loc("tmp57"(#loc86))
+#loc224 = loc("tmp60"(#loc87))
+#loc225 = loc("tmp64"(#loc88))
+#loc226 = loc("tmp67"(#loc89))
+#loc227 = loc("tmp68"(#loc90))
+#loc228 = loc("tmp70"(#loc91))
+#loc229 = loc("tmp70"(#loc92))
+#loc230 = loc("tmp70"(#loc93))
+#loc231 = loc("tmp70"(#loc94))
+#loc232 = loc("tmp70"(#loc95))
+#loc233 = loc("tmp70"(#loc96))
+#loc234 = loc("tmp72"(#loc97))
+#loc235 = loc("tmp73"(#loc98))
+#loc236 = loc("tmp74"(#loc99))
+#loc237 = loc("tmp75"(#loc100))
+#loc238 = loc("tmp76"(#loc101))
+#loc239 = loc("tmp76"(#loc102))
+#loc240 = loc("tmp76"(#loc103))
+#loc241 = loc("tmp78"(#loc104))
+#loc242 = loc("tmp80"(#loc105))
+#loc243 = loc("tmp82"(#loc106))
+#loc244 = loc("tmp83"(#loc107))
+#loc245 = loc("tmp83"(#loc108))
+#loc246 = loc("tmp83"(#loc109))
+#loc247 = loc("tmp83"(#loc110))
+#loc248 = loc("tmp83"(#loc111))
+#loc249 = loc("tmp83"(#loc112))
+#loc250 = loc("tmp88"(#loc113))
+#loc251 = loc("tmp89"(#loc114))
+#loc252 = loc("tmp89"(#loc115))
+#loc253 = loc("tmp89"(#loc116))
+#loc254 = loc("tmp91"(#loc117))
+#loc255 = loc("tmp94"(#loc118))
+#loc256 = loc("tmp95"(#loc119))
+#loc257 = loc("tmp101"(#loc120))
+#loc258 = loc("tmp104"(#loc121))
+#loc259 = loc("tmp107"(#loc122))
+#loc260 = loc("tmp109"(#loc123))
+#loc261 = loc("tmp110"(#loc124))
+#loc262 = loc(callsite(#loc31 at #loc170))
+#loc264 = loc(callsite(#loc31 at #loc172))
+#loc266 = loc(callsite(#loc33 at #loc262))
+#loc267 = loc(callsite(#loc33 at #loc264))
diff --git a/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/__grp__triton_poi_fused_mul_silu_split_0.json b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/__grp__triton_poi_fused_mul_silu_split_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..8cb5fbe038a5186e47a489e40bb8c19b5ea19bb2
--- /dev/null
+++ b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/__grp__triton_poi_fused_mul_silu_split_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused_mul_silu_split_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.source", "triton_poi_fused_mul_silu_split_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.ttir", "triton_poi_fused_mul_silu_split_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.ttgir", "triton_poi_fused_mul_silu_split_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.llir", "triton_poi_fused_mul_silu_split_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.ptx", "triton_poi_fused_mul_silu_split_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.cubin", "triton_poi_fused_mul_silu_split_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.json"}}
\ No newline at end of file
diff --git a/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.cubin b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..3519c5b9375dd04c14d6f407d6979d21a342579a
Binary files /dev/null and b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.cubin differ
diff --git a/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.json b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..02f1bc3c82364cc1aec481190a7930b0c283a089
--- /dev/null
+++ b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.json
@@ -0,0 +1 @@
+{"hash": "f0dfcefe16760783e7f30461c72de35988a92d8dad09b3695094292ac5bf286a", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_mul_silu_split_0"}
\ No newline at end of file
diff --git a/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.llir b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..749bdf56e71c5fab670af172c19630702b230ada
--- /dev/null
+++ b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.llir
@@ -0,0 +1,176 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused_mul_silu_split_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 {
+  %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %7 = shl i32 %6, 10, !dbg !8
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %9 = shl nuw nsw i32 %8, 3, !dbg !9
+  %10 = and i32 %9, 1016, !dbg !9
+  %11 = or disjoint i32 %10, %7, !dbg !10
+  %12 = srem i32 %11, 12288, !dbg !11
+  %13 = sub nsw i32 %11, %12, !dbg !11
+  %14 = add i32 %13, %11, !dbg !11
+  %15 = sext i32 %14 to i64, !dbg !12
+  %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !12
+  %17 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %16) #3, !dbg !13
+  %18 = extractvalue { i32, i32, i32, i32 } %17, 0, !dbg !13
+  %19 = bitcast i32 %18 to <2 x bfloat>, !dbg !13
+  %20 = extractvalue { i32, i32, i32, i32 } %17, 1, !dbg !13
+  %21 = bitcast i32 %20 to <2 x bfloat>, !dbg !13
+  %22 = extractvalue { i32, i32, i32, i32 } %17, 2, !dbg !13
+  %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !13
+  %24 = extractvalue { i32, i32, i32, i32 } %17, 3, !dbg !13
+  %25 = bitcast i32 %24 to <2 x bfloat>, !dbg !13
+  %26 = add i32 %14, 12288, !dbg !14
+  %27 = sext i32 %26 to i64, !dbg !15
+  %28 = getelementptr bfloat, ptr addrspace(1) %0, i64 %27, !dbg !15
+  %29 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %28) #3, !dbg !16
+  %30 = extractvalue { i32, i32, i32, i32 } %29, 0, !dbg !16
+  %31 = bitcast i32 %30 to <2 x bfloat>, !dbg !16
+  %32 = extractvalue { i32, i32, i32, i32 } %29, 1, !dbg !16
+  %33 = bitcast i32 %32 to <2 x bfloat>, !dbg !16
+  %34 = extractvalue { i32, i32, i32, i32 } %29, 2, !dbg !16
+  %35 = bitcast i32 %34 to <2 x bfloat>, !dbg !16
+  %36 = extractvalue { i32, i32, i32, i32 } %29, 3, !dbg !16
+  %37 = bitcast i32 %36 to <2 x bfloat>, !dbg !16
+  %38 = sext i32 %11 to i64, !dbg !17
+  %39 = getelementptr bfloat, ptr addrspace(1) %1, i64 %38, !dbg !17
+  %40 = fpext <2 x bfloat> %19 to <2 x float>, !dbg !18
+  %41 = fpext <2 x bfloat> %31 to <2 x float>, !dbg !19
+  %42 = extractelement <2 x float> %40, i64 0, !dbg !20
+  %43 = fsub float 0.000000e+00, %42, !dbg !20
+  %44 = extractelement <2 x float> %40, i64 1, !dbg !20
+  %45 = fsub float 0.000000e+00, %44, !dbg !20
+  %46 = fmul float %43, 0x3FF7154760000000, !dbg !25
+  %47 = tail call float @llvm.nvvm.ex2.approx.f(float %46), !dbg !25
+  %48 = fmul float %45, 0x3FF7154760000000, !dbg !25
+  %49 = tail call float @llvm.nvvm.ex2.approx.f(float %48), !dbg !25
+  %50 = fadd float %47, 1.000000e+00, !dbg !26
+  %51 = fadd float %49, 1.000000e+00, !dbg !26
+  %52 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %50), !dbg !27
+  %53 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %51), !dbg !27
+  %54 = insertelement <2 x float> poison, float %52, i64 0, !dbg !28
+  %55 = insertelement <2 x float> %54, float %53, i64 1, !dbg !28
+  %56 = fmul <2 x float> %55, %40, !dbg !28
+  %57 = fmul <2 x float> %56, %41, !dbg !29
+  %58 = fptrunc <2 x float> %57 to <2 x bfloat>, !dbg !30
+  %59 = fpext <2 x bfloat> %21 to <2 x float>, !dbg !18
+  %60 = fpext <2 x bfloat> %33 to <2 x float>, !dbg !19
+  %61 = extractelement <2 x float> %59, i64 0, !dbg !20
+  %62 = fsub float 0.000000e+00, %61, !dbg !20
+  %63 = extractelement <2 x float> %59, i64 1, !dbg !20
+  %64 = fsub float 0.000000e+00, %63, !dbg !20
+  %65 = fmul float %62, 0x3FF7154760000000, !dbg !25
+  %66 = tail call float @llvm.nvvm.ex2.approx.f(float %65), !dbg !25
+  %67 = fmul float %64, 0x3FF7154760000000, !dbg !25
+  %68 = tail call float @llvm.nvvm.ex2.approx.f(float %67), !dbg !25
+  %69 = fadd float %66, 1.000000e+00, !dbg !26
+  %70 = fadd float %68, 1.000000e+00, !dbg !26
+  %71 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %69), !dbg !27
+  %72 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %70), !dbg !27
+  %73 = insertelement <2 x float> poison, float %71, i64 0, !dbg !28
+  %74 = insertelement <2 x float> %73, float %72, i64 1, !dbg !28
+  %75 = fmul <2 x float> %74, %59, !dbg !28
+  %76 = fmul <2 x float> %75, %60, !dbg !29
+  %77 = fptrunc <2 x float> %76 to <2 x bfloat>, !dbg !30
+  %78 = fpext <2 x bfloat> %23 to <2 x float>, !dbg !18
+  %79 = fpext <2 x bfloat> %35 to <2 x float>, !dbg !19
+  %80 = extractelement <2 x float> %78, i64 0, !dbg !20
+  %81 = fsub float 0.000000e+00, %80, !dbg !20
+  %82 = extractelement <2 x float> %78, i64 1, !dbg !20
+  %83 = fsub float 0.000000e+00, %82, !dbg !20
+  %84 = fmul float %81, 0x3FF7154760000000, !dbg !25
+  %85 = tail call float @llvm.nvvm.ex2.approx.f(float %84), !dbg !25
+  %86 = fmul float %83, 0x3FF7154760000000, !dbg !25
+  %87 = tail call float @llvm.nvvm.ex2.approx.f(float %86), !dbg !25
+  %88 = fadd float %85, 1.000000e+00, !dbg !26
+  %89 = fadd float %87, 1.000000e+00, !dbg !26
+  %90 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %88), !dbg !27
+  %91 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %89), !dbg !27
+  %92 = insertelement <2 x float> poison, float %90, i64 0, !dbg !28
+  %93 = insertelement <2 x float> %92, float %91, i64 1, !dbg !28
+  %94 = fmul <2 x float> %93, %78, !dbg !28
+  %95 = fmul <2 x float> %94, %79, !dbg !29
+  %96 = fptrunc <2 x float> %95 to <2 x bfloat>, !dbg !30
+  %97 = fpext <2 x bfloat> %25 to <2 x float>, !dbg !18
+  %98 = fpext <2 x bfloat> %37 to <2 x float>, !dbg !19
+  %99 = extractelement <2 x float> %97, i64 0, !dbg !20
+  %100 = fsub float 0.000000e+00, %99, !dbg !20
+  %101 = extractelement <2 x float> %97, i64 1, !dbg !20
+  %102 = fsub float 0.000000e+00, %101, !dbg !20
+  %103 = fmul float %100, 0x3FF7154760000000, !dbg !25
+  %104 = tail call float @llvm.nvvm.ex2.approx.f(float %103), !dbg !25
+  %105 = fmul float %102, 0x3FF7154760000000, !dbg !25
+  %106 = tail call float @llvm.nvvm.ex2.approx.f(float %105), !dbg !25
+  %107 = fadd float %104, 1.000000e+00, !dbg !26
+  %108 = fadd float %106, 1.000000e+00, !dbg !26
+  %109 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %107), !dbg !27
+  %110 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %108), !dbg !27
+  %111 = insertelement <2 x float> poison, float %109, i64 0, !dbg !28
+  %112 = insertelement <2 x float> %111, float %110, i64 1, !dbg !28
+  %113 = fmul <2 x float> %112, %97, !dbg !28
+  %114 = fmul <2 x float> %113, %98, !dbg !29
+  %115 = fptrunc <2 x float> %114 to <2 x bfloat>, !dbg !30
+  %116 = bitcast <2 x bfloat> %58 to i32, !dbg !30
+  %117 = bitcast <2 x bfloat> %77 to i32, !dbg !30
+  %118 = bitcast <2 x bfloat> %96 to i32, !dbg !30
+  %119 = bitcast <2 x bfloat> %115 to i32, !dbg !30
+  tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %116, i32 %117, i32 %118, i32 %119, ptr addrspace(1) %39) #3, !dbg !30
+  ret void, !dbg !31
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.ex2.approx.f(float) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #2
+
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #3 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused_mul_silu_split_0", linkageName: "triton_poi_fused_mul_silu_split_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 20, column: 28, scope: !4)
+!8 = !DILocation(line: 20, column: 33, scope: !4)
+!9 = !DILocation(line: 21, column: 36, scope: !4)
+!10 = !DILocation(line: 21, column: 23, scope: !4)
+!11 = !DILocation(line: 26, column: 35, scope: !4)
+!12 = !DILocation(line: 26, column: 30, scope: !4)
+!13 = !DILocation(line: 26, column: 46, scope: !4)
+!14 = !DILocation(line: 27, column: 43, scope: !4)
+!15 = !DILocation(line: 27, column: 30, scope: !4)
+!16 = !DILocation(line: 27, column: 54, scope: !4)
+!17 = !DILocation(line: 33, column: 25, scope: !4)
+!18 = !DILocation(line: 26, column: 55, scope: !4)
+!19 = !DILocation(line: 27, column: 63, scope: !4)
+!20 = !DILocation(line: 50, column: 30, scope: !21, inlinedAt: !23)
+!21 = distinct !DILexicalBlockFile(scope: !4, file: !22, discriminator: 0)
+!22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!23 = !DILocation(line: 29, column: 22, scope: !24)
+!24 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0)
+!25 = !DILocation(line: 50, column: 29, scope: !21, inlinedAt: !23)
+!26 = !DILocation(line: 50, column: 20, scope: !21, inlinedAt: !23)
+!27 = !DILocation(line: 50, column: 16, scope: !21, inlinedAt: !23)
+!28 = !DILocation(line: 30, column: 18, scope: !4)
+!29 = !DILocation(line: 32, column: 18, scope: !4)
+!30 = !DILocation(line: 33, column: 36, scope: !4)
+!31 = !DILocation(line: 33, column: 4, scope: !4)
diff --git a/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.ptx b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..05aaa9b7dd33c30603eeeee1bfce35c5cc1814f1
--- /dev/null
+++ b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.ptx
@@ -0,0 +1,539 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused_mul_silu_split_0 // -- Begin function triton_poi_fused_mul_silu_split_0
+                                        // @triton_poi_fused_mul_silu_split_0
+.visible .entry triton_poi_fused_mul_silu_split_0(
+	.param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_1,
+	.param .u32 triton_poi_fused_mul_silu_split_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_4
+)
+.reqntid 128
+{
+	.reg .b16 	%rs<17>;
+	.reg .b32 	%r<99>;
+	.reg .b64 	%rd<6>;
+	.loc	1 18 0                          // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd4, [triton_poi_fused_mul_silu_split_0_param_0];
+	ld.param.b64 	%rd5, [triton_poi_fused_mul_silu_split_0_param_1];
+$L__tmp0:
+	.loc	1 20 28                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:20:28
+	mov.u32 	%r13, %ctaid.x;
+	.loc	1 20 33                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:20:33
+	shl.b32 	%r14, %r13, 10;
+	.loc	1 21 36                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:21:36
+	mov.u32 	%r15, %tid.x;
+	shl.b32 	%r16, %r15, 3;
+	and.b32 	%r17, %r16, 1016;
+	.loc	1 21 23                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:21:23
+	or.b32 	%r18, %r17, %r14;
+	.loc	1 26 35                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:26:35
+	mul.hi.s32 	%r19, %r18, 715827883;
+	shr.u32 	%r20, %r19, 31;
+	shr.u32 	%r21, %r19, 11;
+	add.s32 	%r22, %r21, %r20;
+	mad.lo.s32 	%r23, %r22, 12288, %r18;
+	.loc	1 26 30                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:26:30
+	mad.wide.s32 	%rd1, %r23, 2, %rd4;
+	.loc	1 26 46                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:26:46
+	// begin inline asm
+	mov.u32 %r1, 0x0;
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	mov.u32 %r4, 0x0;
+	ld.global.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ];
+	// end inline asm
+	.loc	1 27 43                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:27:43
+	add.s32 	%r24, %r23, 12288;
+	.loc	1 27 30                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:27:30
+	mad.wide.s32 	%rd2, %r24, 2, %rd4;
+	.loc	1 27 54                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:27:54
+	// begin inline asm
+	mov.u32 %r5, 0x0;
+	mov.u32 %r6, 0x0;
+	mov.u32 %r7, 0x0;
+	mov.u32 %r8, 0x0;
+	ld.global.v4.b32 { %r5, %r6, %r7, %r8 }, [ %rd2 + 0 ];
+	// end inline asm
+	.loc	1 33 25                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:33:25
+	mad.wide.s32 	%rd3, %r18, 2, %rd5;
+	.loc	1 26 55                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:26:55
+	mov.b32 	{%rs1, %rs2}, %r1;
+	cvt.f32.bf16 	%r25, %rs2;
+	cvt.f32.bf16 	%r26, %rs1;
+	.loc	1 27 63                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:27:63
+	mov.b32 	{%rs3, %rs4}, %r5;
+	cvt.f32.bf16 	%r27, %rs4;
+	cvt.f32.bf16 	%r28, %rs3;
+	mov.b32 	%r29, 0f00000000;
+$L__tmp1:
+	.loc	2 50 30                         // standard.py:50:30 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ]
+	sub.f32 	%r30, %r29, %r26;
+	sub.f32 	%r31, %r29, %r25;
+	.loc	2 50 29                         // standard.py:50:29 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ]
+	mul.f32 	%r32, %r30, 0f3FB8AA3B;
+	ex2.approx.f32 	%r33, %r32;
+	mul.f32 	%r34, %r31, 0f3FB8AA3B;
+	ex2.approx.f32 	%r35, %r34;
+	.loc	2 50 20                         // standard.py:50:20 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ]
+	add.f32 	%r36, %r33, 0f3F800000;
+	add.f32 	%r37, %r35, 0f3F800000;
+	mov.b32 	%r38, 0f3F800000;
+	.loc	2 50 16                         // standard.py:50:16 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ]
+	div.full.f32 	%r39, %r38, %r36;
+	div.full.f32 	%r40, %r38, %r37;
+$L__tmp2:
+	.loc	1 30 18                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:30:18
+	mul.f32 	%r41, %r40, %r25;
+	mul.f32 	%r42, %r39, %r26;
+	.loc	1 32 18                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:32:18
+	mul.f32 	%r43, %r42, %r28;
+	mul.f32 	%r44, %r41, %r27;
+	.loc	1 33 36                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:33:36
+	cvt.rn.bf16x2.f32 	%r9, %r44, %r43;
+	.loc	1 26 55                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:26:55
+	mov.b32 	{%rs5, %rs6}, %r2;
+	cvt.f32.bf16 	%r45, %rs6;
+	cvt.f32.bf16 	%r46, %rs5;
+	.loc	1 27 63                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:27:63
+	mov.b32 	{%rs7, %rs8}, %r6;
+	cvt.f32.bf16 	%r47, %rs8;
+	cvt.f32.bf16 	%r48, %rs7;
+$L__tmp3:
+	.loc	2 50 30                         // standard.py:50:30 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ]
+	sub.f32 	%r49, %r29, %r46;
+	sub.f32 	%r50, %r29, %r45;
+	.loc	2 50 29                         // standard.py:50:29 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ]
+	mul.f32 	%r51, %r49, 0f3FB8AA3B;
+	ex2.approx.f32 	%r52, %r51;
+	mul.f32 	%r53, %r50, 0f3FB8AA3B;
+	ex2.approx.f32 	%r54, %r53;
+	.loc	2 50 20                         // standard.py:50:20 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ]
+	add.f32 	%r55, %r52, 0f3F800000;
+	add.f32 	%r56, %r54, 0f3F800000;
+	.loc	2 50 16                         // standard.py:50:16 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ]
+	div.full.f32 	%r57, %r38, %r55;
+	div.full.f32 	%r58, %r38, %r56;
+$L__tmp4:
+	.loc	1 30 18                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:30:18
+	mul.f32 	%r59, %r58, %r45;
+	mul.f32 	%r60, %r57, %r46;
+	.loc	1 32 18                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:32:18
+	mul.f32 	%r61, %r60, %r48;
+	mul.f32 	%r62, %r59, %r47;
+	.loc	1 33 36                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:33:36
+	cvt.rn.bf16x2.f32 	%r10, %r62, %r61;
+	.loc	1 26 55                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:26:55
+	mov.b32 	{%rs9, %rs10}, %r3;
+	cvt.f32.bf16 	%r63, %rs10;
+	cvt.f32.bf16 	%r64, %rs9;
+	.loc	1 27 63                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:27:63
+	mov.b32 	{%rs11, %rs12}, %r7;
+	cvt.f32.bf16 	%r65, %rs12;
+	cvt.f32.bf16 	%r66, %rs11;
+$L__tmp5:
+	.loc	2 50 30                         // standard.py:50:30 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ]
+	sub.f32 	%r67, %r29, %r64;
+	sub.f32 	%r68, %r29, %r63;
+	.loc	2 50 29                         // standard.py:50:29 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ]
+	mul.f32 	%r69, %r67, 0f3FB8AA3B;
+	ex2.approx.f32 	%r70, %r69;
+	mul.f32 	%r71, %r68, 0f3FB8AA3B;
+	ex2.approx.f32 	%r72, %r71;
+	.loc	2 50 20                         // standard.py:50:20 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ]
+	add.f32 	%r73, %r70, 0f3F800000;
+	add.f32 	%r74, %r72, 0f3F800000;
+	.loc	2 50 16                         // standard.py:50:16 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ]
+	div.full.f32 	%r75, %r38, %r73;
+	div.full.f32 	%r76, %r38, %r74;
+$L__tmp6:
+	.loc	1 30 18                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:30:18
+	mul.f32 	%r77, %r76, %r63;
+	mul.f32 	%r78, %r75, %r64;
+	.loc	1 32 18                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:32:18
+	mul.f32 	%r79, %r78, %r66;
+	mul.f32 	%r80, %r77, %r65;
+	.loc	1 33 36                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:33:36
+	cvt.rn.bf16x2.f32 	%r11, %r80, %r79;
+	.loc	1 26 55                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:26:55
+	mov.b32 	{%rs13, %rs14}, %r4;
+	cvt.f32.bf16 	%r81, %rs14;
+	cvt.f32.bf16 	%r82, %rs13;
+	.loc	1 27 63                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:27:63
+	mov.b32 	{%rs15, %rs16}, %r8;
+	cvt.f32.bf16 	%r83, %rs16;
+	cvt.f32.bf16 	%r84, %rs15;
+$L__tmp7:
+	.loc	2 50 30                         // standard.py:50:30 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ]
+	sub.f32 	%r85, %r29, %r82;
+	sub.f32 	%r86, %r29, %r81;
+	.loc	2 50 29                         // standard.py:50:29 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ]
+	mul.f32 	%r87, %r85, 0f3FB8AA3B;
+	ex2.approx.f32 	%r88, %r87;
+	mul.f32 	%r89, %r86, 0f3FB8AA3B;
+	ex2.approx.f32 	%r90, %r89;
+	.loc	2 50 20                         // standard.py:50:20 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ]
+	add.f32 	%r91, %r88, 0f3F800000;
+	add.f32 	%r92, %r90, 0f3F800000;
+	.loc	2 50 16                         // standard.py:50:16 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ]
+	div.full.f32 	%r93, %r38, %r91;
+	div.full.f32 	%r94, %r38, %r92;
+$L__tmp8:
+	.loc	1 30 18                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:30:18
+	mul.f32 	%r95, %r94, %r81;
+	mul.f32 	%r96, %r93, %r82;
+	.loc	1 32 18                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:32:18
+	mul.f32 	%r97, %r96, %r84;
+	mul.f32 	%r98, %r95, %r83;
+	.loc	1 33 36                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:33:36
+	cvt.rn.bf16x2.f32 	%r12, %r98, %r97;
+	// begin inline asm
+	st.global.v4.b32 [ %rd3 + 0 ], { %r9, %r10, %r11, %r12 };
+	// end inline asm
+	.loc	1 33 4                          // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:33:4
+	ret;
+$L__tmp9:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 307                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x12c DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 54
+.b8 119
+.b8 54
+.b8 115
+.b8 103
+.b8 52
+.b8 118
+.b8 51
+.b8 98
+.b8 99
+.b8 105
+.b8 103
+.b8 104
+.b8 119
+.b8 111
+.b8 107
+.b8 122
+.b8 113
+.b8 54
+.b8 105
+.b8 52
+.b8 51
+.b8 116
+.b8 108
+.b8 53
+.b8 120
+.b8 107
+.b8 53
+.b8 118
+.b8 122
+.b8 55
+.b8 122
+.b8 101
+.b8 118
+.b8 117
+.b8 107
+.b8 55
+.b8 106
+.b8 104
+.b8 118
+.b8 108
+.b8 113
+.b8 121
+.b8 114
+.b8 121
+.b8 121
+.b8 117
+.b8 104
+.b8 117
+.b8 101
+.b8 111
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 54
+.b8 119
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x24 DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 112
+.b8 111
+.b8 105
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 115
+.b8 105
+.b8 108
+.b8 117
+.b8 95
+.b8 115
+.b8 112
+.b8 108
+.b8 105
+.b8 116
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x108:0x2e DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x11d:0x18 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp8                           // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 29                                  // DW_AT_call_line
+.b8 22                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.source b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..11c465fdf51c6ae2895be0136afd18f90cc2fb5a
--- /dev/null
+++ b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.source
@@ -0,0 +1,129 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":18:0)
+#loc26 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":49:0)
+#loc33 = loc("in_ptr0"(#loc))
+#loc34 = loc("out_ptr0"(#loc))
+#loc35 = loc("xnumel"(#loc))
+#loc58 = loc("x"(#loc26))
+module {
+  tt.func public @triton_poi_fused_mul_silu_split_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 25165824 : i32 loc(#loc36)
+    %xoffset = tt.get_program_id x : i32 loc(#loc37)
+    %xoffset_1 = arith.constant 1024 : i32 loc(#loc38)
+    %xoffset_2 = arith.constant 1024 : i32 loc(#loc38)
+    %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc38)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc39)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc40)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc40)
+    %xmask = arith.constant true loc(#loc41)
+    %xmask_6 = arith.constant dense<true> : tensor<1024xi1> loc(#loc41)
+    %x0 = arith.constant 12288 : i32 loc(#loc42)
+    %x0_7 = arith.constant 12288 : i32 loc(#loc42)
+    %x0_8 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc42)
+    %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<1024xi32> loc(#loc42)
+    %x1 = arith.constant 12288 : i32 loc(#loc43)
+    %x1_10 = arith.constant 12288 : i32 loc(#loc43)
+    %x1_11 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc43)
+    %x1_12 = arith.divsi %xindex_5, %x1_11 : tensor<1024xi32> loc(#loc43)
+    %tmp0 = arith.constant 24576 : i32 loc(#loc44)
+    %tmp0_13 = arith.constant 24576 : i32 loc(#loc44)
+    %tmp0_14 = arith.constant dense<24576> : tensor<1024xi32> loc(#loc44)
+    %tmp0_15 = arith.muli %tmp0_14, %x1_12 : tensor<1024xi32> loc(#loc44)
+    %tmp0_16 = arith.addi %x0_9, %tmp0_15 : tensor<1024xi32> loc(#loc45)
+    %tmp0_17 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc46)
+    %tmp0_18 = tt.addptr %tmp0_17, %tmp0_16 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc46)
+    %tmp0_19 = tt.load %tmp0_18 : tensor<1024x!tt.ptr<bf16>> loc(#loc47)
+    %tmp0_20 = arith.extf %tmp0_19 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc48)
+    %tmp5 = arith.constant 12288 : i32 loc(#loc49)
+    %tmp5_21 = arith.constant 12288 : i32 loc(#loc49)
+    %tmp5_22 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc49)
+    %tmp5_23 = arith.addi %tmp5_22, %x0_9 : tensor<1024xi32> loc(#loc49)
+    %tmp5_24 = arith.constant 24576 : i32 loc(#loc50)
+    %tmp5_25 = arith.constant 24576 : i32 loc(#loc50)
+    %tmp5_26 = arith.constant dense<24576> : tensor<1024xi32> loc(#loc50)
+    %tmp5_27 = arith.muli %tmp5_26, %x1_12 : tensor<1024xi32> loc(#loc50)
+    %tmp5_28 = arith.addi %tmp5_23, %tmp5_27 : tensor<1024xi32> loc(#loc51)
+    %tmp5_29 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc52)
+    %tmp5_30 = tt.addptr %tmp5_29, %tmp5_28 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc52)
+    %tmp5_31 = tt.load %tmp5_30 : tensor<1024x!tt.ptr<bf16>> loc(#loc53)
+    %tmp5_32 = arith.extf %tmp5_31 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc54)
+    %tmp2 = tt.call @triton.language.standard.sigmoid__fp32S1024S__(%tmp0_20) : (tensor<1024xf32>) -> tensor<1024xf32> loc(#loc55)
+    %tmp3 = arith.mulf %tmp0_20, %tmp2 : tensor<1024xf32> loc(#loc56)
+    %tmp6 = arith.mulf %tmp3, %tmp5_32 : tensor<1024xf32> loc(#loc57)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc23)
+    %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc23)
+    %2 = arith.truncf %tmp6 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc24)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>> loc(#loc24)
+    tt.return loc(#loc25)
+  } loc(#loc)
+  tt.func private @triton.language.standard.sigmoid__fp32S1024S__(%x: tensor<1024xf32> loc("x"(#loc26))) -> tensor<1024xf32> attributes {noinline = false} {
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc27)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc27)
+    %0 = arith.subf %cst_0, %x : tensor<1024xf32> loc(#loc27)
+    %1 = math.exp %0 : tensor<1024xf32> loc(#loc28)
+    %c1_i32 = arith.constant 1 : i32 loc(#loc29)
+    %cst_1 = arith.constant 1.000000e+00 : f32 loc(#loc29)
+    %cst_2 = arith.constant dense<1.000000e+00> : tensor<1024xf32> loc(#loc29)
+    %2 = arith.addf %cst_2, %1 : tensor<1024xf32> loc(#loc29)
+    %c1_i32_3 = arith.constant 1 : i32 loc(#loc30)
+    %cst_4 = arith.constant 1.000000e+00 : f32 loc(#loc30)
+    %cst_5 = arith.constant dense<1.000000e+00> : tensor<1024xf32> loc(#loc30)
+    %3 = arith.divf %cst_5, %2 : tensor<1024xf32> loc(#loc30)
+    tt.return %3 : tensor<1024xf32> loc(#loc31)
+  ^bb1:  // no predecessors
+    %4 = ub.poison : tensor<1024xf32> loc(#loc32)
+    tt.return %4 : tensor<1024xf32> loc(#loc32)
+  } loc(#loc26)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":22:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":23:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":24:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:41)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:35)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:30)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:46)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:55)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:38)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:49)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:43)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:30)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:54)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:63)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":29:22)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":30:18)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":32:18)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:25)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:36)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:4)
+#loc27 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30)
+#loc28 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29)
+#loc29 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20)
+#loc30 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16)
+#loc31 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:11)
+#loc32 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:4)
+#loc36 = loc("xnumel"(#loc1))
+#loc37 = loc("xoffset"(#loc2))
+#loc38 = loc("xoffset"(#loc3))
+#loc39 = loc("xindex"(#loc4))
+#loc40 = loc("xindex"(#loc5))
+#loc41 = loc("xmask"(#loc6))
+#loc42 = loc("x0"(#loc7))
+#loc43 = loc("x1"(#loc8))
+#loc44 = loc("tmp0"(#loc9))
+#loc45 = loc("tmp0"(#loc10))
+#loc46 = loc("tmp0"(#loc11))
+#loc47 = loc("tmp0"(#loc12))
+#loc48 = loc("tmp0"(#loc13))
+#loc49 = loc("tmp5"(#loc14))
+#loc50 = loc("tmp5"(#loc15))
+#loc51 = loc("tmp5"(#loc16))
+#loc52 = loc("tmp5"(#loc17))
+#loc53 = loc("tmp5"(#loc18))
+#loc54 = loc("tmp5"(#loc19))
+#loc55 = loc("tmp2"(#loc20))
+#loc56 = loc("tmp3"(#loc21))
+#loc57 = loc("tmp6"(#loc22))
diff --git a/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.ttgir b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..2644d72632d6b667223b47ab6251c7daf35bc79d
--- /dev/null
+++ b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.ttgir
@@ -0,0 +1,93 @@
+#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":18:0)
+#loc28 = loc("in_ptr0"(#loc))
+#loc29 = loc("out_ptr0"(#loc))
+#loc30 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused_mul_silu_split_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<24576> : tensor<1024xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<12288> : tensor<1024xi32, #blocked> loc(#loc1)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
+    %cst_1 = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<1.000000e+00> : tensor<1024xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc31)
+    %xoffset_3 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc32)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc33)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32, #blocked> loc(#loc34)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32, #blocked> loc(#loc34)
+    %x0 = arith.remsi %xindex_5, %cst_0 : tensor<1024xi32, #blocked> loc(#loc35)
+    %x1 = arith.divsi %xindex_5, %cst_0 : tensor<1024xi32, #blocked> loc(#loc36)
+    %tmp0 = arith.muli %x1, %cst : tensor<1024xi32, #blocked> loc(#loc37)
+    %tmp0_6 = arith.addi %x0, %tmp0 : tensor<1024xi32, #blocked> loc(#loc38)
+    %tmp0_7 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc39)
+    %tmp0_8 = tt.addptr %tmp0_7, %tmp0_6 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc39)
+    %tmp0_9 = tt.load %tmp0_8 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc40)
+    %tmp0_10 = arith.extf %tmp0_9 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc41)
+    %tmp5 = arith.addi %x0, %cst_0 : tensor<1024xi32, #blocked> loc(#loc42)
+    %tmp5_11 = arith.addi %tmp5, %tmp0 : tensor<1024xi32, #blocked> loc(#loc43)
+    %tmp5_12 = tt.addptr %tmp0_7, %tmp5_11 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc44)
+    %tmp5_13 = tt.load %tmp5_12 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc45)
+    %tmp5_14 = arith.extf %tmp5_13 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc46)
+    %tmp2 = arith.subf %cst_1, %tmp0_10 : tensor<1024xf32, #blocked> loc(#loc50)
+    %tmp2_15 = math.exp %tmp2 : tensor<1024xf32, #blocked> loc(#loc51)
+    %tmp2_16 = arith.addf %tmp2_15, %cst_2 : tensor<1024xf32, #blocked> loc(#loc52)
+    %tmp2_17 = arith.divf %cst_2, %tmp2_16 : tensor<1024xf32, #blocked> loc(#loc53)
+    %tmp3 = arith.mulf %tmp0_10, %tmp2_17 : tensor<1024xf32, #blocked> loc(#loc48)
+    %tmp6 = arith.mulf %tmp3, %tmp5_14 : tensor<1024xf32, #blocked> loc(#loc49)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc25)
+    %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc25)
+    %2 = arith.truncf %tmp6 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> loc(#loc26)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc26)
+    tt.return loc(#loc27)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":23:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":24:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:41)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:35)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:30)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:46)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:55)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:38)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:30)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:54)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:63)
+#loc18 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":29:22)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29)
+#loc21 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":30:18)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":32:18)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:25)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:36)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:4)
+#loc31 = loc("xoffset"(#loc2))
+#loc32 = loc("xoffset"(#loc3))
+#loc33 = loc("xindex"(#loc4))
+#loc34 = loc("xindex"(#loc5))
+#loc35 = loc("x0"(#loc6))
+#loc36 = loc("x1"(#loc7))
+#loc37 = loc("tmp0"(#loc8))
+#loc38 = loc("tmp0"(#loc9))
+#loc39 = loc("tmp0"(#loc10))
+#loc40 = loc("tmp0"(#loc11))
+#loc41 = loc("tmp0"(#loc12))
+#loc42 = loc("tmp5"(#loc13))
+#loc43 = loc("tmp5"(#loc14))
+#loc44 = loc("tmp5"(#loc15))
+#loc45 = loc("tmp5"(#loc16))
+#loc46 = loc("tmp5"(#loc17))
+#loc47 = loc("tmp2"(#loc19))
+#loc48 = loc("tmp3"(#loc23))
+#loc49 = loc("tmp6"(#loc24))
+#loc50 = loc(callsite(#loc18 at #loc47))
+#loc51 = loc(callsite(#loc20 at #loc47))
+#loc52 = loc(callsite(#loc21 at #loc47))
+#loc53 = loc(callsite(#loc22 at #loc47))
diff --git a/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.ttir b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..9a61f0406e185eab580c3b19e50d7d4bc20a07c5
--- /dev/null
+++ b/triton/6DP457QWOYDYHZ7TARQ4OLPDLGEKSLMNVUE3G2KQSQUSVRN7FBVA/triton_poi_fused_mul_silu_split_0.ttir
@@ -0,0 +1,93 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":18:0)
+#loc28 = loc("in_ptr0"(#loc))
+#loc29 = loc("out_ptr0"(#loc))
+#loc30 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_mul_silu_split_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %tmp2 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc50)
+    %tmp2_0 = arith.constant dense<1.000000e+00> : tensor<1024xf32> loc(#loc51)
+    %cst = arith.constant dense<24576> : tensor<1024xi32> loc(#loc3)
+    %cst_1 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc3)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc3)
+    %xoffset = tt.get_program_id x : i32 loc(#loc32)
+    %xoffset_2 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc33)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc34)
+    %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<1024xi32> loc(#loc35)
+    %xindex_4 = arith.addi %xindex_3, %xindex : tensor<1024xi32> loc(#loc35)
+    %x0 = arith.remsi %xindex_4, %cst_1 : tensor<1024xi32> loc(#loc36)
+    %x1 = arith.divsi %xindex_4, %cst_1 : tensor<1024xi32> loc(#loc37)
+    %tmp0 = arith.muli %x1, %cst : tensor<1024xi32> loc(#loc38)
+    %tmp0_5 = arith.addi %x0, %tmp0 : tensor<1024xi32> loc(#loc39)
+    %tmp0_6 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc40)
+    %tmp0_7 = tt.addptr %tmp0_6, %tmp0_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc40)
+    %tmp0_8 = tt.load %tmp0_7 : tensor<1024x!tt.ptr<bf16>> loc(#loc41)
+    %tmp0_9 = arith.extf %tmp0_8 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc42)
+    %tmp5 = arith.addi %x0, %cst_1 : tensor<1024xi32> loc(#loc43)
+    %tmp5_10 = arith.addi %tmp5, %tmp0 : tensor<1024xi32> loc(#loc44)
+    %tmp5_11 = tt.addptr %tmp0_6, %tmp5_10 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc45)
+    %tmp5_12 = tt.load %tmp5_11 : tensor<1024x!tt.ptr<bf16>> loc(#loc46)
+    %tmp5_13 = arith.extf %tmp5_12 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc47)
+    %tmp2_14 = arith.subf %tmp2, %tmp0_9 : tensor<1024xf32> loc(#loc50)
+    %tmp2_15 = math.exp %tmp2_14 : tensor<1024xf32> loc(#loc52)
+    %tmp2_16 = arith.addf %tmp2_15, %tmp2_0 : tensor<1024xf32> loc(#loc53)
+    %tmp2_17 = arith.divf %tmp2_0, %tmp2_16 : tensor<1024xf32> loc(#loc54)
+    %tmp3 = arith.mulf %tmp0_9, %tmp2_17 : tensor<1024xf32> loc(#loc48)
+    %tmp6 = arith.mulf %tmp3, %tmp5_13 : tensor<1024xf32> loc(#loc49)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc25)
+    %1 = tt.addptr %0, %xindex_4 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc25)
+    %2 = arith.truncf %tmp6 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc26)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>> loc(#loc26)
+    tt.return loc(#loc27)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":29:22)
+#loc3 = loc(unknown)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":20:28)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":20:33)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":21:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":21:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":23:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":24:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:41)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:30)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:46)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:55)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:38)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:43)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:30)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:54)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:63)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29)
+#loc21 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":30:18)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":32:18)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:25)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:36)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:4)
+#loc31 = loc("tmp2"(#loc2))
+#loc32 = loc("xoffset"(#loc4))
+#loc33 = loc("xoffset"(#loc5))
+#loc34 = loc("xindex"(#loc6))
+#loc35 = loc("xindex"(#loc7))
+#loc36 = loc("x0"(#loc8))
+#loc37 = loc("x1"(#loc9))
+#loc38 = loc("tmp0"(#loc10))
+#loc39 = loc("tmp0"(#loc11))
+#loc40 = loc("tmp0"(#loc12))
+#loc41 = loc("tmp0"(#loc13))
+#loc42 = loc("tmp0"(#loc14))
+#loc43 = loc("tmp5"(#loc15))
+#loc44 = loc("tmp5"(#loc16))
+#loc45 = loc("tmp5"(#loc17))
+#loc46 = loc("tmp5"(#loc18))
+#loc47 = loc("tmp5"(#loc19))
+#loc48 = loc("tmp3"(#loc23))
+#loc49 = loc("tmp6"(#loc24))
+#loc50 = loc(callsite(#loc1 at #loc31))
+#loc51 = loc(callsite(#loc3 at #loc31))
+#loc52 = loc(callsite(#loc20 at #loc31))
+#loc53 = loc(callsite(#loc21 at #loc31))
+#loc54 = loc(callsite(#loc22 at #loc31))
diff --git a/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/__grp__triton_poi_fused_mul_silu_split_0.json b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/__grp__triton_poi_fused_mul_silu_split_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..aa37e98aba3afb74f4527b0c87f14a4aa1cb622d
--- /dev/null
+++ b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/__grp__triton_poi_fused_mul_silu_split_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused_mul_silu_split_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.source", "triton_poi_fused_mul_silu_split_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.ttir", "triton_poi_fused_mul_silu_split_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.ttgir", "triton_poi_fused_mul_silu_split_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.llir", "triton_poi_fused_mul_silu_split_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.ptx", "triton_poi_fused_mul_silu_split_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.cubin", "triton_poi_fused_mul_silu_split_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.json"}}
\ No newline at end of file
diff --git a/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.cubin b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..93018b71b547598981c1b3c0d22570bc5a309414
Binary files /dev/null and b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.cubin differ
diff --git a/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.json b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..e8ba81b358700e8812c5e4f3be3c75adde0719cf
--- /dev/null
+++ b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.json
@@ -0,0 +1 @@
+{"hash": "ffcabc751b8432e59ff4835a1a069005288ea2d2099d0cf63bc686e250b64600", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_mul_silu_split_0"}
\ No newline at end of file
diff --git a/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.llir b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..5cad5920af6565e198f3dc793f6a8cddac09e232
--- /dev/null
+++ b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.llir
@@ -0,0 +1,176 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused_mul_silu_split_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 {
+  %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %7 = shl i32 %6, 10, !dbg !8
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %9 = shl nuw nsw i32 %8, 3, !dbg !9
+  %10 = and i32 %9, 1016, !dbg !9
+  %11 = or disjoint i32 %10, %7, !dbg !10
+  %12 = srem i32 %11, 12288, !dbg !11
+  %13 = sub nsw i32 %11, %12, !dbg !11
+  %14 = add i32 %13, %11, !dbg !11
+  %15 = sext i32 %14 to i64, !dbg !12
+  %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !12
+  %17 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %16) #3, !dbg !13
+  %18 = extractvalue { i32, i32, i32, i32 } %17, 0, !dbg !13
+  %19 = bitcast i32 %18 to <2 x bfloat>, !dbg !13
+  %20 = extractvalue { i32, i32, i32, i32 } %17, 1, !dbg !13
+  %21 = bitcast i32 %20 to <2 x bfloat>, !dbg !13
+  %22 = extractvalue { i32, i32, i32, i32 } %17, 2, !dbg !13
+  %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !13
+  %24 = extractvalue { i32, i32, i32, i32 } %17, 3, !dbg !13
+  %25 = bitcast i32 %24 to <2 x bfloat>, !dbg !13
+  %26 = add i32 %14, 12288, !dbg !14
+  %27 = sext i32 %26 to i64, !dbg !15
+  %28 = getelementptr bfloat, ptr addrspace(1) %0, i64 %27, !dbg !15
+  %29 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %28) #3, !dbg !16
+  %30 = extractvalue { i32, i32, i32, i32 } %29, 0, !dbg !16
+  %31 = bitcast i32 %30 to <2 x bfloat>, !dbg !16
+  %32 = extractvalue { i32, i32, i32, i32 } %29, 1, !dbg !16
+  %33 = bitcast i32 %32 to <2 x bfloat>, !dbg !16
+  %34 = extractvalue { i32, i32, i32, i32 } %29, 2, !dbg !16
+  %35 = bitcast i32 %34 to <2 x bfloat>, !dbg !16
+  %36 = extractvalue { i32, i32, i32, i32 } %29, 3, !dbg !16
+  %37 = bitcast i32 %36 to <2 x bfloat>, !dbg !16
+  %38 = sext i32 %11 to i64, !dbg !17
+  %39 = getelementptr bfloat, ptr addrspace(1) %1, i64 %38, !dbg !17
+  %40 = fpext <2 x bfloat> %19 to <2 x float>, !dbg !18
+  %41 = fpext <2 x bfloat> %31 to <2 x float>, !dbg !19
+  %42 = extractelement <2 x float> %40, i64 0, !dbg !20
+  %43 = fsub float 0.000000e+00, %42, !dbg !20
+  %44 = extractelement <2 x float> %40, i64 1, !dbg !20
+  %45 = fsub float 0.000000e+00, %44, !dbg !20
+  %46 = fmul float %43, 0x3FF7154760000000, !dbg !25
+  %47 = tail call float @llvm.nvvm.ex2.approx.f(float %46), !dbg !25
+  %48 = fmul float %45, 0x3FF7154760000000, !dbg !25
+  %49 = tail call float @llvm.nvvm.ex2.approx.f(float %48), !dbg !25
+  %50 = fadd float %47, 1.000000e+00, !dbg !26
+  %51 = fadd float %49, 1.000000e+00, !dbg !26
+  %52 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %50), !dbg !27
+  %53 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %51), !dbg !27
+  %54 = insertelement <2 x float> poison, float %52, i64 0, !dbg !28
+  %55 = insertelement <2 x float> %54, float %53, i64 1, !dbg !28
+  %56 = fmul <2 x float> %55, %40, !dbg !28
+  %57 = fmul <2 x float> %56, %41, !dbg !29
+  %58 = fptrunc <2 x float> %57 to <2 x bfloat>, !dbg !30
+  %59 = fpext <2 x bfloat> %21 to <2 x float>, !dbg !18
+  %60 = fpext <2 x bfloat> %33 to <2 x float>, !dbg !19
+  %61 = extractelement <2 x float> %59, i64 0, !dbg !20
+  %62 = fsub float 0.000000e+00, %61, !dbg !20
+  %63 = extractelement <2 x float> %59, i64 1, !dbg !20
+  %64 = fsub float 0.000000e+00, %63, !dbg !20
+  %65 = fmul float %62, 0x3FF7154760000000, !dbg !25
+  %66 = tail call float @llvm.nvvm.ex2.approx.f(float %65), !dbg !25
+  %67 = fmul float %64, 0x3FF7154760000000, !dbg !25
+  %68 = tail call float @llvm.nvvm.ex2.approx.f(float %67), !dbg !25
+  %69 = fadd float %66, 1.000000e+00, !dbg !26
+  %70 = fadd float %68, 1.000000e+00, !dbg !26
+  %71 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %69), !dbg !27
+  %72 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %70), !dbg !27
+  %73 = insertelement <2 x float> poison, float %71, i64 0, !dbg !28
+  %74 = insertelement <2 x float> %73, float %72, i64 1, !dbg !28
+  %75 = fmul <2 x float> %74, %59, !dbg !28
+  %76 = fmul <2 x float> %75, %60, !dbg !29
+  %77 = fptrunc <2 x float> %76 to <2 x bfloat>, !dbg !30
+  %78 = fpext <2 x bfloat> %23 to <2 x float>, !dbg !18
+  %79 = fpext <2 x bfloat> %35 to <2 x float>, !dbg !19
+  %80 = extractelement <2 x float> %78, i64 0, !dbg !20
+  %81 = fsub float 0.000000e+00, %80, !dbg !20
+  %82 = extractelement <2 x float> %78, i64 1, !dbg !20
+  %83 = fsub float 0.000000e+00, %82, !dbg !20
+  %84 = fmul float %81, 0x3FF7154760000000, !dbg !25
+  %85 = tail call float @llvm.nvvm.ex2.approx.f(float %84), !dbg !25
+  %86 = fmul float %83, 0x3FF7154760000000, !dbg !25
+  %87 = tail call float @llvm.nvvm.ex2.approx.f(float %86), !dbg !25
+  %88 = fadd float %85, 1.000000e+00, !dbg !26
+  %89 = fadd float %87, 1.000000e+00, !dbg !26
+  %90 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %88), !dbg !27
+  %91 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %89), !dbg !27
+  %92 = insertelement <2 x float> poison, float %90, i64 0, !dbg !28
+  %93 = insertelement <2 x float> %92, float %91, i64 1, !dbg !28
+  %94 = fmul <2 x float> %93, %78, !dbg !28
+  %95 = fmul <2 x float> %94, %79, !dbg !29
+  %96 = fptrunc <2 x float> %95 to <2 x bfloat>, !dbg !30
+  %97 = fpext <2 x bfloat> %25 to <2 x float>, !dbg !18
+  %98 = fpext <2 x bfloat> %37 to <2 x float>, !dbg !19
+  %99 = extractelement <2 x float> %97, i64 0, !dbg !20
+  %100 = fsub float 0.000000e+00, %99, !dbg !20
+  %101 = extractelement <2 x float> %97, i64 1, !dbg !20
+  %102 = fsub float 0.000000e+00, %101, !dbg !20
+  %103 = fmul float %100, 0x3FF7154760000000, !dbg !25
+  %104 = tail call float @llvm.nvvm.ex2.approx.f(float %103), !dbg !25
+  %105 = fmul float %102, 0x3FF7154760000000, !dbg !25
+  %106 = tail call float @llvm.nvvm.ex2.approx.f(float %105), !dbg !25
+  %107 = fadd float %104, 1.000000e+00, !dbg !26
+  %108 = fadd float %106, 1.000000e+00, !dbg !26
+  %109 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %107), !dbg !27
+  %110 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %108), !dbg !27
+  %111 = insertelement <2 x float> poison, float %109, i64 0, !dbg !28
+  %112 = insertelement <2 x float> %111, float %110, i64 1, !dbg !28
+  %113 = fmul <2 x float> %112, %97, !dbg !28
+  %114 = fmul <2 x float> %113, %98, !dbg !29
+  %115 = fptrunc <2 x float> %114 to <2 x bfloat>, !dbg !30
+  %116 = bitcast <2 x bfloat> %58 to i32, !dbg !30
+  %117 = bitcast <2 x bfloat> %77 to i32, !dbg !30
+  %118 = bitcast <2 x bfloat> %96 to i32, !dbg !30
+  %119 = bitcast <2 x bfloat> %115 to i32, !dbg !30
+  tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %116, i32 %117, i32 %118, i32 %119, ptr addrspace(1) %39) #3, !dbg !30
+  ret void, !dbg !31
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.ex2.approx.f(float) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #2
+
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #3 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused_mul_silu_split_0", linkageName: "triton_poi_fused_mul_silu_split_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 19, column: 28, scope: !4)
+!8 = !DILocation(line: 19, column: 33, scope: !4)
+!9 = !DILocation(line: 20, column: 36, scope: !4)
+!10 = !DILocation(line: 20, column: 23, scope: !4)
+!11 = !DILocation(line: 25, column: 35, scope: !4)
+!12 = !DILocation(line: 25, column: 30, scope: !4)
+!13 = !DILocation(line: 25, column: 46, scope: !4)
+!14 = !DILocation(line: 26, column: 43, scope: !4)
+!15 = !DILocation(line: 26, column: 30, scope: !4)
+!16 = !DILocation(line: 26, column: 54, scope: !4)
+!17 = !DILocation(line: 32, column: 25, scope: !4)
+!18 = !DILocation(line: 25, column: 55, scope: !4)
+!19 = !DILocation(line: 26, column: 63, scope: !4)
+!20 = !DILocation(line: 50, column: 30, scope: !21, inlinedAt: !23)
+!21 = distinct !DILexicalBlockFile(scope: !4, file: !22, discriminator: 0)
+!22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!23 = !DILocation(line: 28, column: 22, scope: !24)
+!24 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0)
+!25 = !DILocation(line: 50, column: 29, scope: !21, inlinedAt: !23)
+!26 = !DILocation(line: 50, column: 20, scope: !21, inlinedAt: !23)
+!27 = !DILocation(line: 50, column: 16, scope: !21, inlinedAt: !23)
+!28 = !DILocation(line: 29, column: 18, scope: !4)
+!29 = !DILocation(line: 31, column: 18, scope: !4)
+!30 = !DILocation(line: 32, column: 36, scope: !4)
+!31 = !DILocation(line: 32, column: 4, scope: !4)
diff --git a/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.ptx b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..63c0ecf51590d148c248f8d1d1ae3bb18ca1f538
--- /dev/null
+++ b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.ptx
@@ -0,0 +1,539 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused_mul_silu_split_0 // -- Begin function triton_poi_fused_mul_silu_split_0
+                                        // @triton_poi_fused_mul_silu_split_0
+.visible .entry triton_poi_fused_mul_silu_split_0(
+	.param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_1,
+	.param .u32 triton_poi_fused_mul_silu_split_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_4
+)
+.reqntid 128
+{
+	.reg .b16 	%rs<17>;
+	.reg .b32 	%r<99>;
+	.reg .b64 	%rd<6>;
+	.loc	1 18 0                          // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd4, [triton_poi_fused_mul_silu_split_0_param_0];
+	ld.param.b64 	%rd5, [triton_poi_fused_mul_silu_split_0_param_1];
+$L__tmp0:
+	.loc	1 19 28                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:19:28
+	mov.u32 	%r13, %ctaid.x;
+	.loc	1 19 33                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:19:33
+	shl.b32 	%r14, %r13, 10;
+	.loc	1 20 36                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:20:36
+	mov.u32 	%r15, %tid.x;
+	shl.b32 	%r16, %r15, 3;
+	and.b32 	%r17, %r16, 1016;
+	.loc	1 20 23                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:20:23
+	or.b32 	%r18, %r17, %r14;
+	.loc	1 25 35                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:25:35
+	mul.hi.s32 	%r19, %r18, 715827883;
+	shr.u32 	%r20, %r19, 31;
+	shr.u32 	%r21, %r19, 11;
+	add.s32 	%r22, %r21, %r20;
+	mad.lo.s32 	%r23, %r22, 12288, %r18;
+	.loc	1 25 30                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:25:30
+	mad.wide.s32 	%rd1, %r23, 2, %rd4;
+	.loc	1 25 46                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:25:46
+	// begin inline asm
+	mov.u32 %r1, 0x0;
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	mov.u32 %r4, 0x0;
+	ld.global.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ];
+	// end inline asm
+	.loc	1 26 43                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:26:43
+	add.s32 	%r24, %r23, 12288;
+	.loc	1 26 30                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:26:30
+	mad.wide.s32 	%rd2, %r24, 2, %rd4;
+	.loc	1 26 54                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:26:54
+	// begin inline asm
+	mov.u32 %r5, 0x0;
+	mov.u32 %r6, 0x0;
+	mov.u32 %r7, 0x0;
+	mov.u32 %r8, 0x0;
+	ld.global.v4.b32 { %r5, %r6, %r7, %r8 }, [ %rd2 + 0 ];
+	// end inline asm
+	.loc	1 32 25                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:32:25
+	mad.wide.s32 	%rd3, %r18, 2, %rd5;
+	.loc	1 25 55                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:25:55
+	mov.b32 	{%rs1, %rs2}, %r1;
+	cvt.f32.bf16 	%r25, %rs2;
+	cvt.f32.bf16 	%r26, %rs1;
+	.loc	1 26 63                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:26:63
+	mov.b32 	{%rs3, %rs4}, %r5;
+	cvt.f32.bf16 	%r27, %rs4;
+	cvt.f32.bf16 	%r28, %rs3;
+	mov.b32 	%r29, 0f00000000;
+$L__tmp1:
+	.loc	2 50 30                         // standard.py:50:30 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ]
+	sub.f32 	%r30, %r29, %r26;
+	sub.f32 	%r31, %r29, %r25;
+	.loc	2 50 29                         // standard.py:50:29 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ]
+	mul.f32 	%r32, %r30, 0f3FB8AA3B;
+	ex2.approx.f32 	%r33, %r32;
+	mul.f32 	%r34, %r31, 0f3FB8AA3B;
+	ex2.approx.f32 	%r35, %r34;
+	.loc	2 50 20                         // standard.py:50:20 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ]
+	add.f32 	%r36, %r33, 0f3F800000;
+	add.f32 	%r37, %r35, 0f3F800000;
+	mov.b32 	%r38, 0f3F800000;
+	.loc	2 50 16                         // standard.py:50:16 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ]
+	div.full.f32 	%r39, %r38, %r36;
+	div.full.f32 	%r40, %r38, %r37;
+$L__tmp2:
+	.loc	1 29 18                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:29:18
+	mul.f32 	%r41, %r40, %r25;
+	mul.f32 	%r42, %r39, %r26;
+	.loc	1 31 18                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:31:18
+	mul.f32 	%r43, %r42, %r28;
+	mul.f32 	%r44, %r41, %r27;
+	.loc	1 32 36                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:32:36
+	cvt.rn.bf16x2.f32 	%r9, %r44, %r43;
+	.loc	1 25 55                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:25:55
+	mov.b32 	{%rs5, %rs6}, %r2;
+	cvt.f32.bf16 	%r45, %rs6;
+	cvt.f32.bf16 	%r46, %rs5;
+	.loc	1 26 63                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:26:63
+	mov.b32 	{%rs7, %rs8}, %r6;
+	cvt.f32.bf16 	%r47, %rs8;
+	cvt.f32.bf16 	%r48, %rs7;
+$L__tmp3:
+	.loc	2 50 30                         // standard.py:50:30 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ]
+	sub.f32 	%r49, %r29, %r46;
+	sub.f32 	%r50, %r29, %r45;
+	.loc	2 50 29                         // standard.py:50:29 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ]
+	mul.f32 	%r51, %r49, 0f3FB8AA3B;
+	ex2.approx.f32 	%r52, %r51;
+	mul.f32 	%r53, %r50, 0f3FB8AA3B;
+	ex2.approx.f32 	%r54, %r53;
+	.loc	2 50 20                         // standard.py:50:20 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ]
+	add.f32 	%r55, %r52, 0f3F800000;
+	add.f32 	%r56, %r54, 0f3F800000;
+	.loc	2 50 16                         // standard.py:50:16 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ]
+	div.full.f32 	%r57, %r38, %r55;
+	div.full.f32 	%r58, %r38, %r56;
+$L__tmp4:
+	.loc	1 29 18                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:29:18
+	mul.f32 	%r59, %r58, %r45;
+	mul.f32 	%r60, %r57, %r46;
+	.loc	1 31 18                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:31:18
+	mul.f32 	%r61, %r60, %r48;
+	mul.f32 	%r62, %r59, %r47;
+	.loc	1 32 36                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:32:36
+	cvt.rn.bf16x2.f32 	%r10, %r62, %r61;
+	.loc	1 25 55                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:25:55
+	mov.b32 	{%rs9, %rs10}, %r3;
+	cvt.f32.bf16 	%r63, %rs10;
+	cvt.f32.bf16 	%r64, %rs9;
+	.loc	1 26 63                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:26:63
+	mov.b32 	{%rs11, %rs12}, %r7;
+	cvt.f32.bf16 	%r65, %rs12;
+	cvt.f32.bf16 	%r66, %rs11;
+$L__tmp5:
+	.loc	2 50 30                         // standard.py:50:30 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ]
+	sub.f32 	%r67, %r29, %r64;
+	sub.f32 	%r68, %r29, %r63;
+	.loc	2 50 29                         // standard.py:50:29 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ]
+	mul.f32 	%r69, %r67, 0f3FB8AA3B;
+	ex2.approx.f32 	%r70, %r69;
+	mul.f32 	%r71, %r68, 0f3FB8AA3B;
+	ex2.approx.f32 	%r72, %r71;
+	.loc	2 50 20                         // standard.py:50:20 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ]
+	add.f32 	%r73, %r70, 0f3F800000;
+	add.f32 	%r74, %r72, 0f3F800000;
+	.loc	2 50 16                         // standard.py:50:16 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ]
+	div.full.f32 	%r75, %r38, %r73;
+	div.full.f32 	%r76, %r38, %r74;
+$L__tmp6:
+	.loc	1 29 18                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:29:18
+	mul.f32 	%r77, %r76, %r63;
+	mul.f32 	%r78, %r75, %r64;
+	.loc	1 31 18                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:31:18
+	mul.f32 	%r79, %r78, %r66;
+	mul.f32 	%r80, %r77, %r65;
+	.loc	1 32 36                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:32:36
+	cvt.rn.bf16x2.f32 	%r11, %r80, %r79;
+	.loc	1 25 55                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:25:55
+	mov.b32 	{%rs13, %rs14}, %r4;
+	cvt.f32.bf16 	%r81, %rs14;
+	cvt.f32.bf16 	%r82, %rs13;
+	.loc	1 26 63                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:26:63
+	mov.b32 	{%rs15, %rs16}, %r8;
+	cvt.f32.bf16 	%r83, %rs16;
+	cvt.f32.bf16 	%r84, %rs15;
+$L__tmp7:
+	.loc	2 50 30                         // standard.py:50:30 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ]
+	sub.f32 	%r85, %r29, %r82;
+	sub.f32 	%r86, %r29, %r81;
+	.loc	2 50 29                         // standard.py:50:29 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ]
+	mul.f32 	%r87, %r85, 0f3FB8AA3B;
+	ex2.approx.f32 	%r88, %r87;
+	mul.f32 	%r89, %r86, 0f3FB8AA3B;
+	ex2.approx.f32 	%r90, %r89;
+	.loc	2 50 20                         // standard.py:50:20 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ]
+	add.f32 	%r91, %r88, 0f3F800000;
+	add.f32 	%r92, %r90, 0f3F800000;
+	.loc	2 50 16                         // standard.py:50:16 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ]
+	div.full.f32 	%r93, %r38, %r91;
+	div.full.f32 	%r94, %r38, %r92;
+$L__tmp8:
+	.loc	1 29 18                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:29:18
+	mul.f32 	%r95, %r94, %r81;
+	mul.f32 	%r96, %r93, %r82;
+	.loc	1 31 18                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:31:18
+	mul.f32 	%r97, %r96, %r84;
+	mul.f32 	%r98, %r95, %r83;
+	.loc	1 32 36                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:32:36
+	cvt.rn.bf16x2.f32 	%r12, %r98, %r97;
+	// begin inline asm
+	st.global.v4.b32 [ %rd3 + 0 ], { %r9, %r10, %r11, %r12 };
+	// end inline asm
+	.loc	1 32 4                          // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:32:4
+	ret;
+$L__tmp9:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 307                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x12c DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 115
+.b8 121
+.b8 97
+.b8 101
+.b8 51
+.b8 111
+.b8 107
+.b8 50
+.b8 120
+.b8 110
+.b8 122
+.b8 117
+.b8 120
+.b8 104
+.b8 106
+.b8 107
+.b8 120
+.b8 122
+.b8 104
+.b8 100
+.b8 99
+.b8 112
+.b8 99
+.b8 122
+.b8 54
+.b8 106
+.b8 99
+.b8 107
+.b8 99
+.b8 117
+.b8 51
+.b8 118
+.b8 118
+.b8 55
+.b8 101
+.b8 113
+.b8 98
+.b8 51
+.b8 112
+.b8 101
+.b8 119
+.b8 104
+.b8 114
+.b8 118
+.b8 113
+.b8 109
+.b8 105
+.b8 101
+.b8 114
+.b8 103
+.b8 102
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 115
+.b8 121
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x24 DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 112
+.b8 111
+.b8 105
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 115
+.b8 105
+.b8 108
+.b8 117
+.b8 95
+.b8 115
+.b8 112
+.b8 108
+.b8 105
+.b8 116
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x108:0x2e DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x11d:0x18 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp8                           // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 28                                  // DW_AT_call_line
+.b8 22                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.source b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..951df97979069edd2b68e5fa5cfca0b583d64891
--- /dev/null
+++ b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.source
@@ -0,0 +1,126 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":18:0)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":49:0)
+#loc32 = loc("in_ptr0"(#loc))
+#loc33 = loc("out_ptr0"(#loc))
+#loc34 = loc("xnumel"(#loc))
+#loc56 = loc("x"(#loc25))
+module {
+  tt.func public @triton_poi_fused_mul_silu_split_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xoffset = tt.get_program_id x : i32 loc(#loc35)
+    %xoffset_0 = arith.constant 1024 : i32 loc(#loc36)
+    %xoffset_1 = arith.constant 1024 : i32 loc(#loc36)
+    %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc36)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc37)
+    %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<1024xi32> loc(#loc38)
+    %xindex_4 = arith.addi %xindex_3, %xindex : tensor<1024xi32> loc(#loc38)
+    %xmask = arith.constant true loc(#loc39)
+    %xmask_5 = arith.constant dense<true> : tensor<1024xi1> loc(#loc39)
+    %x0 = arith.constant 12288 : i32 loc(#loc40)
+    %x0_6 = arith.constant 12288 : i32 loc(#loc40)
+    %x0_7 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc40)
+    %x0_8 = arith.remsi %xindex_4, %x0_7 : tensor<1024xi32> loc(#loc40)
+    %x1 = arith.constant 12288 : i32 loc(#loc41)
+    %x1_9 = arith.constant 12288 : i32 loc(#loc41)
+    %x1_10 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc41)
+    %x1_11 = arith.divsi %xindex_4, %x1_10 : tensor<1024xi32> loc(#loc41)
+    %tmp0 = arith.constant 24576 : i32 loc(#loc42)
+    %tmp0_12 = arith.constant 24576 : i32 loc(#loc42)
+    %tmp0_13 = arith.constant dense<24576> : tensor<1024xi32> loc(#loc42)
+    %tmp0_14 = arith.muli %tmp0_13, %x1_11 : tensor<1024xi32> loc(#loc42)
+    %tmp0_15 = arith.addi %x0_8, %tmp0_14 : tensor<1024xi32> loc(#loc43)
+    %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc44)
+    %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc44)
+    %tmp0_18 = tt.load %tmp0_17 : tensor<1024x!tt.ptr<bf16>> loc(#loc45)
+    %tmp0_19 = arith.extf %tmp0_18 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc46)
+    %tmp5 = arith.constant 12288 : i32 loc(#loc47)
+    %tmp5_20 = arith.constant 12288 : i32 loc(#loc47)
+    %tmp5_21 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc47)
+    %tmp5_22 = arith.addi %tmp5_21, %x0_8 : tensor<1024xi32> loc(#loc47)
+    %tmp5_23 = arith.constant 24576 : i32 loc(#loc48)
+    %tmp5_24 = arith.constant 24576 : i32 loc(#loc48)
+    %tmp5_25 = arith.constant dense<24576> : tensor<1024xi32> loc(#loc48)
+    %tmp5_26 = arith.muli %tmp5_25, %x1_11 : tensor<1024xi32> loc(#loc48)
+    %tmp5_27 = arith.addi %tmp5_22, %tmp5_26 : tensor<1024xi32> loc(#loc49)
+    %tmp5_28 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc50)
+    %tmp5_29 = tt.addptr %tmp5_28, %tmp5_27 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc50)
+    %tmp5_30 = tt.load %tmp5_29 : tensor<1024x!tt.ptr<bf16>> loc(#loc51)
+    %tmp5_31 = arith.extf %tmp5_30 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc52)
+    %tmp2 = tt.call @triton.language.standard.sigmoid__fp32S1024S__(%tmp0_19) : (tensor<1024xf32>) -> tensor<1024xf32> loc(#loc53)
+    %tmp3 = arith.mulf %tmp0_19, %tmp2 : tensor<1024xf32> loc(#loc54)
+    %tmp6 = arith.mulf %tmp3, %tmp5_31 : tensor<1024xf32> loc(#loc55)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc22)
+    %1 = tt.addptr %0, %xindex_4 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc22)
+    %2 = arith.truncf %tmp6 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc23)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>> loc(#loc23)
+    tt.return loc(#loc24)
+  } loc(#loc)
+  tt.func private @triton.language.standard.sigmoid__fp32S1024S__(%x: tensor<1024xf32> loc("x"(#loc25))) -> tensor<1024xf32> attributes {noinline = false} {
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc26)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc26)
+    %0 = arith.subf %cst_0, %x : tensor<1024xf32> loc(#loc26)
+    %1 = math.exp %0 : tensor<1024xf32> loc(#loc27)
+    %c1_i32 = arith.constant 1 : i32 loc(#loc28)
+    %cst_1 = arith.constant 1.000000e+00 : f32 loc(#loc28)
+    %cst_2 = arith.constant dense<1.000000e+00> : tensor<1024xf32> loc(#loc28)
+    %2 = arith.addf %cst_2, %1 : tensor<1024xf32> loc(#loc28)
+    %c1_i32_3 = arith.constant 1 : i32 loc(#loc29)
+    %cst_4 = arith.constant 1.000000e+00 : f32 loc(#loc29)
+    %cst_5 = arith.constant dense<1.000000e+00> : tensor<1024xf32> loc(#loc29)
+    %3 = arith.divf %cst_5, %2 : tensor<1024xf32> loc(#loc29)
+    tt.return %3 : tensor<1024xf32> loc(#loc30)
+  ^bb1:  // no predecessors
+    %4 = ub.poison : tensor<1024xf32> loc(#loc31)
+    tt.return %4 : tensor<1024xf32> loc(#loc31)
+  } loc(#loc25)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":19:28)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":19:33)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":20:36)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":20:23)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":21:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":22:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":23:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:41)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:35)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:30)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:46)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:55)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:38)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:49)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:43)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:30)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:54)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:63)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":28:22)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":29:18)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":31:18)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:25)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:36)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:4)
+#loc26 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30)
+#loc27 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29)
+#loc28 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20)
+#loc29 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16)
+#loc30 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:11)
+#loc31 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:4)
+#loc35 = loc("xoffset"(#loc1))
+#loc36 = loc("xoffset"(#loc2))
+#loc37 = loc("xindex"(#loc3))
+#loc38 = loc("xindex"(#loc4))
+#loc39 = loc("xmask"(#loc5))
+#loc40 = loc("x0"(#loc6))
+#loc41 = loc("x1"(#loc7))
+#loc42 = loc("tmp0"(#loc8))
+#loc43 = loc("tmp0"(#loc9))
+#loc44 = loc("tmp0"(#loc10))
+#loc45 = loc("tmp0"(#loc11))
+#loc46 = loc("tmp0"(#loc12))
+#loc47 = loc("tmp5"(#loc13))
+#loc48 = loc("tmp5"(#loc14))
+#loc49 = loc("tmp5"(#loc15))
+#loc50 = loc("tmp5"(#loc16))
+#loc51 = loc("tmp5"(#loc17))
+#loc52 = loc("tmp5"(#loc18))
+#loc53 = loc("tmp2"(#loc19))
+#loc54 = loc("tmp3"(#loc20))
+#loc55 = loc("tmp6"(#loc21))
diff --git a/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.ttgir b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..548225d3ff1d57c11706f8a5caeb149b792e4c90
--- /dev/null
+++ b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.ttgir
@@ -0,0 +1,93 @@
+#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":18:0)
+#loc28 = loc("in_ptr0"(#loc))
+#loc29 = loc("out_ptr0"(#loc))
+#loc30 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused_mul_silu_split_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<24576> : tensor<1024xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<12288> : tensor<1024xi32, #blocked> loc(#loc1)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
+    %cst_1 = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<1.000000e+00> : tensor<1024xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc31)
+    %xoffset_3 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc32)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc33)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32, #blocked> loc(#loc34)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32, #blocked> loc(#loc34)
+    %x0 = arith.remsi %xindex_5, %cst_0 : tensor<1024xi32, #blocked> loc(#loc35)
+    %x1 = arith.divsi %xindex_5, %cst_0 : tensor<1024xi32, #blocked> loc(#loc36)
+    %tmp0 = arith.muli %x1, %cst : tensor<1024xi32, #blocked> loc(#loc37)
+    %tmp0_6 = arith.addi %x0, %tmp0 : tensor<1024xi32, #blocked> loc(#loc38)
+    %tmp0_7 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc39)
+    %tmp0_8 = tt.addptr %tmp0_7, %tmp0_6 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc39)
+    %tmp0_9 = tt.load %tmp0_8 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc40)
+    %tmp0_10 = arith.extf %tmp0_9 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc41)
+    %tmp5 = arith.addi %x0, %cst_0 : tensor<1024xi32, #blocked> loc(#loc42)
+    %tmp5_11 = arith.addi %tmp5, %tmp0 : tensor<1024xi32, #blocked> loc(#loc43)
+    %tmp5_12 = tt.addptr %tmp0_7, %tmp5_11 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc44)
+    %tmp5_13 = tt.load %tmp5_12 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc45)
+    %tmp5_14 = arith.extf %tmp5_13 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc46)
+    %tmp2 = arith.subf %cst_1, %tmp0_10 : tensor<1024xf32, #blocked> loc(#loc50)
+    %tmp2_15 = math.exp %tmp2 : tensor<1024xf32, #blocked> loc(#loc51)
+    %tmp2_16 = arith.addf %tmp2_15, %cst_2 : tensor<1024xf32, #blocked> loc(#loc52)
+    %tmp2_17 = arith.divf %cst_2, %tmp2_16 : tensor<1024xf32, #blocked> loc(#loc53)
+    %tmp3 = arith.mulf %tmp0_10, %tmp2_17 : tensor<1024xf32, #blocked> loc(#loc48)
+    %tmp6 = arith.mulf %tmp3, %tmp5_14 : tensor<1024xf32, #blocked> loc(#loc49)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc25)
+    %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc25)
+    %2 = arith.truncf %tmp6 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> loc(#loc26)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc26)
+    tt.return loc(#loc27)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":19:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":19:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":20:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":20:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":22:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":23:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:41)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:35)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:30)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:46)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:55)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:38)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:30)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:54)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:63)
+#loc18 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":28:22)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29)
+#loc21 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":29:18)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":31:18)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:25)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:36)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:4)
+#loc31 = loc("xoffset"(#loc2))
+#loc32 = loc("xoffset"(#loc3))
+#loc33 = loc("xindex"(#loc4))
+#loc34 = loc("xindex"(#loc5))
+#loc35 = loc("x0"(#loc6))
+#loc36 = loc("x1"(#loc7))
+#loc37 = loc("tmp0"(#loc8))
+#loc38 = loc("tmp0"(#loc9))
+#loc39 = loc("tmp0"(#loc10))
+#loc40 = loc("tmp0"(#loc11))
+#loc41 = loc("tmp0"(#loc12))
+#loc42 = loc("tmp5"(#loc13))
+#loc43 = loc("tmp5"(#loc14))
+#loc44 = loc("tmp5"(#loc15))
+#loc45 = loc("tmp5"(#loc16))
+#loc46 = loc("tmp5"(#loc17))
+#loc47 = loc("tmp2"(#loc19))
+#loc48 = loc("tmp3"(#loc23))
+#loc49 = loc("tmp6"(#loc24))
+#loc50 = loc(callsite(#loc18 at #loc47))
+#loc51 = loc(callsite(#loc20 at #loc47))
+#loc52 = loc(callsite(#loc21 at #loc47))
+#loc53 = loc(callsite(#loc22 at #loc47))
diff --git a/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.ttir b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..29f3198e2e9a06f7ace4d791ebabf6c968a89561
--- /dev/null
+++ b/triton/77FLY5I3QQZOLH7UQNNBUBUQAUUI5IWSBGOQZ5R3Y2DOEUFWIYAA/triton_poi_fused_mul_silu_split_0.ttir
@@ -0,0 +1,93 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":18:0)
+#loc28 = loc("in_ptr0"(#loc))
+#loc29 = loc("out_ptr0"(#loc))
+#loc30 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_mul_silu_split_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %tmp2 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc50)
+    %tmp2_0 = arith.constant dense<1.000000e+00> : tensor<1024xf32> loc(#loc51)
+    %cst = arith.constant dense<24576> : tensor<1024xi32> loc(#loc3)
+    %cst_1 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc3)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc3)
+    %xoffset = tt.get_program_id x : i32 loc(#loc32)
+    %xoffset_2 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc33)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc34)
+    %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<1024xi32> loc(#loc35)
+    %xindex_4 = arith.addi %xindex_3, %xindex : tensor<1024xi32> loc(#loc35)
+    %x0 = arith.remsi %xindex_4, %cst_1 : tensor<1024xi32> loc(#loc36)
+    %x1 = arith.divsi %xindex_4, %cst_1 : tensor<1024xi32> loc(#loc37)
+    %tmp0 = arith.muli %x1, %cst : tensor<1024xi32> loc(#loc38)
+    %tmp0_5 = arith.addi %x0, %tmp0 : tensor<1024xi32> loc(#loc39)
+    %tmp0_6 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc40)
+    %tmp0_7 = tt.addptr %tmp0_6, %tmp0_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc40)
+    %tmp0_8 = tt.load %tmp0_7 : tensor<1024x!tt.ptr<bf16>> loc(#loc41)
+    %tmp0_9 = arith.extf %tmp0_8 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc42)
+    %tmp5 = arith.addi %x0, %cst_1 : tensor<1024xi32> loc(#loc43)
+    %tmp5_10 = arith.addi %tmp5, %tmp0 : tensor<1024xi32> loc(#loc44)
+    %tmp5_11 = tt.addptr %tmp0_6, %tmp5_10 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc45)
+    %tmp5_12 = tt.load %tmp5_11 : tensor<1024x!tt.ptr<bf16>> loc(#loc46)
+    %tmp5_13 = arith.extf %tmp5_12 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc47)
+    %tmp2_14 = arith.subf %tmp2, %tmp0_9 : tensor<1024xf32> loc(#loc50)
+    %tmp2_15 = math.exp %tmp2_14 : tensor<1024xf32> loc(#loc52)
+    %tmp2_16 = arith.addf %tmp2_15, %tmp2_0 : tensor<1024xf32> loc(#loc53)
+    %tmp2_17 = arith.divf %tmp2_0, %tmp2_16 : tensor<1024xf32> loc(#loc54)
+    %tmp3 = arith.mulf %tmp0_9, %tmp2_17 : tensor<1024xf32> loc(#loc48)
+    %tmp6 = arith.mulf %tmp3, %tmp5_13 : tensor<1024xf32> loc(#loc49)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc25)
+    %1 = tt.addptr %0, %xindex_4 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc25)
+    %2 = arith.truncf %tmp6 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc26)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>> loc(#loc26)
+    tt.return loc(#loc27)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":28:22)
+#loc3 = loc(unknown)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":19:28)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":19:33)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":20:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":20:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":22:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":23:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:41)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:30)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:46)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:55)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:38)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:43)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:30)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:54)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:63)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29)
+#loc21 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":29:18)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":31:18)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:25)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:36)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:4)
+#loc31 = loc("tmp2"(#loc2))
+#loc32 = loc("xoffset"(#loc4))
+#loc33 = loc("xoffset"(#loc5))
+#loc34 = loc("xindex"(#loc6))
+#loc35 = loc("xindex"(#loc7))
+#loc36 = loc("x0"(#loc8))
+#loc37 = loc("x1"(#loc9))
+#loc38 = loc("tmp0"(#loc10))
+#loc39 = loc("tmp0"(#loc11))
+#loc40 = loc("tmp0"(#loc12))
+#loc41 = loc("tmp0"(#loc13))
+#loc42 = loc("tmp0"(#loc14))
+#loc43 = loc("tmp5"(#loc15))
+#loc44 = loc("tmp5"(#loc16))
+#loc45 = loc("tmp5"(#loc17))
+#loc46 = loc("tmp5"(#loc18))
+#loc47 = loc("tmp5"(#loc19))
+#loc48 = loc("tmp3"(#loc23))
+#loc49 = loc("tmp6"(#loc24))
+#loc50 = loc(callsite(#loc1 at #loc31))
+#loc51 = loc(callsite(#loc3 at #loc31))
+#loc52 = loc(callsite(#loc20 at #loc31))
+#loc53 = loc(callsite(#loc21 at #loc31))
+#loc54 = loc(callsite(#loc22 at #loc31))
diff --git a/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/__grp__triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/__grp__triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..a28a49c96135d2b07c5cd24c1d5f8163df36d065
--- /dev/null
+++ b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/__grp__triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.source", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttir", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttgir", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.llir", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ptx", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.cubin": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.cubin", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json"}}
\ No newline at end of file
diff --git a/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.cubin b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..6d1b1d4346a1ea336d93d649d1c97c5a552d1f27
Binary files /dev/null and b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.cubin differ
diff --git a/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..199f6604b0d27bb528577e43eddfcb99a27727a8
--- /dev/null
+++ b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json
@@ -0,0 +1 @@
+{"hash": "f8131c6b8caf67e5ea2bfae0e789f70c79b5f1eae13909e1ca197e40365679ca", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0"}
\ No newline at end of file
diff --git a/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.llir b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..614b3f766cc217d39ae9e2922c1a0f72d0fad00f
--- /dev/null
+++ b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.llir
@@ -0,0 +1,67 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 {
+  %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %7 = shl i32 %6, 9, !dbg !8
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %9 = shl nuw nsw i32 %8, 1, !dbg !9
+  %10 = and i32 %9, 510, !dbg !9
+  %11 = or disjoint i32 %10, %7, !dbg !10
+  %12 = sdiv i32 %11, 128, !dbg !11
+  %13 = mul i32 %12, 128, !dbg !12
+  %.decomposed = sub i32 %11, %13, !dbg !12
+  %14 = srem i32 %12, 2304, !dbg !13
+  %15 = sdiv i32 %11, 294912, !dbg !14
+  %16 = shl nsw i32 %15, 7, !dbg !15
+  %17 = add nsw i32 %16, %.decomposed, !dbg !16
+  %18 = shl nsw i32 %14, 12, !dbg !17
+  %19 = add nsw i32 %17, %18, !dbg !18
+  %20 = sext i32 %19 to i64, !dbg !19
+  %21 = getelementptr bfloat, ptr addrspace(1) %0, i64 %20, !dbg !19
+  %22 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %21) #2, !dbg !20
+  %23 = sext i32 %11 to i64, !dbg !21
+  %24 = getelementptr bfloat, ptr addrspace(1) %1, i64 %23, !dbg !21
+  tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %22, ptr addrspace(1) %24) #2, !dbg !22
+  ret void, !dbg !23
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+attributes #0 = { nounwind "nvvm.reqntid"="256" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0", linkageName: "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 20, column: 28, scope: !4)
+!8 = !DILocation(line: 20, column: 33, scope: !4)
+!9 = !DILocation(line: 21, column: 36, scope: !4)
+!10 = !DILocation(line: 21, column: 23, scope: !4)
+!11 = !DILocation(line: 24, column: 21, scope: !4)
+!12 = !DILocation(line: 23, column: 19, scope: !4)
+!13 = !DILocation(line: 24, column: 28, scope: !4)
+!14 = !DILocation(line: 25, column: 19, scope: !4)
+!15 = !DILocation(line: 27, column: 39, scope: !4)
+!16 = !DILocation(line: 27, column: 35, scope: !4)
+!17 = !DILocation(line: 27, column: 49, scope: !4)
+!18 = !DILocation(line: 27, column: 44, scope: !4)
+!19 = !DILocation(line: 27, column: 30, scope: !4)
+!20 = !DILocation(line: 27, column: 54, scope: !4)
+!21 = !DILocation(line: 28, column: 25, scope: !4)
+!22 = !DILocation(line: 28, column: 36, scope: !4)
+!23 = !DILocation(line: 28, column: 4, scope: !4)
diff --git a/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ptx b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..e16565cd6ba1d8b88853d9005fd2702cef131fc8
--- /dev/null
+++ b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ptx
@@ -0,0 +1,329 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0 // -- Begin function triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0
+                                        // @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0
+.visible .entry triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0(
+	.param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_1,
+	.param .u32 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_4
+)
+.reqntid 256
+{
+	.reg .b32 	%r<28>;
+	.reg .b64 	%rd<5>;
+	.loc	1 18 0                          // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd3, [triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_0];
+	ld.param.b64 	%rd4, [triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_1];
+$L__tmp0:
+	.loc	1 20 28                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:20:28
+	mov.u32 	%r2, %ctaid.x;
+	.loc	1 20 33                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:20:33
+	shl.b32 	%r3, %r2, 9;
+	.loc	1 21 36                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:21:36
+	mov.u32 	%r4, %tid.x;
+	shl.b32 	%r5, %r4, 1;
+	and.b32 	%r6, %r5, 510;
+	.loc	1 21 23                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:21:23
+	or.b32 	%r7, %r6, %r3;
+	.loc	1 24 21                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:24:21
+	bfe.s32 	%r8, %r2, 22, 1;
+	shr.u32 	%r9, %r8, 25;
+	add.s32 	%r10, %r7, %r9;
+	shr.s32 	%r11, %r10, 7;
+	.loc	1 23 19                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:23:19
+	and.b32 	%r12, %r10, -128;
+	sub.s32 	%r13, %r7, %r12;
+	.loc	1 24 28                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:24:28
+	mul.hi.s32 	%r14, %r11, 954437177;
+	shr.u32 	%r15, %r14, 31;
+	shr.u32 	%r16, %r14, 9;
+	add.s32 	%r17, %r16, %r15;
+	mul.lo.s32 	%r18, %r17, 2304;
+	sub.s32 	%r19, %r11, %r18;
+	.loc	1 25 19                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:25:19
+	mul.hi.s32 	%r20, %r7, 954437177;
+	shr.u32 	%r21, %r20, 31;
+	shr.s32 	%r22, %r20, 16;
+	add.s32 	%r23, %r22, %r21;
+	.loc	1 27 39                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:27:39
+	shl.b32 	%r24, %r23, 7;
+	.loc	1 27 35                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:27:35
+	add.s32 	%r25, %r24, %r13;
+	.loc	1 27 49                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:27:49
+	shl.b32 	%r26, %r19, 12;
+	.loc	1 27 44                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:27:44
+	add.s32 	%r27, %r25, %r26;
+	.loc	1 27 30                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:27:30
+	mad.wide.s32 	%rd1, %r27, 2, %rd3;
+	.loc	1 27 54                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:27:54
+	// begin inline asm
+	mov.u32 %r1, 0x0;
+	ld.global.b32 { %r1 }, [ %rd1 + 0 ];
+	// end inline asm
+	.loc	1 28 25                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:28:25
+	mad.wide.s32 	%rd2, %r7, 2, %rd4;
+	.loc	1 28 36                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:28:36
+	// begin inline asm
+	st.global.b32 [ %rd2 + 0 ], { %r1 };
+	// end inline asm
+	.loc	1 28 4                          // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:28:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 101
+.b8 105
+.b8 112
+.b8 106
+.b8 120
+.b8 97
+.b8 117
+.b8 115
+.b8 97
+.b8 117
+.b8 122
+.b8 108
+.b8 52
+.b8 109
+.b8 99
+.b8 99
+.b8 50
+.b8 51
+.b8 51
+.b8 102
+.b8 112
+.b8 101
+.b8 117
+.b8 98
+.b8 102
+.b8 115
+.b8 51
+.b8 117
+.b8 107
+.b8 53
+.b8 110
+.b8 105
+.b8 53
+.b8 98
+.b8 106
+.b8 113
+.b8 98
+.b8 108
+.b8 50
+.b8 113
+.b8 119
+.b8 116
+.b8 111
+.b8 119
+.b8 106
+.b8 119
+.b8 114
+.b8 108
+.b8 55
+.b8 99
+.b8 100
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 101
+.b8 105
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.source b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..9fa878988fb486d9c354c976b2e285d4281da372
--- /dev/null
+++ b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.source
@@ -0,0 +1,90 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":18:0)
+#loc21 = loc("in_ptr0"(#loc))
+#loc22 = loc("out_ptr0"(#loc))
+#loc23 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 9437184 : i32 loc(#loc24)
+    %xoffset = tt.get_program_id x : i32 loc(#loc25)
+    %xoffset_1 = arith.constant 512 : i32 loc(#loc26)
+    %xoffset_2 = arith.constant 512 : i32 loc(#loc26)
+    %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc26)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc27)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc28)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc28)
+    %xmask = arith.constant true loc(#loc29)
+    %xmask_6 = arith.constant dense<true> : tensor<512xi1> loc(#loc29)
+    %x0 = arith.constant 128 : i32 loc(#loc30)
+    %x0_7 = arith.constant 128 : i32 loc(#loc30)
+    %x0_8 = arith.constant dense<128> : tensor<512xi32> loc(#loc30)
+    %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<512xi32> loc(#loc30)
+    %x1 = arith.constant 128 : i32 loc(#loc31)
+    %x1_10 = arith.constant 128 : i32 loc(#loc31)
+    %x1_11 = arith.constant dense<128> : tensor<512xi32> loc(#loc31)
+    %x1_12 = arith.divsi %xindex_5, %x1_11 : tensor<512xi32> loc(#loc31)
+    %x1_13 = arith.constant 2304 : i32 loc(#loc32)
+    %x1_14 = arith.constant 2304 : i32 loc(#loc32)
+    %x1_15 = arith.constant dense<2304> : tensor<512xi32> loc(#loc32)
+    %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<512xi32> loc(#loc32)
+    %x2 = arith.constant 294912 : i32 loc(#loc33)
+    %x2_17 = arith.constant 294912 : i32 loc(#loc33)
+    %x2_18 = arith.constant dense<294912> : tensor<512xi32> loc(#loc33)
+    %x2_19 = arith.divsi %xindex_5, %x2_18 : tensor<512xi32> loc(#loc33)
+    %tmp0 = arith.constant 128 : i32 loc(#loc34)
+    %tmp0_20 = arith.constant 128 : i32 loc(#loc34)
+    %tmp0_21 = arith.constant dense<128> : tensor<512xi32> loc(#loc34)
+    %tmp0_22 = arith.muli %tmp0_21, %x2_19 : tensor<512xi32> loc(#loc34)
+    %tmp0_23 = arith.addi %x0_9, %tmp0_22 : tensor<512xi32> loc(#loc35)
+    %tmp0_24 = arith.constant 4096 : i32 loc(#loc36)
+    %tmp0_25 = arith.constant 4096 : i32 loc(#loc36)
+    %tmp0_26 = arith.constant dense<4096> : tensor<512xi32> loc(#loc36)
+    %tmp0_27 = arith.muli %tmp0_26, %x1_16 : tensor<512xi32> loc(#loc36)
+    %tmp0_28 = arith.addi %tmp0_23, %tmp0_27 : tensor<512xi32> loc(#loc37)
+    %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc38)
+    %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc38)
+    %tmp0_31 = tt.load %tmp0_30 : tensor<512x!tt.ptr<bf16>> loc(#loc39)
+    %tmp0_32 = arith.extf %tmp0_31 : tensor<512xbf16> to tensor<512xf32> loc(#loc40)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc18)
+    %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc18)
+    %2 = arith.truncf %tmp0_32 : tensor<512xf32> to tensor<512xbf16> loc(#loc19)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>> loc(#loc19)
+    tt.return loc(#loc20)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":22:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":23:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":24:21)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":24:28)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":25:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:39)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:35)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:49)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:44)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:30)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:54)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:63)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:25)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:36)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:4)
+#loc24 = loc("xnumel"(#loc1))
+#loc25 = loc("xoffset"(#loc2))
+#loc26 = loc("xoffset"(#loc3))
+#loc27 = loc("xindex"(#loc4))
+#loc28 = loc("xindex"(#loc5))
+#loc29 = loc("xmask"(#loc6))
+#loc30 = loc("x0"(#loc7))
+#loc31 = loc("x1"(#loc8))
+#loc32 = loc("x1"(#loc9))
+#loc33 = loc("x2"(#loc10))
+#loc34 = loc("tmp0"(#loc11))
+#loc35 = loc("tmp0"(#loc12))
+#loc36 = loc("tmp0"(#loc13))
+#loc37 = loc("tmp0"(#loc14))
+#loc38 = loc("tmp0"(#loc15))
+#loc39 = loc("tmp0"(#loc16))
+#loc40 = loc("tmp0"(#loc17))
diff --git a/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttgir b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..ebdddc82dda054c40857372ef4f61d3bdf535988
--- /dev/null
+++ b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttgir
@@ -0,0 +1,66 @@
+#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":18:0)
+#loc19 = loc("in_ptr0"(#loc))
+#loc20 = loc("out_ptr0"(#loc))
+#loc21 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<128> : tensor<512xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<2304> : tensor<512xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<294912> : tensor<512xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<4096> : tensor<512xi32, #blocked> loc(#loc1)
+    %c512_i32 = arith.constant 512 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc22)
+    %xoffset_3 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc23)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc24)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32, #blocked> loc(#loc25)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32, #blocked> loc(#loc25)
+    %x0 = arith.remsi %xindex_5, %cst : tensor<512xi32, #blocked> loc(#loc26)
+    %x1 = arith.divsi %xindex_5, %cst : tensor<512xi32, #blocked> loc(#loc27)
+    %x1_6 = arith.remsi %x1, %cst_0 : tensor<512xi32, #blocked> loc(#loc28)
+    %x2 = arith.divsi %xindex_5, %cst_1 : tensor<512xi32, #blocked> loc(#loc29)
+    %tmp0 = arith.muli %x2, %cst : tensor<512xi32, #blocked> loc(#loc30)
+    %tmp0_7 = arith.addi %x0, %tmp0 : tensor<512xi32, #blocked> loc(#loc31)
+    %tmp0_8 = arith.muli %x1_6, %cst_2 : tensor<512xi32, #blocked> loc(#loc32)
+    %tmp0_9 = arith.addi %tmp0_7, %tmp0_8 : tensor<512xi32, #blocked> loc(#loc33)
+    %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc34)
+    %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc34)
+    %tmp0_12 = tt.load %tmp0_11 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc35)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc16)
+    %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc16)
+    tt.store %1, %tmp0_12 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc17)
+    tt.return loc(#loc18)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":23:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":24:21)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":24:28)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":25:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:39)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:49)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:44)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:54)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:25)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:36)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:4)
+#loc22 = loc("xoffset"(#loc2))
+#loc23 = loc("xoffset"(#loc3))
+#loc24 = loc("xindex"(#loc4))
+#loc25 = loc("xindex"(#loc5))
+#loc26 = loc("x0"(#loc6))
+#loc27 = loc("x1"(#loc7))
+#loc28 = loc("x1"(#loc8))
+#loc29 = loc("x2"(#loc9))
+#loc30 = loc("tmp0"(#loc10))
+#loc31 = loc("tmp0"(#loc11))
+#loc32 = loc("tmp0"(#loc12))
+#loc33 = loc("tmp0"(#loc13))
+#loc34 = loc("tmp0"(#loc14))
+#loc35 = loc("tmp0"(#loc15))
diff --git a/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttir b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..400988e57e1e7ffec85dde64c8b81715c4cde48a
--- /dev/null
+++ b/triton/7AJRY24MV5T6L2RL7LQOPCPXBR43L4PK4E4QTYOKDF7EANSWPHFA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttir
@@ -0,0 +1,65 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":18:0)
+#loc19 = loc("in_ptr0"(#loc))
+#loc20 = loc("out_ptr0"(#loc))
+#loc21 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %tmp0 = arith.constant dense<4096> : tensor<512xi32> loc(#loc22)
+    %x2 = arith.constant dense<294912> : tensor<512xi32> loc(#loc23)
+    %x1 = arith.constant dense<2304> : tensor<512xi32> loc(#loc24)
+    %cst = arith.constant dense<128> : tensor<512xi32> loc(#loc4)
+    %c512_i32 = arith.constant 512 : i32 loc(#loc4)
+    %xoffset = tt.get_program_id x : i32 loc(#loc25)
+    %xoffset_0 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc26)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc27)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<512xi32> loc(#loc28)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<512xi32> loc(#loc28)
+    %x0 = arith.remsi %xindex_2, %cst : tensor<512xi32> loc(#loc29)
+    %x1_3 = arith.divsi %xindex_2, %cst : tensor<512xi32> loc(#loc30)
+    %x1_4 = arith.remsi %x1_3, %x1 : tensor<512xi32> loc(#loc24)
+    %x2_5 = arith.divsi %xindex_2, %x2 : tensor<512xi32> loc(#loc23)
+    %tmp0_6 = arith.muli %x2_5, %cst : tensor<512xi32> loc(#loc31)
+    %tmp0_7 = arith.addi %x0, %tmp0_6 : tensor<512xi32> loc(#loc32)
+    %tmp0_8 = arith.muli %x1_4, %tmp0 : tensor<512xi32> loc(#loc22)
+    %tmp0_9 = arith.addi %tmp0_7, %tmp0_8 : tensor<512xi32> loc(#loc33)
+    %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc34)
+    %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc34)
+    %tmp0_12 = tt.load %tmp0_11 : tensor<512x!tt.ptr<bf16>> loc(#loc35)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc16)
+    %1 = tt.addptr %0, %xindex_2 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc16)
+    tt.store %1, %tmp0_12 : tensor<512x!tt.ptr<bf16>> loc(#loc17)
+    tt.return loc(#loc18)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:49)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":25:19)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":24:28)
+#loc4 = loc(unknown)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":20:28)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":20:33)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":21:36)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":21:23)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":23:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":24:21)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:39)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:35)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:44)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:54)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:25)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:36)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:4)
+#loc22 = loc("tmp0"(#loc1))
+#loc23 = loc("x2"(#loc2))
+#loc24 = loc("x1"(#loc3))
+#loc25 = loc("xoffset"(#loc5))
+#loc26 = loc("xoffset"(#loc6))
+#loc27 = loc("xindex"(#loc7))
+#loc28 = loc("xindex"(#loc8))
+#loc29 = loc("x0"(#loc9))
+#loc30 = loc("x1"(#loc10))
+#loc31 = loc("tmp0"(#loc11))
+#loc32 = loc("tmp0"(#loc12))
+#loc33 = loc("tmp0"(#loc13))
+#loc34 = loc("tmp0"(#loc14))
+#loc35 = loc("tmp0"(#loc15))
diff --git a/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..ada7c60ba3110e1e644c1f3bf8d906d795caf3cf
--- /dev/null
+++ b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused_add_mul_native_layer_norm_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.source", "triton_red_fused_add_mul_native_layer_norm_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.ttir", "triton_red_fused_add_mul_native_layer_norm_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir", "triton_red_fused_add_mul_native_layer_norm_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.llir", "triton_red_fused_add_mul_native_layer_norm_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.ptx", "triton_red_fused_add_mul_native_layer_norm_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.cubin", "triton_red_fused_add_mul_native_layer_norm_0.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.json"}}
\ No newline at end of file
diff --git a/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.cubin b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..1683dae7a0d86b1c7eb37340f3816361e332c0a6
Binary files /dev/null and b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.cubin differ
diff --git a/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.json b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..4ad91ec628268b4ef21b73c7aab02c148c72ad08
--- /dev/null
+++ b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.json
@@ -0,0 +1 @@
+{"hash": "fbdc4598492d1c558bd33291606df1a4f6cc22e374a61e71948b2104d91b5ee3", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 192, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_add_mul_native_layer_norm_0"}
\ No newline at end of file
diff --git a/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.llir b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..c91141e4af47dfed7c5b5ddda48f271c80b40f54
--- /dev/null
+++ b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.llir
@@ -0,0 +1,565 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external addrspace(3) global [0 x i8], align 16
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused_add_mul_native_layer_norm_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !5 {
+__nv_rsqrtf.exit:
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8
+  %9 = icmp samesign ult i32 %8, 2048, !dbg !9
+  %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %11 = shl nuw nsw i32 %10, 2, !dbg !10
+  %12 = and i32 %11, 2044, !dbg !10
+  %13 = shl i32 %8, 12, !dbg !11
+  %14 = or disjoint i32 %12, %13
+  %15 = sext i32 %14 to i64, !dbg !12
+  %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !13
+  %17 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14
+  %18 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %16, i64 %17, i1 %9) #6, !dbg !14
+  %19 = extractvalue { i32, i32 } %18, 1, !dbg !14
+  %20 = bitcast i32 %19 to <2 x bfloat>, !dbg !14
+  %21 = extractelement <2 x bfloat> %20, i64 1, !dbg !14
+  %22 = fpext bfloat %21 to float, !dbg !15
+  %23 = extractelement <2 x bfloat> %20, i64 0, !dbg !14
+  %24 = fpext bfloat %23 to float, !dbg !15
+  %25 = extractvalue { i32, i32 } %18, 0, !dbg !14
+  %26 = bitcast i32 %25 to <2 x bfloat>, !dbg !14
+  %27 = extractelement <2 x bfloat> %26, i64 1, !dbg !14
+  %28 = fpext bfloat %27 to float, !dbg !15
+  %29 = extractelement <2 x bfloat> %26, i64 0, !dbg !14
+  %30 = fpext bfloat %29 to float, !dbg !15
+  %31 = select i1 %9, float %30, float 0.000000e+00, !dbg !16
+  %32 = select i1 %9, float %28, float 0.000000e+00, !dbg !16
+  %33 = select i1 %9, float %24, float 0.000000e+00, !dbg !16
+  %34 = select i1 %9, float %22, float 0.000000e+00, !dbg !16
+  %35 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !13
+  %36 = getelementptr i8, ptr addrspace(1) %35, i64 4096, !dbg !13
+  %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14
+  %38 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %36, i64 %37, i1 %9) #6, !dbg !14
+  %39 = extractvalue { i32, i32 } %38, 0, !dbg !14
+  %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !14
+  %41 = extractelement <2 x bfloat> %40, i64 0, !dbg !14
+  %42 = fpext bfloat %41 to float, !dbg !15
+  %43 = fsub float %42, %31, !dbg !17
+  %44 = select i1 %9, float 2.000000e+00, float 1.000000e+00, !dbg !22
+  %45 = tail call float @llvm.nvvm.div.full(float %43, float %44), !dbg !23
+  %46 = fadd float %31, %45, !dbg !24
+  %47 = fsub float %42, %46, !dbg !25
+  %48 = fmul float %43, %47, !dbg !26
+  %49 = fadd float %48, 0.000000e+00, !dbg !27
+  %50 = extractelement <2 x bfloat> %40, i64 1, !dbg !14
+  %51 = fpext bfloat %50 to float, !dbg !15
+  %52 = fsub float %51, %32, !dbg !17
+  %53 = tail call float @llvm.nvvm.div.full(float %52, float %44), !dbg !23
+  %54 = fadd float %32, %53, !dbg !24
+  %55 = fsub float %51, %54, !dbg !25
+  %56 = fmul float %52, %55, !dbg !26
+  %57 = fadd float %56, 0.000000e+00, !dbg !27
+  %58 = extractvalue { i32, i32 } %38, 1, !dbg !14
+  %59 = bitcast i32 %58 to <2 x bfloat>, !dbg !14
+  %60 = extractelement <2 x bfloat> %59, i64 0, !dbg !14
+  %61 = fpext bfloat %60 to float, !dbg !15
+  %62 = fsub float %61, %33, !dbg !17
+  %63 = tail call float @llvm.nvvm.div.full(float %62, float %44), !dbg !23
+  %64 = fadd float %33, %63, !dbg !24
+  %65 = fsub float %61, %64, !dbg !25
+  %66 = fmul float %62, %65, !dbg !26
+  %67 = fadd float %66, 0.000000e+00, !dbg !27
+  %68 = extractelement <2 x bfloat> %59, i64 1, !dbg !14
+  %69 = fpext bfloat %68 to float, !dbg !15
+  %70 = fsub float %69, %34, !dbg !17
+  %71 = tail call float @llvm.nvvm.div.full(float %70, float %44), !dbg !23
+  %72 = fadd float %34, %71, !dbg !24
+  %73 = fsub float %69, %72, !dbg !25
+  %74 = fmul float %70, %73, !dbg !26
+  %75 = fadd float %74, 0.000000e+00, !dbg !27
+  %76 = select i1 %9, float %46, float 0.000000e+00, !dbg !16
+  %77 = select i1 %9, float %54, float 0.000000e+00, !dbg !16
+  %78 = select i1 %9, float %64, float 0.000000e+00, !dbg !16
+  %79 = select i1 %9, float %72, float 0.000000e+00, !dbg !16
+  %80 = select i1 %9, float %67, float 0.000000e+00, !dbg !28
+  %81 = select i1 %9, float %75, float 0.000000e+00, !dbg !28
+  %82 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !22
+  %83 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !22
+  %84 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !22
+  %85 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !22
+  %86 = and i32 %10, 511, !dbg !10
+  %87 = and i32 %10, 31, !dbg !10
+  %88 = lshr i32 %86, 5, !dbg !10
+  %89 = fsub float %77, %76, !dbg !29
+  %90 = select i1 %9, float 4.000000e+00, float 0.000000e+00, !dbg !32
+  %91 = fcmp oeq float %90, 0.000000e+00, !dbg !33
+  %92 = tail call float @llvm.nvvm.div.full(float %83, float %90), !dbg !34
+  %93 = select i1 %91, float 0.000000e+00, float %92, !dbg !35
+  %94 = fmul float %89, %93, !dbg !36
+  %95 = fadd float %76, %94, !dbg !37
+  %96 = fadd float %49, %57, !dbg !38
+  %97 = select i1 %9, float %96, float 0.000000e+00, !dbg !38
+  %98 = fmul float %89, %89, !dbg !39
+  %99 = fmul float %98, %82, !dbg !40
+  %100 = fmul float %99, %93, !dbg !41
+  %101 = fadd float %97, %100, !dbg !42
+  %102 = fsub float %78, %95, !dbg !29
+  %103 = select i1 %9, float 6.000000e+00, float 0.000000e+00, !dbg !32
+  %104 = fcmp oeq float %103, 0.000000e+00, !dbg !33
+  %105 = tail call float @llvm.nvvm.div.full(float %84, float %103), !dbg !34
+  %106 = select i1 %104, float 0.000000e+00, float %105, !dbg !35
+  %107 = fmul float %106, %102, !dbg !36
+  %108 = fadd float %95, %107, !dbg !37
+  %109 = fadd float %80, %101, !dbg !38
+  %110 = fmul float %102, %102, !dbg !39
+  %111 = fmul float %90, %110, !dbg !40
+  %112 = fmul float %106, %111, !dbg !41
+  %113 = fadd float %109, %112, !dbg !42
+  %114 = fsub float %79, %108, !dbg !29
+  %115 = select i1 %9, float 8.000000e+00, float 0.000000e+00, !dbg !32
+  %116 = fcmp oeq float %115, 0.000000e+00, !dbg !33
+  %117 = tail call float @llvm.nvvm.div.full(float %85, float %115), !dbg !34
+  %118 = select i1 %116, float 0.000000e+00, float %117, !dbg !35
+  %119 = fmul float %118, %114, !dbg !36
+  %120 = fadd float %108, %119, !dbg !37
+  %121 = fadd float %81, %113, !dbg !38
+  %122 = fmul float %114, %114, !dbg !39
+  %123 = fmul float %103, %122, !dbg !40
+  %124 = fmul float %118, %123, !dbg !41
+  %125 = fadd float %121, %124, !dbg !42
+  %126 = bitcast float %120 to i32, !dbg !30
+  %127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 16, i32 31), !dbg !30
+  %128 = bitcast i32 %127 to float, !dbg !30
+  %129 = bitcast float %125 to i32, !dbg !30
+  %130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 16, i32 31), !dbg !30
+  %131 = bitcast i32 %130 to float, !dbg !30
+  %132 = bitcast float %115 to i32, !dbg !30
+  %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 16, i32 31), !dbg !30
+  %134 = bitcast i32 %133 to float, !dbg !30
+  %135 = fsub float %128, %120, !dbg !29
+  %136 = fadd float %115, %134, !dbg !32
+  %137 = fcmp oeq float %136, 0.000000e+00, !dbg !33
+  %138 = tail call float @llvm.nvvm.div.full(float %134, float %136), !dbg !34
+  %139 = select i1 %137, float 0.000000e+00, float %138, !dbg !35
+  %140 = fmul float %139, %135, !dbg !36
+  %141 = fadd float %120, %140, !dbg !37
+  %142 = fadd float %125, %131, !dbg !38
+  %143 = fmul float %135, %135, !dbg !39
+  %144 = fmul float %115, %143, !dbg !40
+  %145 = fmul float %139, %144, !dbg !41
+  %146 = fadd float %142, %145, !dbg !42
+  %147 = bitcast float %141 to i32, !dbg !30
+  %148 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %147, i32 8, i32 31), !dbg !30
+  %149 = bitcast i32 %148 to float, !dbg !30
+  %150 = bitcast float %146 to i32, !dbg !30
+  %151 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %150, i32 8, i32 31), !dbg !30
+  %152 = bitcast i32 %151 to float, !dbg !30
+  %153 = bitcast float %136 to i32, !dbg !30
+  %154 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 8, i32 31), !dbg !30
+  %155 = bitcast i32 %154 to float, !dbg !30
+  %156 = fsub float %149, %141, !dbg !29
+  %157 = fadd float %136, %155, !dbg !32
+  %158 = fcmp oeq float %157, 0.000000e+00, !dbg !33
+  %159 = tail call float @llvm.nvvm.div.full(float %155, float %157), !dbg !34
+  %160 = select i1 %158, float 0.000000e+00, float %159, !dbg !35
+  %161 = fmul float %156, %160, !dbg !36
+  %162 = fadd float %141, %161, !dbg !37
+  %163 = fadd float %146, %152, !dbg !38
+  %164 = fmul float %156, %156, !dbg !39
+  %165 = fmul float %136, %164, !dbg !40
+  %166 = fmul float %160, %165, !dbg !41
+  %167 = fadd float %163, %166, !dbg !42
+  %168 = bitcast float %162 to i32, !dbg !30
+  %169 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %168, i32 4, i32 31), !dbg !30
+  %170 = bitcast i32 %169 to float, !dbg !30
+  %171 = bitcast float %167 to i32, !dbg !30
+  %172 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %171, i32 4, i32 31), !dbg !30
+  %173 = bitcast i32 %172 to float, !dbg !30
+  %174 = bitcast float %157 to i32, !dbg !30
+  %175 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %174, i32 4, i32 31), !dbg !30
+  %176 = bitcast i32 %175 to float, !dbg !30
+  %177 = fsub float %170, %162, !dbg !29
+  %178 = fadd float %157, %176, !dbg !32
+  %179 = fcmp oeq float %178, 0.000000e+00, !dbg !33
+  %180 = tail call float @llvm.nvvm.div.full(float %176, float %178), !dbg !34
+  %181 = select i1 %179, float 0.000000e+00, float %180, !dbg !35
+  %182 = fmul float %177, %181, !dbg !36
+  %183 = fadd float %162, %182, !dbg !37
+  %184 = fadd float %167, %173, !dbg !38
+  %185 = fmul float %177, %177, !dbg !39
+  %186 = fmul float %157, %185, !dbg !40
+  %187 = fmul float %181, %186, !dbg !41
+  %188 = fadd float %184, %187, !dbg !42
+  %189 = bitcast float %183 to i32, !dbg !30
+  %190 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %189, i32 2, i32 31), !dbg !30
+  %191 = bitcast i32 %190 to float, !dbg !30
+  %192 = bitcast float %188 to i32, !dbg !30
+  %193 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %192, i32 2, i32 31), !dbg !30
+  %194 = bitcast i32 %193 to float, !dbg !30
+  %195 = bitcast float %178 to i32, !dbg !30
+  %196 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %195, i32 2, i32 31), !dbg !30
+  %197 = bitcast i32 %196 to float, !dbg !30
+  %198 = fsub float %191, %183, !dbg !29
+  %199 = fadd float %178, %197, !dbg !32
+  %200 = fcmp oeq float %199, 0.000000e+00, !dbg !33
+  %201 = tail call float @llvm.nvvm.div.full(float %197, float %199), !dbg !34
+  %202 = select i1 %200, float 0.000000e+00, float %201, !dbg !35
+  %203 = fmul float %198, %202, !dbg !36
+  %204 = fadd float %183, %203, !dbg !37
+  %205 = fadd float %188, %194, !dbg !38
+  %206 = fmul float %198, %198, !dbg !39
+  %207 = fmul float %178, %206, !dbg !40
+  %208 = fmul float %202, %207, !dbg !41
+  %209 = fadd float %205, %208, !dbg !42
+  %210 = bitcast float %204 to i32, !dbg !30
+  %211 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %210, i32 1, i32 31), !dbg !30
+  %212 = bitcast i32 %211 to float, !dbg !30
+  %213 = bitcast float %209 to i32, !dbg !30
+  %214 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %213, i32 1, i32 31), !dbg !30
+  %215 = bitcast i32 %214 to float, !dbg !30
+  %216 = bitcast float %199 to i32, !dbg !30
+  %217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 1, i32 31), !dbg !30
+  %218 = bitcast i32 %217 to float, !dbg !30
+  %219 = fsub float %212, %204, !dbg !29
+  %220 = fadd float %199, %218, !dbg !32
+  %221 = fcmp oeq float %220, 0.000000e+00, !dbg !33
+  %222 = tail call float @llvm.nvvm.div.full(float %218, float %220), !dbg !34
+  %223 = select i1 %221, float 0.000000e+00, float %222, !dbg !35
+  %224 = fmul float %219, %223, !dbg !36
+  %225 = fadd float %204, %224, !dbg !37
+  %226 = fadd float %209, %215, !dbg !38
+  %227 = fmul float %219, %219, !dbg !39
+  %228 = fmul float %199, %227, !dbg !40
+  %229 = fmul float %223, %228, !dbg !41
+  %230 = fadd float %226, %229, !dbg !42
+  %231 = icmp eq i32 %87, 0, !dbg !30
+  %232 = getelementptr float, ptr addrspace(3) @global_smem, i32 %88, !dbg !30
+  %233 = bitcast float %225 to <1 x i32>, !dbg !30
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %232, <1 x i32> %233, i1 %231) #6, !dbg !30
+  %234 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %88, !dbg !30
+  %235 = bitcast float %230 to <1 x i32>, !dbg !30
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %234, <1 x i32> %235, i1 %231) #6, !dbg !30
+  %236 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %88, !dbg !30
+  %237 = bitcast float %220 to <1 x i32>, !dbg !30
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %236, <1 x i32> %237, i1 %231) #6, !dbg !30
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !30
+  %238 = icmp samesign ult i32 %86, 16, !dbg !30
+  %239 = getelementptr float, ptr addrspace(3) @global_smem, i32 %86, !dbg !30
+  %240 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %239, i1 %238) #6, !dbg !30
+  %241 = bitcast i32 %240 to float, !dbg !30
+  %242 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %86, !dbg !30
+  %243 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %242, i1 %238) #6, !dbg !30
+  %244 = bitcast i32 %243 to float, !dbg !30
+  %245 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %86, !dbg !30
+  %246 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %245, i1 %238) #6, !dbg !30
+  %247 = bitcast i32 %246 to float, !dbg !30
+  %248 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %240, i32 8, i32 31), !dbg !30
+  %249 = bitcast i32 %248 to float, !dbg !30
+  %250 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %243, i32 8, i32 31), !dbg !30
+  %251 = bitcast i32 %250 to float, !dbg !30
+  %252 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %246, i32 8, i32 31), !dbg !30
+  %253 = bitcast i32 %252 to float, !dbg !30
+  %254 = fsub float %249, %241, !dbg !29
+  %255 = fadd float %247, %253, !dbg !32
+  %256 = fcmp oeq float %255, 0.000000e+00, !dbg !33
+  %257 = tail call float @llvm.nvvm.div.full(float %253, float %255), !dbg !34
+  %258 = select i1 %256, float 0.000000e+00, float %257, !dbg !35
+  %259 = fmul float %254, %258, !dbg !36
+  %260 = fadd float %259, %241, !dbg !37
+  %261 = fadd float %244, %251, !dbg !38
+  %262 = fmul float %254, %254, !dbg !39
+  %263 = fmul float %262, %247, !dbg !40
+  %264 = fmul float %263, %258, !dbg !41
+  %265 = fadd float %261, %264, !dbg !42
+  %266 = bitcast float %260 to i32, !dbg !30
+  %267 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %266, i32 4, i32 31), !dbg !30
+  %268 = bitcast i32 %267 to float, !dbg !30
+  %269 = bitcast float %265 to i32, !dbg !30
+  %270 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %269, i32 4, i32 31), !dbg !30
+  %271 = bitcast i32 %270 to float, !dbg !30
+  %272 = bitcast float %255 to i32, !dbg !30
+  %273 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 4, i32 31), !dbg !30
+  %274 = bitcast i32 %273 to float, !dbg !30
+  %275 = fsub float %268, %260, !dbg !29
+  %276 = fadd float %255, %274, !dbg !32
+  %277 = fcmp oeq float %276, 0.000000e+00, !dbg !33
+  %278 = tail call float @llvm.nvvm.div.full(float %274, float %276), !dbg !34
+  %279 = select i1 %277, float 0.000000e+00, float %278, !dbg !35
+  %280 = fmul float %275, %279, !dbg !36
+  %281 = fadd float %260, %280, !dbg !37
+  %282 = fadd float %265, %271, !dbg !38
+  %283 = fmul float %275, %275, !dbg !39
+  %284 = fmul float %255, %283, !dbg !40
+  %285 = fmul float %279, %284, !dbg !41
+  %286 = fadd float %282, %285, !dbg !42
+  %287 = bitcast float %281 to i32, !dbg !30
+  %288 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %287, i32 2, i32 31), !dbg !30
+  %289 = bitcast i32 %288 to float, !dbg !30
+  %290 = bitcast float %286 to i32, !dbg !30
+  %291 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %290, i32 2, i32 31), !dbg !30
+  %292 = bitcast i32 %291 to float, !dbg !30
+  %293 = bitcast float %276 to i32, !dbg !30
+  %294 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %293, i32 2, i32 31), !dbg !30
+  %295 = bitcast i32 %294 to float, !dbg !30
+  %296 = fsub float %289, %281, !dbg !29
+  %297 = fadd float %276, %295, !dbg !32
+  %298 = fcmp oeq float %297, 0.000000e+00, !dbg !33
+  %299 = tail call float @llvm.nvvm.div.full(float %295, float %297), !dbg !34
+  %300 = select i1 %298, float 0.000000e+00, float %299, !dbg !35
+  %301 = fmul float %296, %300, !dbg !36
+  %302 = fadd float %281, %301, !dbg !37
+  %303 = fadd float %286, %292, !dbg !38
+  %304 = fmul float %296, %296, !dbg !39
+  %305 = fmul float %276, %304, !dbg !40
+  %306 = fmul float %300, %305, !dbg !41
+  %307 = fadd float %303, %306, !dbg !42
+  %308 = bitcast float %302 to i32, !dbg !30
+  %309 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %308, i32 1, i32 31), !dbg !30
+  %310 = bitcast i32 %309 to float, !dbg !30
+  %311 = bitcast float %307 to i32, !dbg !30
+  %312 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %311, i32 1, i32 31), !dbg !30
+  %313 = bitcast i32 %312 to float, !dbg !30
+  %314 = bitcast float %297 to i32, !dbg !30
+  %315 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %314, i32 1, i32 31), !dbg !30
+  %316 = bitcast i32 %315 to float, !dbg !30
+  %317 = fsub float %310, %302, !dbg !29
+  %318 = fadd float %297, %316, !dbg !32
+  %319 = fcmp oeq float %318, 0.000000e+00, !dbg !33
+  %320 = tail call float @llvm.nvvm.div.full(float %316, float %318), !dbg !34
+  %321 = select i1 %319, float 0.000000e+00, float %320, !dbg !35
+  %322 = fmul float %317, %321, !dbg !36
+  %323 = fadd float %302, %322, !dbg !37
+  %324 = fadd float %307, %313, !dbg !38
+  %325 = fmul float %317, %317, !dbg !39
+  %326 = fmul float %297, %325, !dbg !40
+  %327 = fmul float %321, %326, !dbg !41
+  %328 = fadd float %324, %327, !dbg !42
+  %329 = and i32 %10, 15, !dbg !30
+  %330 = icmp eq i32 %329, 0, !dbg !30
+  %331 = and i1 %238, %330, !dbg !30
+  %332 = bitcast float %323 to <1 x i32>, !dbg !30
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %239, <1 x i32> %332, i1 %331) #6, !dbg !30
+  %333 = bitcast float %328 to <1 x i32>, !dbg !30
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %242, <1 x i32> %333, i1 %331) #6, !dbg !30
+  %334 = bitcast float %318 to <1 x i32>, !dbg !30
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %245, <1 x i32> %334, i1 %331) #6, !dbg !30
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !30
+  %335 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !30
+  %336 = load float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !30
+  %337 = tail call float @llvm.nvvm.div.full(float %336, float 4.096000e+03), !dbg !43
+  %338 = fadd float %337, 0x3EB0C6F7A0000000, !dbg !44
+  %339 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45
+  %340 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45
+  %341 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45
+  %342 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45
+  %.not.i15 = icmp eq i32 %342, 0, !dbg !45
+  br i1 %.not.i15, label %345, label %343, !dbg !45
+
+343:                                              ; preds = %__nv_rsqrtf.exit
+  %344 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %338), !dbg !45
+  br label %__nv_rsqrtf.exit17, !dbg !45
+
+345:                                              ; preds = %__nv_rsqrtf.exit
+  %346 = tail call float @llvm.nvvm.rsqrt.approx.f(float %338), !dbg !45
+  br label %__nv_rsqrtf.exit17, !dbg !45
+
+__nv_rsqrtf.exit17:                               ; preds = %343, %345
+  %.0.i16 = phi float [ %344, %343 ], [ %346, %345 ], !dbg !45
+  %347 = zext nneg i32 %12 to i64, !dbg !46
+  %348 = sext i32 %13 to i64, !dbg !46
+  %349 = getelementptr bfloat, ptr addrspace(1) %1, i64 %347, !dbg !47
+  %350 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !48
+  %351 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %349, i64 %350, i1 true) #6, !dbg !48
+  %352 = extractvalue { i32, i32 } %351, 0, !dbg !48
+  %353 = bitcast i32 %352 to <2 x bfloat>, !dbg !48
+  %354 = extractvalue { i32, i32 } %351, 1, !dbg !48
+  %355 = bitcast i32 %354 to <2 x bfloat>, !dbg !48
+  %356 = or disjoint i64 %347, %348, !dbg !49
+  %357 = getelementptr bfloat, ptr addrspace(1) %0, i64 %356, !dbg !50
+  %358 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !51
+  %359 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %357, i64 %358, i1 %9) #6, !dbg !51
+  %360 = extractvalue { i32, i32 } %359, 0, !dbg !51
+  %361 = bitcast i32 %360 to <2 x bfloat>, !dbg !51
+  %362 = extractvalue { i32, i32 } %359, 1, !dbg !51
+  %363 = bitcast i32 %362 to <2 x bfloat>, !dbg !51
+  %364 = getelementptr bfloat, ptr addrspace(1) %2, i64 %347, !dbg !52
+  %365 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !53
+  %366 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %364, i64 %365, i1 true) #6, !dbg !53
+  %367 = extractvalue { i32, i32 } %366, 0, !dbg !53
+  %368 = bitcast i32 %367 to <2 x bfloat>, !dbg !53
+  %369 = extractvalue { i32, i32 } %366, 1, !dbg !53
+  %370 = bitcast i32 %369 to <2 x bfloat>, !dbg !53
+  %371 = getelementptr bfloat, ptr addrspace(1) %3, i64 %356, !dbg !54
+  %372 = fpext <2 x bfloat> %353 to <2 x float>, !dbg !55
+  %373 = fpext <2 x bfloat> %361 to <2 x float>, !dbg !56
+  %374 = fpext <2 x bfloat> %368 to <2 x float>, !dbg !57
+  %375 = fadd <2 x float> %372, splat (float 1.000000e+00), !dbg !58
+  %376 = insertelement <2 x float> poison, float %335, i64 0, !dbg !59
+  %377 = shufflevector <2 x float> %376, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !59
+  %378 = fsub <2 x float> %373, %377, !dbg !59
+  %379 = insertelement <2 x float> poison, float %.0.i16, i64 0, !dbg !60
+  %380 = shufflevector <2 x float> %379, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !60
+  %381 = fmul <2 x float> %380, %378, !dbg !60
+  %382 = fmul <2 x float> %375, %381, !dbg !61
+  %383 = fadd <2 x float> %382, %374, !dbg !62
+  %384 = fptrunc <2 x float> %383 to <2 x bfloat>, !dbg !63
+  %385 = fpext <2 x bfloat> %355 to <2 x float>, !dbg !55
+  %386 = fpext <2 x bfloat> %363 to <2 x float>, !dbg !56
+  %387 = fpext <2 x bfloat> %370 to <2 x float>, !dbg !57
+  %388 = fadd <2 x float> %385, splat (float 1.000000e+00), !dbg !58
+  %389 = fsub <2 x float> %386, %377, !dbg !59
+  %390 = fmul <2 x float> %380, %389, !dbg !60
+  %391 = fmul <2 x float> %388, %390, !dbg !61
+  %392 = fadd <2 x float> %391, %387, !dbg !62
+  %393 = fptrunc <2 x float> %392 to <2 x bfloat>, !dbg !63
+  %394 = bitcast <2 x bfloat> %384 to i32, !dbg !63
+  %395 = bitcast <2 x bfloat> %393 to i32, !dbg !63
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %394, i32 %395, ptr addrspace(1) %371, i1 %9) #6, !dbg !63
+  %396 = or disjoint i64 %347, 2048, !dbg !64
+  %397 = getelementptr bfloat, ptr addrspace(1) %1, i64 %396, !dbg !47
+  %398 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !48
+  %399 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %397, i64 %398, i1 true) #6, !dbg !48
+  %400 = extractvalue { i32, i32 } %399, 0, !dbg !48
+  %401 = bitcast i32 %400 to <2 x bfloat>, !dbg !48
+  %402 = extractvalue { i32, i32 } %399, 1, !dbg !48
+  %403 = bitcast i32 %402 to <2 x bfloat>, !dbg !48
+  %404 = or disjoint i64 %396, %348, !dbg !49
+  %405 = getelementptr bfloat, ptr addrspace(1) %0, i64 %404, !dbg !50
+  %406 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !51
+  %407 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %405, i64 %406, i1 %9) #6, !dbg !51
+  %408 = extractvalue { i32, i32 } %407, 0, !dbg !51
+  %409 = bitcast i32 %408 to <2 x bfloat>, !dbg !51
+  %410 = extractvalue { i32, i32 } %407, 1, !dbg !51
+  %411 = bitcast i32 %410 to <2 x bfloat>, !dbg !51
+  %412 = getelementptr bfloat, ptr addrspace(1) %2, i64 %396, !dbg !52
+  %413 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !53
+  %414 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %412, i64 %413, i1 true) #6, !dbg !53
+  %415 = extractvalue { i32, i32 } %414, 0, !dbg !53
+  %416 = bitcast i32 %415 to <2 x bfloat>, !dbg !53
+  %417 = extractvalue { i32, i32 } %414, 1, !dbg !53
+  %418 = bitcast i32 %417 to <2 x bfloat>, !dbg !53
+  %419 = getelementptr bfloat, ptr addrspace(1) %3, i64 %404, !dbg !54
+  %420 = fpext <2 x bfloat> %401 to <2 x float>, !dbg !55
+  %421 = fpext <2 x bfloat> %409 to <2 x float>, !dbg !56
+  %422 = fpext <2 x bfloat> %416 to <2 x float>, !dbg !57
+  %423 = fadd <2 x float> %420, splat (float 1.000000e+00), !dbg !58
+  %424 = fsub <2 x float> %421, %377, !dbg !59
+  %425 = fmul <2 x float> %380, %424, !dbg !60
+  %426 = fmul <2 x float> %423, %425, !dbg !61
+  %427 = fadd <2 x float> %426, %422, !dbg !62
+  %428 = fptrunc <2 x float> %427 to <2 x bfloat>, !dbg !63
+  %429 = fpext <2 x bfloat> %403 to <2 x float>, !dbg !55
+  %430 = fpext <2 x bfloat> %411 to <2 x float>, !dbg !56
+  %431 = fpext <2 x bfloat> %418 to <2 x float>, !dbg !57
+  %432 = fadd <2 x float> %429, splat (float 1.000000e+00), !dbg !58
+  %433 = fsub <2 x float> %430, %377, !dbg !59
+  %434 = fmul <2 x float> %380, %433, !dbg !60
+  %435 = fmul <2 x float> %432, %434, !dbg !61
+  %436 = fadd <2 x float> %435, %431, !dbg !62
+  %437 = fptrunc <2 x float> %436 to <2 x bfloat>, !dbg !63
+  %438 = bitcast <2 x bfloat> %428 to i32, !dbg !63
+  %439 = bitcast <2 x bfloat> %437 to i32, !dbg !63
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %438, i32 %439, ptr addrspace(1) %419, i1 %9) #6, !dbg !63
+  ret void, !dbg !65
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #2
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #2
+
+attributes #0 = { nounwind "nvvm.reqntid"="512" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #4 = { convergent nocallback nounwind }
+attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!5 = distinct !DISubprogram(name: "triton_red_fused_add_mul_native_layer_norm_0", linkageName: "triton_red_fused_add_mul_native_layer_norm_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 23, column: 28, scope: !5)
+!9 = !DILocation(line: 25, column: 21, scope: !5)
+!10 = !DILocation(line: 26, column: 37, scope: !5)
+!11 = !DILocation(line: 38, column: 46, scope: !5)
+!12 = !DILocation(line: 32, column: 43, scope: !5)
+!13 = !DILocation(line: 38, column: 34, scope: !5)
+!14 = !DILocation(line: 38, column: 51, scope: !5)
+!15 = !DILocation(line: 38, column: 112, scope: !5)
+!16 = !DILocation(line: 44, column: 62, scope: !5)
+!17 = !DILocation(line: 222, column: 24, scope: !18, inlinedAt: !20)
+!18 = distinct !DILexicalBlockFile(scope: !5, file: !19, discriminator: 0)
+!19 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime")
+!20 = !DILocation(line: 42, column: 51, scope: !21)
+!21 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0)
+!22 = !DILocation(line: 46, column: 66, scope: !5)
+!23 = !DILocation(line: 224, column: 34, scope: !18, inlinedAt: !20)
+!24 = !DILocation(line: 224, column: 26, scope: !18, inlinedAt: !20)
+!25 = !DILocation(line: 225, column: 39, scope: !18, inlinedAt: !20)
+!26 = !DILocation(line: 225, column: 31, scope: !18, inlinedAt: !20)
+!27 = !DILocation(line: 225, column: 22, scope: !18, inlinedAt: !20)
+!28 = !DILocation(line: 45, column: 58, scope: !5)
+!29 = !DILocation(line: 231, column: 21, scope: !18, inlinedAt: !30)
+!30 = !DILocation(line: 243, column: 46, scope: !18, inlinedAt: !31)
+!31 = !DILocation(line: 47, column: 79, scope: !21)
+!32 = !DILocation(line: 232, column: 28, scope: !18, inlinedAt: !30)
+!33 = !DILocation(line: 233, column: 39, scope: !18, inlinedAt: !30)
+!34 = !DILocation(line: 233, column: 60, scope: !18, inlinedAt: !30)
+!35 = !DILocation(line: 233, column: 49, scope: !18, inlinedAt: !30)
+!36 = !DILocation(line: 235, column: 25, scope: !18, inlinedAt: !30)
+!37 = !DILocation(line: 235, column: 17, scope: !18, inlinedAt: !30)
+!38 = !DILocation(line: 236, column: 15, scope: !18, inlinedAt: !30)
+!39 = !DILocation(line: 236, column: 30, scope: !18, inlinedAt: !30)
+!40 = !DILocation(line: 236, column: 38, scope: !18, inlinedAt: !30)
+!41 = !DILocation(line: 236, column: 49, scope: !18, inlinedAt: !30)
+!42 = !DILocation(line: 236, column: 22, scope: !18, inlinedAt: !30)
+!43 = !DILocation(line: 65, column: 24, scope: !5)
+!44 = !DILocation(line: 67, column: 24, scope: !5)
+!45 = !DILocation(line: 68, column: 32, scope: !5)
+!46 = !DILocation(line: 51, column: 43, scope: !5)
+!47 = !DILocation(line: 57, column: 34, scope: !5)
+!48 = !DILocation(line: 57, column: 41, scope: !5)
+!49 = !DILocation(line: 58, column: 42, scope: !5)
+!50 = !DILocation(line: 58, column: 35, scope: !5)
+!51 = !DILocation(line: 58, column: 52, scope: !5)
+!52 = !DILocation(line: 59, column: 35, scope: !5)
+!53 = !DILocation(line: 59, column: 42, scope: !5)
+!54 = !DILocation(line: 73, column: 29, scope: !5)
+!55 = !DILocation(line: 57, column: 94, scope: !5)
+!56 = !DILocation(line: 58, column: 114, scope: !5)
+!57 = !DILocation(line: 59, column: 95, scope: !5)
+!58 = !DILocation(line: 61, column: 23, scope: !5)
+!59 = !DILocation(line: 63, column: 24, scope: !5)
+!60 = !DILocation(line: 69, column: 24, scope: !5)
+!61 = !DILocation(line: 71, column: 24, scope: !5)
+!62 = !DILocation(line: 72, column: 24, scope: !5)
+!63 = !DILocation(line: 73, column: 53, scope: !5)
+!64 = !DILocation(line: 52, column: 31, scope: !5)
+!65 = !DILocation(line: 51, column: 4, scope: !5)
diff --git a/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.ptx b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..882724f06d4da7b0ccd0a61943787cb11db32f98
--- /dev/null
+++ b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.ptx
@@ -0,0 +1,1089 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused_add_mul_native_layer_norm_0 // -- Begin function triton_red_fused_add_mul_native_layer_norm_0
+.extern .shared .align 16 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
+                                        // @triton_red_fused_add_mul_native_layer_norm_0
+.visible .entry triton_red_fused_add_mul_native_layer_norm_0(
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_1,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_3,
+	.param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_4,
+	.param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_5,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_6,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_7
+)
+.reqntid 512
+{
+	.reg .pred 	%p<19>;
+	.reg .b16 	%rs<33>;
+	.reg .b32 	%r<282>;
+	.reg .b64 	%rd<28>;
+	.loc	1 18 0                          // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:18:0
+
+// %bb.0:                               // %__nv_rsqrtf.exit
+	ld.param.b64 	%rd19, [triton_red_fused_add_mul_native_layer_norm_0_param_0];
+	ld.param.b64 	%rd20, [triton_red_fused_add_mul_native_layer_norm_0_param_1];
+$L__tmp0:
+	.loc	1 23 28                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:23:28
+	mov.u32 	%r37, %ctaid.x;
+	.loc	1 25 21                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:25:21
+	setp.lt.u32 	%p1, %r37, 2048;
+	ld.param.b64 	%rd21, [triton_red_fused_add_mul_native_layer_norm_0_param_2];
+	ld.param.b64 	%rd22, [triton_red_fused_add_mul_native_layer_norm_0_param_3];
+	.loc	1 26 37                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:26:37
+	mov.u32 	%r38, %tid.x;
+	shl.b32 	%r39, %r38, 2;
+	and.b32 	%r40, %r39, 2044;
+	.loc	1 38 46                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:46
+	shl.b32 	%r41, %r37, 12;
+	or.b32 	%r42, %r40, %r41;
+	.loc	1 38 34                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:34
+	mad.wide.s32 	%rd1, %r42, 2, %rd19;
+	.loc	1 38 51                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:51
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r3, 0;
+	// begin inline asm
+	mov.u32 %r1, %r3;
+	mov.u32 %r2, %r3;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	mov.b32 	{%rs1, %rs2}, %r2;
+	.loc	1 38 112                        // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:112
+	cvt.f32.bf16 	%r43, %rs2;
+	cvt.f32.bf16 	%r44, %rs1;
+	.loc	1 38 51                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:51
+	mov.b32 	{%rs3, %rs4}, %r1;
+	.loc	1 38 112                        // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:112
+	cvt.f32.bf16 	%r45, %rs4;
+	cvt.f32.bf16 	%r46, %rs3;
+	.loc	1 44 62                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:44:62
+	selp.f32 	%r47, %r46, 0f00000000, %p1;
+	selp.f32 	%r48, %r45, 0f00000000, %p1;
+	selp.f32 	%r49, %r44, 0f00000000, %p1;
+	selp.f32 	%r50, %r43, 0f00000000, %p1;
+	.loc	1 38 34                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:34
+	add.s64 	%rd3, %rd1, 4096;
+	.loc	1 38 51                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:51
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r4, %r3;
+	mov.u32 %r5, %r3;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r4, %r5 }, [ %rd3 + 0 ], %rd4;
+	// end inline asm
+	mov.b32 	{%rs5, %rs6}, %r4;
+	.loc	1 38 112                        // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:112
+	cvt.f32.bf16 	%r51, %rs5;
+$L__tmp1:
+	.loc	2 222 24                        // triton_helpers.py:222:24 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ]
+	sub.f32 	%r52, %r51, %r47;
+$L__tmp2:
+	.loc	1 46 66                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:46:66
+	selp.f32 	%r53, 0f40000000, 0f3F800000, %p1;
+$L__tmp3:
+	.loc	2 224 34                        // triton_helpers.py:224:34 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ]
+	div.full.f32 	%r54, %r52, %r53;
+	.loc	2 224 26                        // triton_helpers.py:224:26 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ]
+	add.f32 	%r55, %r47, %r54;
+	.loc	2 225 39                        // triton_helpers.py:225:39 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ]
+	sub.f32 	%r56, %r51, %r55;
+	.loc	2 225 22                        // triton_helpers.py:225:22 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ]
+	fma.rn.f32 	%r57, %r52, %r56, 0f00000000;
+$L__tmp4:
+	.loc	1 38 112                        // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:112
+	cvt.f32.bf16 	%r58, %rs6;
+$L__tmp5:
+	.loc	2 222 24                        // triton_helpers.py:222:24 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ]
+	sub.f32 	%r59, %r58, %r48;
+	.loc	2 224 34                        // triton_helpers.py:224:34 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ]
+	div.full.f32 	%r60, %r59, %r53;
+	.loc	2 224 26                        // triton_helpers.py:224:26 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ]
+	add.f32 	%r61, %r48, %r60;
+	.loc	2 225 39                        // triton_helpers.py:225:39 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ]
+	sub.f32 	%r62, %r58, %r61;
+	.loc	2 225 22                        // triton_helpers.py:225:22 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ]
+	fma.rn.f32 	%r63, %r59, %r62, 0f00000000;
+$L__tmp6:
+	.loc	1 38 51                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:51
+	mov.b32 	{%rs7, %rs8}, %r5;
+	.loc	1 38 112                        // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:112
+	cvt.f32.bf16 	%r64, %rs7;
+$L__tmp7:
+	.loc	2 222 24                        // triton_helpers.py:222:24 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ]
+	sub.f32 	%r65, %r64, %r49;
+	.loc	2 224 34                        // triton_helpers.py:224:34 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ]
+	div.full.f32 	%r66, %r65, %r53;
+	.loc	2 224 26                        // triton_helpers.py:224:26 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ]
+	add.f32 	%r67, %r49, %r66;
+	.loc	2 225 39                        // triton_helpers.py:225:39 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ]
+	sub.f32 	%r68, %r64, %r67;
+	.loc	2 225 22                        // triton_helpers.py:225:22 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ]
+	fma.rn.f32 	%r69, %r65, %r68, 0f00000000;
+$L__tmp8:
+	.loc	1 38 112                        // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:112
+	cvt.f32.bf16 	%r70, %rs8;
+$L__tmp9:
+	.loc	2 222 24                        // triton_helpers.py:222:24 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ]
+	sub.f32 	%r71, %r70, %r50;
+	.loc	2 224 34                        // triton_helpers.py:224:34 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ]
+	div.full.f32 	%r72, %r71, %r53;
+	.loc	2 224 26                        // triton_helpers.py:224:26 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ]
+	add.f32 	%r73, %r50, %r72;
+	.loc	2 225 39                        // triton_helpers.py:225:39 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ]
+	sub.f32 	%r74, %r70, %r73;
+	.loc	2 225 22                        // triton_helpers.py:225:22 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:42:51 ]
+	fma.rn.f32 	%r75, %r71, %r74, 0f00000000;
+$L__tmp10:
+	.loc	1 44 62                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:44:62
+	selp.f32 	%r76, %r55, 0f00000000, %p1;
+	selp.f32 	%r77, %r61, 0f00000000, %p1;
+	selp.f32 	%r78, %r67, 0f00000000, %p1;
+	selp.f32 	%r79, %r73, 0f00000000, %p1;
+	.loc	1 45 58                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:45:58
+	selp.f32 	%r80, %r69, 0f00000000, %p1;
+	selp.f32 	%r81, %r75, 0f00000000, %p1;
+	.loc	1 46 66                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:46:66
+	selp.f32 	%r82, 0f40000000, 0f00000000, %p1;
+	.loc	1 26 37                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:26:37
+	and.b32 	%r83, %r38, 511;
+	and.b32 	%r84, %r38, 31;
+$L__tmp11:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r85, %r77, %r76;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r86, 0f40800000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p6, %r86, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r87, %r82, %r86;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r88, 0f00000000, %r87, %p6;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r89, %r85, %r88, %r76;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r90, %r57, %r63;
+	selp.f32 	%r91, %r90, 0f00000000, %p1;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r92, %r85, %r85;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r93, %r92, %r82;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r94, %r93, %r88, %r91;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r95, %r78, %r89;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r96, 0f40C00000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p7, %r96, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r97, %r82, %r96;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r98, 0f00000000, %r97, %p7;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r99, %r98, %r95, %r89;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r100, %r80, %r94;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r101, %r95, %r95;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r102, %r86, %r101;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r103, %r98, %r102, %r100;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r104, %r79, %r99;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r105, 0f41000000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p8, %r105, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r106, %r82, %r105;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r107, 0f00000000, %r106, %p8;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r108, %r107, %r104, %r99;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r109, %r81, %r103;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r110, %r104, %r104;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r111, %r96, %r110;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r112, %r107, %r111, %r109;
+$L__tmp12:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ]
+	shfl.sync.bfly.b32 	%r113, %r108, 16, 31, -1;
+	shfl.sync.bfly.b32 	%r114, %r112, 16, 31, -1;
+	shfl.sync.bfly.b32 	%r115, %r105, 16, 31, -1;
+$L__tmp13:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r116, %r113, %r108;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r117, %r105, %r115;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p9, %r117, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r118, %r115, %r117;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r119, 0f00000000, %r118, %p9;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r120, %r119, %r116, %r108;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r121, %r112, %r114;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r122, %r116, %r116;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r123, %r105, %r122;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r124, %r119, %r123, %r121;
+$L__tmp14:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ]
+	shfl.sync.bfly.b32 	%r125, %r120, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r126, %r124, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r127, %r117, 8, 31, -1;
+$L__tmp15:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r128, %r125, %r120;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r129, %r117, %r127;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p10, %r129, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r130, %r127, %r129;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r131, 0f00000000, %r130, %p10;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r132, %r128, %r131, %r120;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r133, %r124, %r126;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r134, %r128, %r128;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r135, %r117, %r134;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r136, %r131, %r135, %r133;
+$L__tmp16:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ]
+	shfl.sync.bfly.b32 	%r137, %r132, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r138, %r136, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r139, %r129, 4, 31, -1;
+$L__tmp17:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r140, %r137, %r132;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r141, %r129, %r139;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p11, %r141, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r142, %r139, %r141;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r143, 0f00000000, %r142, %p11;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r144, %r140, %r143, %r132;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r145, %r136, %r138;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r146, %r140, %r140;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r147, %r129, %r146;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r148, %r143, %r147, %r145;
+$L__tmp18:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ]
+	shfl.sync.bfly.b32 	%r149, %r144, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r150, %r148, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r151, %r141, 2, 31, -1;
+$L__tmp19:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r152, %r149, %r144;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r153, %r141, %r151;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p12, %r153, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r154, %r151, %r153;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r155, 0f00000000, %r154, %p12;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r156, %r152, %r155, %r144;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r157, %r148, %r150;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r158, %r152, %r152;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r159, %r141, %r158;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r160, %r155, %r159, %r157;
+$L__tmp20:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ]
+	shfl.sync.bfly.b32 	%r161, %r156, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r162, %r160, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r163, %r153, 1, 31, -1;
+$L__tmp21:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r164, %r161, %r156;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r11, %r153, %r163;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p13, %r11, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r165, %r163, %r11;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r166, 0f00000000, %r165, %p13;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r7, %r164, %r166, %r156;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r167, %r160, %r162;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r168, %r164, %r164;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r169, %r153, %r168;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r9, %r166, %r169, %r167;
+$L__tmp22:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ]
+	setp.eq.b32 	%p2, %r84, 0;
+	shr.u32 	%r170, %r38, 3;
+	and.b32 	%r171, %r170, 60;
+	mov.b32 	%r172, global_smem;
+	add.s32 	%r6, %r172, %r171;
+	// begin inline asm
+	@%p2 st.shared.b32 [ %r6 + 0 ], %r7;
+	// end inline asm
+	add.s32 	%r8, %r6, 64;
+	// begin inline asm
+	@%p2 st.shared.b32 [ %r8 + 0 ], %r9;
+	// end inline asm
+	add.s32 	%r10, %r6, 128;
+	// begin inline asm
+	@%p2 st.shared.b32 [ %r10 + 0 ], %r11;
+	// end inline asm
+	bar.sync 	0;
+	setp.lt.u32 	%p3, %r83, 16;
+	shl.b32 	%r173, %r83, 2;
+	add.s32 	%r13, %r172, %r173;
+	// begin inline asm
+	@%p3 ld.shared.b32 %r12, [ %r13 + 0 ];
+	// end inline asm
+	add.s32 	%r15, %r13, 64;
+	// begin inline asm
+	@%p3 ld.shared.b32 %r14, [ %r15 + 0 ];
+	// end inline asm
+	add.s32 	%r17, %r13, 128;
+	// begin inline asm
+	@%p3 ld.shared.b32 %r16, [ %r17 + 0 ];
+	// end inline asm
+	shfl.sync.bfly.b32 	%r174, %r12, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r175, %r14, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r176, %r16, 8, 31, -1;
+$L__tmp23:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r177, %r174, %r12;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r178, %r16, %r176;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p14, %r178, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r179, %r176, %r178;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r180, 0f00000000, %r179, %p14;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r181, %r177, %r180, %r12;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r182, %r14, %r175;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r183, %r177, %r177;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r184, %r183, %r16;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r185, %r184, %r180, %r182;
+$L__tmp24:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ]
+	shfl.sync.bfly.b32 	%r186, %r181, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r187, %r185, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r188, %r178, 4, 31, -1;
+$L__tmp25:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r189, %r186, %r181;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r190, %r178, %r188;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p15, %r190, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r191, %r188, %r190;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r192, 0f00000000, %r191, %p15;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r193, %r189, %r192, %r181;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r194, %r185, %r187;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r195, %r189, %r189;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r196, %r178, %r195;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r197, %r192, %r196, %r194;
+$L__tmp26:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ]
+	shfl.sync.bfly.b32 	%r198, %r193, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r199, %r197, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r200, %r190, 2, 31, -1;
+$L__tmp27:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r201, %r198, %r193;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r202, %r190, %r200;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p16, %r202, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r203, %r200, %r202;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r204, 0f00000000, %r203, %p16;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r205, %r201, %r204, %r193;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r206, %r197, %r199;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r207, %r201, %r201;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r208, %r190, %r207;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r209, %r204, %r208, %r206;
+$L__tmp28:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ]
+	shfl.sync.bfly.b32 	%r210, %r205, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r211, %r209, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r212, %r202, 1, 31, -1;
+$L__tmp29:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r213, %r210, %r205;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r20, %r202, %r212;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p17, %r20, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r214, %r212, %r20;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r215, 0f00000000, %r214, %p17;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r18, %r213, %r215, %r205;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r216, %r209, %r211;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r217, %r213, %r213;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r218, %r202, %r217;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r19, %r215, %r218, %r216;
+$L__tmp30:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ]
+	and.b32 	%r219, %r38, 15;
+	setp.eq.b32 	%p18, %r219, 0;
+	and.pred 	%p4, %p3, %p18;
+	// begin inline asm
+	@%p4 st.shared.b32 [ %r13 + 0 ], %r18;
+	// end inline asm
+	// begin inline asm
+	@%p4 st.shared.b32 [ %r15 + 0 ], %r19;
+	// end inline asm
+	// begin inline asm
+	@%p4 st.shared.b32 [ %r17 + 0 ], %r20;
+	// end inline asm
+	bar.sync 	0;
+	ld.shared.b32 	%r220, [global_smem];
+	ld.shared.b32 	%r221, [global_smem+64];
+	mov.b32 	%r222, 0f45800000;
+$L__tmp31:
+	.loc	1 65 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:65:24
+	div.full.f32 	%r223, %r221, %r222;
+	.loc	1 67 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:67:24
+	add.f32 	%r224, %r223, 0f358637BD;
+	.loc	1 68 32                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:68:32
+	rsqrt.approx.ftz.f32 	%r225, %r224;
+	.loc	1 51 43                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:51:43
+	cvt.u64.u32 	%rd23, %r40;
+	cvt.s64.s32 	%rd24, %r41;
+	.loc	1 57 34                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:34
+	mul.wide.u32 	%rd25, %r40, 2;
+	add.s64 	%rd5, %rd20, %rd25;
+	.loc	1 57 41                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:41
+	// begin inline asm
+	mov.u64 %rd6, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0;
+	// end inline asm
+	mov.pred 	%p5, -1;
+	// begin inline asm
+	mov.u32 %r21, %r3;
+	mov.u32 %r22, %r3;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r21, %r22 }, [ %rd5 + 0 ], %rd6;
+	// end inline asm
+	.loc	1 58 42                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:42
+	or.b64 	%rd26, %rd23, %rd24;
+	.loc	1 58 35                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:35
+	shl.b64 	%rd27, %rd26, 1;
+	add.s64 	%rd7, %rd19, %rd27;
+	.loc	1 58 52                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:52
+	// begin inline asm
+	mov.u64 %rd8, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd8, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r23, %r3;
+	mov.u32 %r24, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r23, %r24 }, [ %rd7 + 0 ], %rd8;
+	// end inline asm
+	.loc	1 59 35                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:35
+	add.s64 	%rd9, %rd21, %rd25;
+	.loc	1 59 42                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:42
+	// begin inline asm
+	mov.u64 %rd10, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd10, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r25, %r3;
+	mov.u32 %r26, %r3;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r25, %r26 }, [ %rd9 + 0 ], %rd10;
+	// end inline asm
+	.loc	1 73 29                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:73:29
+	add.s64 	%rd11, %rd22, %rd27;
+	.loc	1 57 94                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:94
+	mov.b32 	{%rs9, %rs10}, %r21;
+	cvt.f32.bf16 	%r226, %rs9;
+	cvt.f32.bf16 	%r227, %rs10;
+	.loc	1 58 114                        // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:114
+	mov.b32 	{%rs11, %rs12}, %r23;
+	cvt.f32.bf16 	%r228, %rs12;
+	cvt.f32.bf16 	%r229, %rs11;
+	.loc	1 59 95                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:95
+	mov.b32 	{%rs13, %rs14}, %r25;
+	cvt.f32.bf16 	%r230, %rs14;
+	cvt.f32.bf16 	%r231, %rs13;
+	.loc	1 61 23                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:61:23
+	add.f32 	%r232, %r227, 0f3F800000;
+	add.f32 	%r233, %r226, 0f3F800000;
+	.loc	1 63 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:63:24
+	sub.f32 	%r234, %r229, %r220;
+	sub.f32 	%r235, %r228, %r220;
+	.loc	1 69 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:69:24
+	mul.f32 	%r236, %r225, %r235;
+	mul.f32 	%r237, %r225, %r234;
+	.loc	1 72 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:72:24
+	fma.rn.f32 	%r238, %r233, %r237, %r231;
+	fma.rn.f32 	%r239, %r232, %r236, %r230;
+	.loc	1 73 53                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:73:53
+	cvt.rn.bf16x2.f32 	%r27, %r239, %r238;
+	.loc	1 57 94                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:94
+	mov.b32 	{%rs15, %rs16}, %r22;
+	cvt.f32.bf16 	%r240, %rs15;
+	cvt.f32.bf16 	%r241, %rs16;
+	.loc	1 58 114                        // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:114
+	mov.b32 	{%rs17, %rs18}, %r24;
+	cvt.f32.bf16 	%r242, %rs18;
+	cvt.f32.bf16 	%r243, %rs17;
+	.loc	1 59 95                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:95
+	mov.b32 	{%rs19, %rs20}, %r26;
+	cvt.f32.bf16 	%r244, %rs20;
+	cvt.f32.bf16 	%r245, %rs19;
+	.loc	1 61 23                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:61:23
+	add.f32 	%r246, %r241, 0f3F800000;
+	add.f32 	%r247, %r240, 0f3F800000;
+	.loc	1 63 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:63:24
+	sub.f32 	%r248, %r243, %r220;
+	sub.f32 	%r249, %r242, %r220;
+	.loc	1 69 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:69:24
+	mul.f32 	%r250, %r225, %r249;
+	mul.f32 	%r251, %r225, %r248;
+	.loc	1 72 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:72:24
+	fma.rn.f32 	%r252, %r247, %r251, %r245;
+	fma.rn.f32 	%r253, %r246, %r250, %r244;
+	.loc	1 73 53                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:73:53
+	cvt.rn.bf16x2.f32 	%r28, %r253, %r252;
+	// begin inline asm
+	@%p1 st.global.v2.b32 [ %rd11 + 0 ], { %r27, %r28 };
+	// end inline asm
+	.loc	1 57 34                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:34
+	add.s64 	%rd12, %rd5, 4096;
+	.loc	1 57 41                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:41
+	// begin inline asm
+	mov.u64 %rd13, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd13, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r29, %r3;
+	mov.u32 %r30, %r3;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r29, %r30 }, [ %rd12 + 0 ], %rd13;
+	// end inline asm
+	.loc	1 58 35                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:35
+	add.s64 	%rd14, %rd7, 4096;
+	.loc	1 58 52                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:52
+	// begin inline asm
+	mov.u64 %rd15, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd15, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r31, %r3;
+	mov.u32 %r32, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r31, %r32 }, [ %rd14 + 0 ], %rd15;
+	// end inline asm
+	.loc	1 59 35                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:35
+	add.s64 	%rd16, %rd9, 4096;
+	.loc	1 59 42                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:42
+	// begin inline asm
+	mov.u64 %rd17, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r33, %r3;
+	mov.u32 %r34, %r3;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r33, %r34 }, [ %rd16 + 0 ], %rd17;
+	// end inline asm
+	.loc	1 73 29                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:73:29
+	add.s64 	%rd18, %rd11, 4096;
+	.loc	1 57 94                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:94
+	mov.b32 	{%rs21, %rs22}, %r29;
+	cvt.f32.bf16 	%r254, %rs21;
+	cvt.f32.bf16 	%r255, %rs22;
+	.loc	1 58 114                        // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:114
+	mov.b32 	{%rs23, %rs24}, %r31;
+	cvt.f32.bf16 	%r256, %rs24;
+	cvt.f32.bf16 	%r257, %rs23;
+	.loc	1 59 95                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:95
+	mov.b32 	{%rs25, %rs26}, %r33;
+	cvt.f32.bf16 	%r258, %rs26;
+	cvt.f32.bf16 	%r259, %rs25;
+	.loc	1 61 23                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:61:23
+	add.f32 	%r260, %r255, 0f3F800000;
+	add.f32 	%r261, %r254, 0f3F800000;
+	.loc	1 63 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:63:24
+	sub.f32 	%r262, %r257, %r220;
+	sub.f32 	%r263, %r256, %r220;
+	.loc	1 69 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:69:24
+	mul.f32 	%r264, %r225, %r263;
+	mul.f32 	%r265, %r225, %r262;
+	.loc	1 72 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:72:24
+	fma.rn.f32 	%r266, %r261, %r265, %r259;
+	fma.rn.f32 	%r267, %r260, %r264, %r258;
+	.loc	1 73 53                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:73:53
+	cvt.rn.bf16x2.f32 	%r35, %r267, %r266;
+	.loc	1 57 94                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:94
+	mov.b32 	{%rs27, %rs28}, %r30;
+	cvt.f32.bf16 	%r268, %rs27;
+	cvt.f32.bf16 	%r269, %rs28;
+	.loc	1 58 114                        // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:114
+	mov.b32 	{%rs29, %rs30}, %r32;
+	cvt.f32.bf16 	%r270, %rs30;
+	cvt.f32.bf16 	%r271, %rs29;
+	.loc	1 59 95                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:95
+	mov.b32 	{%rs31, %rs32}, %r34;
+	cvt.f32.bf16 	%r272, %rs32;
+	cvt.f32.bf16 	%r273, %rs31;
+	.loc	1 61 23                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:61:23
+	add.f32 	%r274, %r269, 0f3F800000;
+	add.f32 	%r275, %r268, 0f3F800000;
+	.loc	1 63 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:63:24
+	sub.f32 	%r276, %r271, %r220;
+	sub.f32 	%r277, %r270, %r220;
+	.loc	1 69 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:69:24
+	mul.f32 	%r278, %r225, %r277;
+	mul.f32 	%r279, %r225, %r276;
+	.loc	1 72 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:72:24
+	fma.rn.f32 	%r280, %r275, %r279, %r273;
+	fma.rn.f32 	%r281, %r274, %r278, %r272;
+	.loc	1 73 53                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:73:53
+	cvt.rn.bf16x2.f32 	%r36, %r281, %r280;
+	// begin inline asm
+	@%p1 st.global.v2.b32 [ %rd18 + 0 ], { %r35, %r36 };
+	// end inline asm
+	.loc	1 51 4                          // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:51:4
+	ret;
+$L__tmp32:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 367                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x168 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 119
+.b8 119
+.b8 105
+.b8 122
+.b8 122
+.b8 106
+.b8 119
+.b8 109
+.b8 100
+.b8 52
+.b8 97
+.b8 106
+.b8 108
+.b8 117
+.b8 98
+.b8 120
+.b8 112
+.b8 118
+.b8 120
+.b8 105
+.b8 100
+.b8 106
+.b8 105
+.b8 121
+.b8 51
+.b8 108
+.b8 100
+.b8 118
+.b8 53
+.b8 101
+.b8 102
+.b8 108
+.b8 119
+.b8 108
+.b8 117
+.b8 100
+.b8 103
+.b8 105
+.b8 122
+.b8 99
+.b8 97
+.b8 104
+.b8 118
+.b8 115
+.b8 112
+.b8 52
+.b8 105
+.b8 55
+.b8 53
+.b8 115
+.b8 50
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 119
+.b8 119
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x2f DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 97
+.b8 100
+.b8 100
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 110
+.b8 97
+.b8 116
+.b8 105
+.b8 118
+.b8 101
+.b8 95
+.b8 108
+.b8 97
+.b8 121
+.b8 101
+.b8 114
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x113:0x5f DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x128:0x18 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp10                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 42                                  // DW_AT_call_line
+.b8 51                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x140:0x31 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp11                          // DW_AT_low_pc
+.b64 $L__tmp31                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 47                                  // DW_AT_call_line
+.b8 79                                  // DW_AT_call_column
+.b8 4                                   // Abbrev [4] 0x158:0x18 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp11                          // DW_AT_low_pc
+.b64 $L__tmp30                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 243                                 // DW_AT_call_line
+.b8 46                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.source b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..83e17ff3f38d522bca1175f7911c73a4f62e97cf
--- /dev/null
+++ b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.source
@@ -0,0 +1,420 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":18:0)
+#loc72 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":216:0)
+#loc85 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":133:0)
+#loc89 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":242:0)
+#loc91 = loc(unknown)
+#loc94 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":230:0)
+#loc109 = loc("in_ptr0"(#loc))
+#loc110 = loc("in_ptr1"(#loc))
+#loc111 = loc("in_ptr2"(#loc))
+#loc112 = loc("out_ptr2"(#loc))
+#loc113 = loc("xnumel"(#loc))
+#loc114 = loc("r0_numel"(#loc))
+#loc171 = loc("value"(#loc72))
+#loc172 = loc("mean"(#loc72))
+#loc173 = loc("m2"(#loc72))
+#loc174 = loc("weight"(#loc72))
+#loc175 = loc("first_iteration"(#loc72))
+#loc185 = loc("input"(#loc85))
+#loc186 = loc("mean"(#loc89))
+#loc187 = loc("m2"(#loc89))
+#loc188 = loc("weight"(#loc89))
+#loc189 = loc("mean_1"(#loc94))
+#loc190 = loc("m2_1"(#loc94))
+#loc191 = loc("weight_1"(#loc94))
+#loc192 = loc("mean_2"(#loc94))
+#loc193 = loc("m2_2"(#loc94))
+#loc194 = loc("weight_2"(#loc94))
+#loc201 = loc("new_mean"(#loc171))
+module {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 2048 : i32 loc(#loc115)
+    %r0_numel_1 = arith.constant 4096 : i32 loc(#loc116)
+    %xoffset = tt.get_program_id x : i32 loc(#loc117)
+    %xoffset_2 = arith.constant 1 : i32 loc(#loc118)
+    %xoffset_3 = arith.constant 1 : i32 loc(#loc118)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc118)
+    %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc119)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc120)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc121)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc121)
+    %xmask = arith.constant dense<2048> : tensor<1x1xi32> loc(#loc122)
+    %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc122)
+    %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc123)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc124)
+    %tmp3_mean = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc125)
+    %tmp3_m2 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc126)
+    %tmp3_weight = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc127)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc14)
+    %c2048_i32 = arith.constant 2048 : i32 loc(#loc14)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14)
+    %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc14)
+    %3 = ub.poison : i32 loc(#loc14)
+    %tmp3_weight_10:3 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%tmp3_mean_13 = %tmp3_mean, %tmp3_m2_14 = %tmp3_m2, %tmp3_weight_15 = %tmp3_weight) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc129)
+      %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc129)
+      %r0_mask = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc130)
+      %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x2048xi32> loc(#loc130)
+      %tmp0 = arith.constant 4096 : i32 loc(#loc131)
+      %tmp0_18 = arith.constant 4096 : i32 loc(#loc131)
+      %tmp0_19 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc131)
+      %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc131)
+      %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc132)
+      %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x2048xi32> loc(#loc132)
+      %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc133)
+      %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc133)
+      %tmp0_25 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc134)
+      %tmp0_26 = arith.andi %r0_mask_17, %tmp0_25 : tensor<1x2048xi1> loc(#loc134)
+      %tmp0_27 = arith.constant 0.000000e+00 : f32 loc(#loc135)
+      %tmp0_28 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc135)
+      %tmp0_29 = arith.truncf %tmp0_28 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc135)
+      %tmp0_30 = tt.load %tmp0_24, %tmp0_26, %tmp0_29 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc135)
+      %tmp0_31 = arith.extf %tmp0_30 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc136)
+      %c0_i32_32 = arith.constant 0 : i32 loc(#loc23)
+      %9 = arith.cmpi eq, %r0_offset, %c0_i32_32 : i32 loc(#loc23)
+      %10:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_u1__(%tmp0_31, %tmp3_mean_13, %tmp3_m2_14, %tmp3_weight_15, %9) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>, i1) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) loc(#loc24)
+      %tmp3_mean_33 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc137)
+      %tmp3_mean_34 = arith.andi %r0_mask_17, %tmp3_mean_33 : tensor<1x2048xi1> loc(#loc137)
+      %tmp3_mean_35 = arith.select %tmp3_mean_34, %10#0, %tmp3_mean_13 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc138)
+      %tmp3_m2_36 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc139)
+      %tmp3_m2_37 = arith.andi %r0_mask_17, %tmp3_m2_36 : tensor<1x2048xi1> loc(#loc139)
+      %tmp3_m2_38 = arith.select %tmp3_m2_37, %10#1, %tmp3_m2_14 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc140)
+      %tmp3_weight_39 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc141)
+      %tmp3_weight_40 = arith.andi %r0_mask_17, %tmp3_weight_39 : tensor<1x2048xi1> loc(#loc141)
+      %tmp3_weight_41 = arith.select %tmp3_weight_40, %10#2, %tmp3_weight_15 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc142)
+      scf.yield %tmp3_mean_35, %tmp3_m2_38, %tmp3_weight_41 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc31)
+    } loc(#loc207)
+    %4:3 = tt.call @"torch._inductor.runtime.triton_helpers.welford__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_1_"(%tmp3_weight_10#0, %tmp3_weight_10#1, %tmp3_weight_10#2) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc32)
+    %tmp3 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc143)
+    %tmp7 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc144)
+    %tmp8 = tt.expand_dims %4#2 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc145)
+    %c0_i32_11 = arith.constant 0 : i32 loc(#loc36)
+    %c2048_i32_12 = arith.constant 2048 : i32 loc(#loc36)
+    %5 = arith.bitcast %c0_i32_11 : i32 to i32 loc(#loc36)
+    %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc36)
+    %7 = arith.bitcast %c2048_i32_12 : i32 to i32 loc(#loc36)
+    %8 = ub.poison : i32 loc(#loc36)
+    scf.for %r0_offset = %5 to %6 step %7  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc146)
+      %r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc146)
+      %r0_mask = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc147)
+      %r0_mask_14 = arith.cmpi slt, %r0_index_13, %r0_mask : tensor<1x2048xi32> loc(#loc147)
+      %tmp9 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc148)
+      %tmp9_15 = tt.addptr %tmp9, %r0_index_13 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc148)
+      %tmp9_16 = arith.constant 0.000000e+00 : f32 loc(#loc149)
+      %tmp9_17 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc149)
+      %tmp9_18 = arith.truncf %tmp9_17 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc149)
+      %tmp9_19 = tt.load %tmp9_15, %r0_mask_14, %tmp9_18 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc149)
+      %tmp9_20 = arith.extf %tmp9_19 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc150)
+      %tmp12 = arith.constant 4096 : i32 loc(#loc151)
+      %tmp12_21 = arith.constant 4096 : i32 loc(#loc151)
+      %tmp12_22 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc151)
+      %tmp12_23 = arith.muli %tmp12_22, %xindex_7 : tensor<1x1xi32> loc(#loc151)
+      %tmp12_24 = tt.broadcast %tmp12_23 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc152)
+      %tmp12_25 = arith.addi %r0_index_13, %tmp12_24 : tensor<1x2048xi32> loc(#loc152)
+      %tmp12_26 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc153)
+      %tmp12_27 = tt.addptr %tmp12_26, %tmp12_25 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc153)
+      %tmp12_28 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc154)
+      %tmp12_29 = arith.andi %r0_mask_14, %tmp12_28 : tensor<1x2048xi1> loc(#loc154)
+      %tmp12_30 = arith.constant 0.000000e+00 : f32 loc(#loc155)
+      %tmp12_31 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc155)
+      %tmp12_32 = arith.truncf %tmp12_31 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc155)
+      %tmp12_33 = tt.load %tmp12_27, %tmp12_29, %tmp12_32 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>> loc(#loc155)
+      %tmp12_34 = arith.extf %tmp12_33 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc156)
+      %tmp23 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc157)
+      %tmp23_35 = tt.addptr %tmp23, %r0_index_13 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc157)
+      %tmp23_36 = arith.constant 0.000000e+00 : f32 loc(#loc158)
+      %tmp23_37 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc158)
+      %tmp23_38 = arith.truncf %tmp23_37 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc158)
+      %tmp23_39 = tt.load %tmp23_35, %r0_mask_14, %tmp23_38 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc158)
+      %tmp23_40 = arith.extf %tmp23_39 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc159)
+      %tmp10 = arith.constant 1.000000e+00 : f32 loc(#loc160)
+      %tmp11 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc161)
+      %tmp11_41 = arith.addf %tmp9_20, %tmp11 : tensor<1x2048xf32> loc(#loc161)
+      %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc162)
+      %tmp14_42 = arith.subf %tmp12_34, %tmp14 : tensor<1x2048xf32> loc(#loc162)
+      %tmp15 = arith.constant 4.096000e+03 : f32 loc(#loc163)
+      %tmp16 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc164)
+      %tmp16_43 = arith.divf %tmp7, %tmp16 : tensor<1x1xf32> loc(#loc164)
+      %tmp17 = arith.constant 9.99999997E-7 : f32 loc(#loc165)
+      %tmp18 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc166)
+      %tmp18_44 = arith.addf %tmp16_43, %tmp18 : tensor<1x1xf32> loc(#loc166)
+      %tmp19 = tt.extern_elementwise %tmp18_44 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc167)
+      %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc168)
+      %tmp20_45 = arith.mulf %tmp14_42, %tmp20 : tensor<1x2048xf32> loc(#loc168)
+      %tmp22 = arith.mulf %tmp11_41, %tmp20_45 : tensor<1x2048xf32> loc(#loc169)
+      %tmp24 = arith.addf %tmp22, %tmp23_40 : tensor<1x2048xf32> loc(#loc170)
+      %c4096_i32 = arith.constant 4096 : i32 loc(#loc62)
+      %c4096_i32_46 = arith.constant 4096 : i32 loc(#loc62)
+      %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc62)
+      %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc62)
+      %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc63)
+      %11 = arith.addi %r0_index_13, %10 : tensor<1x2048xi32> loc(#loc63)
+      %12 = tt.splat %out_ptr2 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc64)
+      %13 = tt.addptr %12, %11 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc64)
+      %14 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc65)
+      %15 = arith.andi %r0_mask_14, %14 : tensor<1x2048xi1> loc(#loc65)
+      %16 = arith.truncf %tmp24 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc66)
+      tt.store %13, %16, %15 : tensor<1x2048x!tt.ptr<bf16>> loc(#loc66)
+    } loc(#loc36)
+    tt.return loc(#loc67)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() -> tensor<1x2048xf32> attributes {noinline = false} {
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc69)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc69)
+    tt.return %cst_0 : tensor<1x2048xf32> loc(#loc70)
+  ^bb1:  // no predecessors
+    %0 = ub.poison : tensor<1x2048xf32> loc(#loc71)
+    tt.return %0 : tensor<1x2048xf32> loc(#loc71)
+  } loc(#loc68)
+  tt.func private @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_u1__(%new_mean: tensor<1x2048xf32> loc("new_mean"(#loc171)), %mean: tensor<1x2048xf32> loc("mean"(#loc72)), %m2: tensor<1x2048xf32> loc("m2"(#loc72)), %weight: tensor<1x2048xf32> loc("weight"(#loc72)), %first_iteration: i1 loc("first_iteration"(#loc72))) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) attributes {noinline = false} {
+    %0:3 = scf.if %first_iteration -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) {
+      %new_weight = arith.constant 1.000000e+00 : f32 loc(#loc176)
+      %new_weight_0 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc202)
+      %new_m2 = tt.call @triton.language.standard.zeros_like__fp32S1_2048S__(%m2) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc203)
+      scf.yield %new_m2, %new_mean, %new_weight_0 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc203)
+    } else {
+      %delta = arith.subf %new_mean, %mean : tensor<1x2048xf32> loc(#loc178)
+      %new_weight = arith.constant 1 : i32 loc(#loc179)
+      %new_weight_0 = arith.constant 1.000000e+00 : f32 loc(#loc179)
+      %new_weight_1 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc179)
+      %new_weight_2 = arith.addf %weight, %new_weight_1 : tensor<1x2048xf32> loc(#loc204)
+      %new_mean_3 = arith.divf %delta, %new_weight_2 : tensor<1x2048xf32> loc(#loc180)
+      %new_mean_4 = arith.addf %mean, %new_mean_3 : tensor<1x2048xf32> loc(#loc205)
+      %new_m2 = arith.subf %new_mean, %new_mean_4 : tensor<1x2048xf32> loc(#loc182)
+      %new_m2_5 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32> loc(#loc183)
+      %new_m2_6 = arith.addf %m2, %new_m2_5 : tensor<1x2048xf32> loc(#loc206)
+      scf.yield %new_m2_6, %new_mean_4, %new_weight_2 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc184)
+    } loc(#loc73)
+    tt.return %0#1, %0#0, %0#2 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc83)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1x2048xf32> loc(#loc84)
+    %2 = ub.poison : tensor<1x2048xf32> loc(#loc84)
+    %3 = ub.poison : tensor<1x2048xf32> loc(#loc84)
+    tt.return %1, %2, %3 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc84)
+  } loc(#loc72)
+  tt.func private @triton.language.standard.zeros_like__fp32S1_2048S__(%input: tensor<1x2048xf32> loc("input"(#loc85))) -> tensor<1x2048xf32> attributes {noinline = false} {
+    %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc86)
+    tt.return %0 : tensor<1x2048xf32> loc(#loc87)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1x2048xf32> loc(#loc88)
+    tt.return %1 : tensor<1x2048xf32> loc(#loc88)
+  } loc(#loc85)
+  tt.func private @"torch._inductor.runtime.triton_helpers.welford__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_1_"(%mean: tensor<1x2048xf32> loc("mean"(#loc89)), %m2: tensor<1x2048xf32> loc("m2"(#loc89)), %weight: tensor<1x2048xf32> loc("weight"(#loc89))) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} {
+    %0:3 = "tt.reduce"(%mean, %m2, %weight) <{axis = 1 : i32}> ({
+    ^bb0(%arg3: f32 loc(unknown), %arg4: f32 loc(unknown), %arg5: f32 loc(unknown), %arg6: f32 loc(unknown), %arg7: f32 loc(unknown), %arg8: f32 loc(unknown)):
+      %4:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (f32, f32, f32, f32, f32, f32) -> (f32, f32, f32) loc(#loc90)
+      tt.reduce.return %4#0, %4#1, %4#2 : f32, f32, f32 loc(#loc90)
+    }) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc90)
+    tt.return %0#0, %0#1, %0#2 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc92)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1xf32> loc(#loc93)
+    %2 = ub.poison : tensor<1xf32> loc(#loc93)
+    %3 = ub.poison : tensor<1xf32> loc(#loc93)
+    tt.return %1, %2, %3 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc93)
+  } loc(#loc89)
+  tt.func private @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%mean_1: f32 loc("mean_1"(#loc94)), %m2_1: f32 loc("m2_1"(#loc94)), %weight_1: f32 loc("weight_1"(#loc94)), %mean_2: f32 loc("mean_2"(#loc94)), %m2_2: f32 loc("m2_2"(#loc94)), %weight_2: f32 loc("weight_2"(#loc94))) -> (f32, f32, f32) attributes {noinline = false} {
+    %delta = arith.subf %mean_2, %mean_1 : f32 loc(#loc195)
+    %new_weight = arith.addf %weight_1, %weight_2 : f32 loc(#loc196)
+    %w2_over_w = arith.constant 0.000000e+00 : f32 loc(#loc197)
+    %w2_over_w_0 = arith.cmpf oeq, %new_weight, %w2_over_w : f32 loc(#loc197)
+    %w2_over_w_1 = arith.divf %weight_2, %new_weight : f32 loc(#loc198)
+    %w2_over_w_2 = arith.constant 0.000000e+00 : f32 loc(#loc199)
+    %w2_over_w_3 = arith.constant 0.000000e+00 : f32 loc(#loc199)
+    %w2_over_w_4 = arith.select %w2_over_w_0, %w2_over_w_3, %w2_over_w_1 : f32 loc(#loc199)
+    %0 = arith.mulf %delta, %w2_over_w_4 : f32 loc(#loc100)
+    %1 = arith.addf %mean_1, %0 : f32 loc(#loc101)
+    %2 = arith.addf %m2_1, %m2_2 : f32 loc(#loc102)
+    %3 = arith.mulf %delta, %delta : f32 loc(#loc103)
+    %4 = arith.mulf %3, %weight_1 : f32 loc(#loc104)
+    %5 = arith.mulf %4, %w2_over_w_4 : f32 loc(#loc105)
+    %6 = arith.addf %2, %5 : f32 loc(#loc106)
+    tt.return %1, %6, %new_weight : f32, f32, f32 loc(#loc107)
+  ^bb1:  // no predecessors
+    %7 = ub.poison : f32 loc(#loc108)
+    %8 = ub.poison : f32 loc(#loc108)
+    %9 = ub.poison : f32 loc(#loc108)
+    tt.return %7, %8, %9 : f32, f32, f32 loc(#loc108)
+  } loc(#loc94)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":25:21)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":29:45)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":30:43)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":31:47)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:46)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:34)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:61)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:51)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:112)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":42:62)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":42:51)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":44:39)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":44:62)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":45:37)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":45:58)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":46:41)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":46:66)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":46:8)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":47:79)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":48:16)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":49:16)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":50:16)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":51:43)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":52:31)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":53:29)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:34)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:41)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:94)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:47)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:42)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:35)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:62)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:52)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:114)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:35)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:42)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:95)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":60:16)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":61:23)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":63:24)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":64:16)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":65:24)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":66:16)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":67:24)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":68:32)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":69:24)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":71:24)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":72:24)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:41)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:36)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:29)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:63)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:53)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":51:4)
+#loc68 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":120:0)
+#loc69 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:31)
+#loc70 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:11)
+#loc71 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:4)
+#loc73 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7)
+#loc74 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":218:46)
+#loc75 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31)
+#loc76 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24)
+#loc77 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30)
+#loc78 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34)
+#loc79 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26)
+#loc80 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39)
+#loc81 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31)
+#loc82 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22)
+#loc83 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:11)
+#loc84 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:4)
+#loc86 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:30)
+#loc87 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:11)
+#loc88 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:4)
+#loc90 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc92 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:11)
+#loc93 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:4)
+#loc95 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc96 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc97 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc98 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc99 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc100 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc101 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc102 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc103 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc104 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc105 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc106 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc107 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:11)
+#loc108 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:4)
+#loc115 = loc("xnumel"(#loc1))
+#loc116 = loc("r0_numel"(#loc2))
+#loc117 = loc("xoffset"(#loc3))
+#loc118 = loc("xoffset"(#loc4))
+#loc119 = loc("xindex"(#loc5))
+#loc120 = loc("xindex"(#loc6))
+#loc121 = loc("xindex"(#loc7))
+#loc122 = loc("xmask"(#loc8))
+#loc123 = loc("r0_base"(#loc9))
+#loc124 = loc("r0_base"(#loc10))
+#loc125 = loc("tmp3_mean"(#loc11))
+#loc126 = loc("tmp3_m2"(#loc12))
+#loc127 = loc("tmp3_weight"(#loc13))
+#loc128 = loc("tmp3_mean"(#loc14))
+#loc129 = loc("r0_index"(#loc15))
+#loc130 = loc("r0_mask"(#loc16))
+#loc131 = loc("tmp0"(#loc17))
+#loc132 = loc("tmp0"(#loc18))
+#loc133 = loc("tmp0"(#loc19))
+#loc134 = loc("tmp0"(#loc20))
+#loc135 = loc("tmp0"(#loc21))
+#loc136 = loc("tmp0"(#loc22))
+#loc137 = loc("tmp3_mean"(#loc25))
+#loc138 = loc("tmp3_mean"(#loc26))
+#loc139 = loc("tmp3_m2"(#loc27))
+#loc140 = loc("tmp3_m2"(#loc28))
+#loc141 = loc("tmp3_weight"(#loc29))
+#loc142 = loc("tmp3_weight"(#loc30))
+#loc143 = loc("tmp3"(#loc33))
+#loc144 = loc("tmp7"(#loc34))
+#loc145 = loc("tmp8"(#loc35))
+#loc146 = loc("r0_index"(#loc37))
+#loc147 = loc("r0_mask"(#loc38))
+#loc148 = loc("tmp9"(#loc39))
+#loc149 = loc("tmp9"(#loc40))
+#loc150 = loc("tmp9"(#loc41))
+#loc151 = loc("tmp12"(#loc42))
+#loc152 = loc("tmp12"(#loc43))
+#loc153 = loc("tmp12"(#loc44))
+#loc154 = loc("tmp12"(#loc45))
+#loc155 = loc("tmp12"(#loc46))
+#loc156 = loc("tmp12"(#loc47))
+#loc157 = loc("tmp23"(#loc48))
+#loc158 = loc("tmp23"(#loc49))
+#loc159 = loc("tmp23"(#loc50))
+#loc160 = loc("tmp10"(#loc51))
+#loc161 = loc("tmp11"(#loc52))
+#loc162 = loc("tmp14"(#loc53))
+#loc163 = loc("tmp15"(#loc54))
+#loc164 = loc("tmp16"(#loc55))
+#loc165 = loc("tmp17"(#loc56))
+#loc166 = loc("tmp18"(#loc57))
+#loc167 = loc("tmp19"(#loc58))
+#loc168 = loc("tmp20"(#loc59))
+#loc169 = loc("tmp22"(#loc60))
+#loc170 = loc("tmp24"(#loc61))
+#loc176 = loc("new_weight"(#loc74))
+#loc177 = loc("new_m2"(#loc75))
+#loc178 = loc("delta"(#loc76))
+#loc179 = loc("new_weight"(#loc77))
+#loc180 = loc("new_mean"(#loc78))
+#loc181 = loc("new_mean"(#loc79))
+#loc182 = loc("new_m2"(#loc80))
+#loc183 = loc("new_m2"(#loc81))
+#loc184 = loc("new_m2"(#loc82))
+#loc195 = loc("delta"(#loc95))
+#loc196 = loc("new_weight"(#loc96))
+#loc197 = loc("w2_over_w"(#loc97))
+#loc198 = loc("w2_over_w"(#loc98))
+#loc199 = loc("w2_over_w"(#loc99))
+#loc200 = loc("tmp3_m2"(#loc128))
+#loc202 = loc("new_weight"(#loc176))
+#loc203 = loc("new_m2"(#loc177))
+#loc204 = loc("new_weight"(#loc179))
+#loc205 = loc("new_mean"(#loc181))
+#loc206 = loc("new_m2"(#loc184))
+#loc207 = loc("tmp3_weight"(#loc200))
diff --git a/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..872c8277b54b268d64b927dc36da234a1604e509
--- /dev/null
+++ b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir
@@ -0,0 +1,260 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":18:0)
+#loc1 = loc(unknown)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":47:79)
+#loc70 = loc("in_ptr0"(#loc))
+#loc71 = loc("in_ptr1"(#loc))
+#loc72 = loc("in_ptr2"(#loc))
+#loc73 = loc("out_ptr2"(#loc))
+#loc74 = loc("xnumel"(#loc))
+#loc75 = loc("r0_numel"(#loc))
+#loc101 = loc(callsite(#loc1 at #loc30))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4096> : tensor<1x2048xi32, #blocked> loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked> loc(#loc1)
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
+    %c2048_i32 = arith.constant 2048 : i32 loc(#loc1)
+    %cst_1 = arith.constant 0.000000e+00 : f32 loc(#loc1)
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<4.096000e+03> : tensor<1x1xf32, #blocked> loc(#loc1)
+    %cst_5 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc76)
+    %xmask = arith.cmpi slt, %xoffset, %c2048_i32 : i32 loc(#loc77)
+    %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc78)
+    %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc78)
+    %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc79)
+    %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc130)
+    %tmp0_8 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc81)
+    %tmp0_9 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc131)
+    %tmp3_weight:3 = scf.for %tmp3_weight_10 = %c0_i32 to %c4096_i32 step %c2048_i32 iter_args(%arg7 = %cst_2, %arg8 = %cst_2, %arg9 = %cst_2) -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>)  : i32 {
+      %r0_index = tt.splat %tmp3_weight_10 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc84)
+      %r0_index_11 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32, #blocked> loc(#loc84)
+      %r0_mask = arith.cmpi slt, %r0_index_11, %cst : tensor<1x2048xi32, #blocked> loc(#loc85)
+      %tmp0_12 = arith.addi %r0_index_11, %tmp0_7 : tensor<1x2048xi32, #blocked> loc(#loc80)
+      %tmp0_13 = tt.addptr %tmp0_8, %tmp0_12 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc81)
+      %tmp0_14 = arith.andi %r0_mask, %tmp0_9 : tensor<1x2048xi1, #blocked> loc(#loc82)
+      %tmp0_15 = tt.load %tmp0_13, %tmp0_14, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc86)
+      %tmp0_16 = arith.extf %tmp0_15 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc87)
+      %2 = arith.cmpi eq, %tmp3_weight_10, %c0_i32 : i32 loc(#loc14)
+      %3:3 = scf.if %2 -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) {
+        scf.yield %cst_2, %tmp0_16, %cst_5 : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc155)
+      } else {
+        %delta = arith.subf %tmp0_16, %arg7 : tensor<1x2048xf32, #blocked> loc(#loc134)
+        %new_weight = arith.addf %arg9, %cst_5 : tensor<1x2048xf32, #blocked> loc(#loc156)
+        %new_mean = arith.divf %delta, %new_weight : tensor<1x2048xf32, #blocked> loc(#loc136)
+        %new_mean_18 = arith.addf %arg7, %new_mean : tensor<1x2048xf32, #blocked> loc(#loc157)
+        %new_m2 = arith.subf %tmp0_16, %new_mean_18 : tensor<1x2048xf32, #blocked> loc(#loc138)
+        %new_m2_19 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32, #blocked> loc(#loc139)
+        %new_m2_20 = arith.addf %arg8, %new_m2_19 : tensor<1x2048xf32, #blocked> loc(#loc158)
+        scf.yield %new_m2_20, %new_mean_18, %new_weight : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc141)
+      } loc(#loc88)
+      %tmp3_mean = arith.select %tmp0_14, %3#1, %arg7 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc97)
+      %tmp3_m2 = arith.select %tmp0_14, %3#0, %arg8 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc98)
+      %tmp3_weight_17 = arith.select %tmp0_14, %3#2, %arg9 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc99)
+      scf.yield %tmp3_mean, %tmp3_m2, %tmp3_weight_17 : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc28)
+    } loc(#loc154)
+    %0:3 = "tt.reduce"(%tmp3_weight#0, %tmp3_weight#1, %tmp3_weight#2) <{axis = 1 : i32}> ({
+    ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc30)), %arg7: f32 loc(callsite(#loc1 at #loc30)), %arg8: f32 loc(callsite(#loc1 at #loc30)), %arg9: f32 loc(callsite(#loc1 at #loc30)), %arg10: f32 loc(callsite(#loc1 at #loc30)), %arg11: f32 loc(callsite(#loc1 at #loc30))):
+      %delta = arith.subf %arg9, %arg6 : f32 loc(#loc142)
+      %new_weight = arith.addf %arg8, %arg11 : f32 loc(#loc143)
+      %w2_over_w = arith.cmpf oeq, %new_weight, %cst_1 : f32 loc(#loc144)
+      %w2_over_w_10 = arith.divf %arg11, %new_weight : f32 loc(#loc145)
+      %w2_over_w_11 = arith.select %w2_over_w, %cst_1, %w2_over_w_10 : f32 loc(#loc146)
+      %2 = arith.mulf %delta, %w2_over_w_11 : f32 loc(#loc147)
+      %3 = arith.addf %arg6, %2 : f32 loc(#loc148)
+      %4 = arith.addf %arg7, %arg10 : f32 loc(#loc149)
+      %5 = arith.mulf %delta, %delta : f32 loc(#loc150)
+      %6 = arith.mulf %5, %arg8 : f32 loc(#loc151)
+      %7 = arith.mulf %6, %w2_over_w_11 : f32 loc(#loc152)
+      %8 = arith.addf %4, %7 : f32 loc(#loc153)
+      tt.reduce.return %3, %8, %new_weight : f32, f32, f32 loc(#loc100)
+    }) : (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc100)
+    %tmp3 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc107)
+    %tmp7 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc108)
+    %tmp9 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc109)
+    %tmp23 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc110)
+    %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc111)
+    %tmp16 = arith.divf %tmp7, %cst_4 : tensor<1x1xf32, #blocked> loc(#loc112)
+    %tmp18 = arith.addf %tmp16, %cst_3 : tensor<1x1xf32, #blocked> loc(#loc113)
+    %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked>) -> tensor<1x1xf32, #blocked> loc(#loc114)
+    %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc115)
+    %1 = tt.splat %out_ptr2 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc52)
+    scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32, #blocked> loc(#loc116)
+      %r0_index_10 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32, #blocked> loc(#loc116)
+      %r0_mask = arith.cmpi slt, %r0_index_10, %cst : tensor<1x2048xi32, #blocked> loc(#loc117)
+      %tmp9_11 = tt.addptr %tmp9, %r0_index_10 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc109)
+      %tmp9_12 = tt.load %tmp9_11, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc118)
+      %tmp9_13 = arith.extf %tmp9_12 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc119)
+      %tmp12 = arith.addi %r0_index_10, %tmp0_7 : tensor<1x2048xi32, #blocked> loc(#loc120)
+      %tmp12_14 = tt.addptr %tmp0_8, %tmp12 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc121)
+      %tmp12_15 = arith.andi %r0_mask, %tmp0_9 : tensor<1x2048xi1, #blocked> loc(#loc122)
+      %tmp12_16 = tt.load %tmp12_14, %tmp12_15, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc123)
+      %tmp12_17 = arith.extf %tmp12_16 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc124)
+      %tmp23_18 = tt.addptr %tmp23, %r0_index_10 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc110)
+      %tmp23_19 = tt.load %tmp23_18, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc125)
+      %tmp23_20 = arith.extf %tmp23_19 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc126)
+      %tmp11 = arith.addf %tmp9_13, %cst_5 : tensor<1x2048xf32, #blocked> loc(#loc127)
+      %tmp14_21 = arith.subf %tmp12_17, %tmp14 : tensor<1x2048xf32, #blocked> loc(#loc111)
+      %tmp20_22 = arith.mulf %tmp14_21, %tmp20 : tensor<1x2048xf32, #blocked> loc(#loc115)
+      %tmp22 = arith.mulf %tmp11, %tmp20_22 : tensor<1x2048xf32, #blocked> loc(#loc128)
+      %tmp24 = arith.addf %tmp22, %tmp23_20 : tensor<1x2048xf32, #blocked> loc(#loc129)
+      %2 = tt.addptr %1, %tmp12 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc52)
+      %3 = arith.truncf %tmp24 : tensor<1x2048xf32, #blocked> to tensor<1x2048xbf16, #blocked> loc(#loc68)
+      tt.store %2, %3, %tmp12_15 : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc68)
+    } loc(#loc53)
+    tt.return loc(#loc69)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":25:21)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":26:37)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:46)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:41)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:34)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:61)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":32:43)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":33:31)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":34:29)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:51)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:112)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":42:62)
+#loc15 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":42:51)
+#loc17 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31)
+#loc18 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24)
+#loc19 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34)
+#loc21 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39)
+#loc23 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":44:62)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":45:58)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":46:66)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":46:8)
+#loc29 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc35 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc37 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc38 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc39 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc40 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc41 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc42 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":48:16)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":49:16)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:34)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:35)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":63:24)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":65:24)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":67:24)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":68:32)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":69:24)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:29)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":51:43)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":52:31)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":53:29)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:41)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:94)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:42)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:35)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:62)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:52)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:114)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:42)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:95)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":61:23)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":71:24)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":72:24)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:53)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":51:4)
+#loc76 = loc("xoffset"(#loc2))
+#loc77 = loc("xmask"(#loc3))
+#loc78 = loc("r0_base"(#loc4))
+#loc79 = loc("tmp0"(#loc5))
+#loc80 = loc("tmp0"(#loc6))
+#loc81 = loc("tmp0"(#loc7))
+#loc82 = loc("tmp0"(#loc8))
+#loc83 = loc("tmp3_mean"(#loc9))
+#loc84 = loc("r0_index"(#loc10))
+#loc85 = loc("r0_mask"(#loc11))
+#loc86 = loc("tmp0"(#loc12))
+#loc87 = loc("tmp0"(#loc13))
+#loc88 = loc(callsite(#loc15 at #loc16))
+#loc89 = loc("new_m2"(#loc17))
+#loc90 = loc("delta"(#loc18))
+#loc91 = loc("new_weight"(#loc19))
+#loc92 = loc("new_mean"(#loc20))
+#loc93 = loc("new_mean"(#loc21))
+#loc94 = loc("new_m2"(#loc22))
+#loc95 = loc("new_m2"(#loc23))
+#loc96 = loc("new_m2"(#loc24))
+#loc97 = loc("tmp3_mean"(#loc25))
+#loc98 = loc("tmp3_m2"(#loc26))
+#loc99 = loc("tmp3_weight"(#loc27))
+#loc100 = loc(callsite(#loc29 at #loc30))
+#loc102 = loc("delta"(#loc31))
+#loc103 = loc("new_weight"(#loc32))
+#loc104 = loc("w2_over_w"(#loc33))
+#loc105 = loc("w2_over_w"(#loc34))
+#loc106 = loc("w2_over_w"(#loc35))
+#loc107 = loc("tmp3"(#loc43))
+#loc108 = loc("tmp7"(#loc44))
+#loc109 = loc("tmp9"(#loc45))
+#loc110 = loc("tmp23"(#loc46))
+#loc111 = loc("tmp14"(#loc47))
+#loc112 = loc("tmp16"(#loc48))
+#loc113 = loc("tmp18"(#loc49))
+#loc114 = loc("tmp19"(#loc50))
+#loc115 = loc("tmp20"(#loc51))
+#loc116 = loc("r0_index"(#loc54))
+#loc117 = loc("r0_mask"(#loc55))
+#loc118 = loc("tmp9"(#loc56))
+#loc119 = loc("tmp9"(#loc57))
+#loc120 = loc("tmp12"(#loc58))
+#loc121 = loc("tmp12"(#loc59))
+#loc122 = loc("tmp12"(#loc60))
+#loc123 = loc("tmp12"(#loc61))
+#loc124 = loc("tmp12"(#loc62))
+#loc125 = loc("tmp23"(#loc63))
+#loc126 = loc("tmp23"(#loc64))
+#loc127 = loc("tmp11"(#loc65))
+#loc128 = loc("tmp22"(#loc66))
+#loc129 = loc("tmp24"(#loc67))
+#loc130 = loc(fused[#loc80, #loc79])
+#loc131 = loc(fused[#loc82, #loc77])
+#loc132 = loc("tmp3_m2"(#loc83))
+#loc133 = loc("new_m2"(#loc89))
+#loc134 = loc(callsite(#loc90 at #loc16))
+#loc135 = loc("new_weight"(#loc91))
+#loc136 = loc(callsite(#loc92 at #loc16))
+#loc137 = loc("new_mean"(#loc93))
+#loc138 = loc(callsite(#loc94 at #loc16))
+#loc139 = loc(callsite(#loc95 at #loc16))
+#loc140 = loc("new_m2"(#loc96))
+#loc141 = loc(callsite(#loc96 at #loc16))
+#loc142 = loc(callsite(#loc102 at #loc100))
+#loc143 = loc(callsite(#loc103 at #loc100))
+#loc144 = loc(callsite(#loc104 at #loc100))
+#loc145 = loc(callsite(#loc105 at #loc100))
+#loc146 = loc(callsite(#loc106 at #loc100))
+#loc147 = loc(callsite(#loc36 at #loc100))
+#loc148 = loc(callsite(#loc37 at #loc100))
+#loc149 = loc(callsite(#loc38 at #loc100))
+#loc150 = loc(callsite(#loc39 at #loc100))
+#loc151 = loc(callsite(#loc40 at #loc100))
+#loc152 = loc(callsite(#loc41 at #loc100))
+#loc153 = loc(callsite(#loc42 at #loc100))
+#loc154 = loc("tmp3_weight"(#loc132))
+#loc155 = loc(callsite(#loc133 at #loc16))
+#loc156 = loc(callsite(#loc135 at #loc16))
+#loc157 = loc(callsite(#loc137 at #loc16))
+#loc158 = loc(callsite(#loc140 at #loc16))
diff --git a/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.ttir b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..6fc41cb13aa57de73d01922dd12878396e166719
--- /dev/null
+++ b/triton/7POELGCJFUOFLC6TGKIWA3PRUT3MYIXDOSTB44MURMQQJWI3L3RQ/triton_red_fused_add_mul_native_layer_norm_0.ttir
@@ -0,0 +1,269 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":18:0)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":47:79)
+#loc72 = loc("in_ptr0"(#loc))
+#loc73 = loc("in_ptr1"(#loc))
+#loc74 = loc("in_ptr2"(#loc))
+#loc75 = loc("out_ptr2"(#loc))
+#loc76 = loc("xnumel"(#loc))
+#loc77 = loc("r0_numel"(#loc))
+#loc78 = loc(callsite(#loc1 at #loc2))
+module {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc78)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc1)
+    %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16> loc(#loc1)
+    %c2048_i32 = arith.constant 2048 : i32 loc(#loc1)
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
+    %cst_2 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc1)
+    %cst_3 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc1)
+    %cst_5 = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc79)
+    %xmask = arith.cmpi slt, %xoffset, %c2048_i32 : i32 loc(#loc80)
+    %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc81)
+    %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc82)
+    %tmp3_weight:3 = scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32 iter_args(%tmp3_mean = %cst_0, %tmp3_m2 = %cst_0, %tmp3_weight_7 = %cst_0) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc84)
+      %r0_index_8 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32> loc(#loc84)
+      %r0_mask = arith.cmpi slt, %r0_index_8, %cst_5 : tensor<1x2048xi32> loc(#loc85)
+      %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc86)
+      %tmp0_9 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32> loc(#loc135)
+      %tmp0_10 = arith.addi %r0_index_8, %tmp0_9 : tensor<1x2048xi32> loc(#loc87)
+      %tmp0_11 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc88)
+      %tmp0_12 = tt.addptr %tmp0_11, %tmp0_10 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc88)
+      %tmp0_13 = tt.splat %xmask : i1 -> tensor<1x2048xi1> loc(#loc136)
+      %tmp0_14 = arith.andi %r0_mask, %tmp0_13 : tensor<1x2048xi1> loc(#loc89)
+      %tmp0_15 = tt.load %tmp0_12, %tmp0_14, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc90)
+      %tmp0_16 = arith.extf %tmp0_15 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc91)
+      %1 = arith.cmpi eq, %r0_offset, %c0_i32 : i32 loc(#loc16)
+      %2:3 = scf.if %1 -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) {
+        scf.yield %cst_0, %tmp0_16, %cst_4 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc161)
+      } else {
+        %delta = arith.subf %tmp0_16, %tmp3_mean : tensor<1x2048xf32> loc(#loc138)
+        %new_weight = arith.addf %tmp3_weight_7, %cst_4 : tensor<1x2048xf32> loc(#loc162)
+        %new_mean = arith.divf %delta, %new_weight : tensor<1x2048xf32> loc(#loc140)
+        %new_mean_20 = arith.addf %tmp3_mean, %new_mean : tensor<1x2048xf32> loc(#loc163)
+        %new_m2 = arith.subf %tmp0_16, %new_mean_20 : tensor<1x2048xf32> loc(#loc142)
+        %new_m2_21 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32> loc(#loc143)
+        %new_m2_22 = arith.addf %tmp3_m2, %new_m2_21 : tensor<1x2048xf32> loc(#loc164)
+        scf.yield %new_m2_22, %new_mean_20, %new_weight : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc145)
+      } loc(#loc92)
+      %tmp3_mean_17 = arith.select %tmp0_14, %2#1, %tmp3_mean : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc101)
+      %tmp3_m2_18 = arith.select %tmp0_14, %2#0, %tmp3_m2 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc102)
+      %tmp3_weight_19 = arith.select %tmp0_14, %2#2, %tmp3_weight_7 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc103)
+      scf.yield %tmp3_mean_17, %tmp3_m2_18, %tmp3_weight_19 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc30)
+    } loc(#loc160)
+    %0:3 = "tt.reduce"(%tmp3_weight#0, %tmp3_weight#1, %tmp3_weight#2) <{axis = 1 : i32}> ({
+    ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc2)), %arg7: f32 loc(callsite(#loc1 at #loc2)), %arg8: f32 loc(callsite(#loc1 at #loc2)), %arg9: f32 loc(callsite(#loc1 at #loc2)), %arg10: f32 loc(callsite(#loc1 at #loc2)), %arg11: f32 loc(callsite(#loc1 at #loc2))):
+      %delta = arith.subf %arg9, %arg6 : f32 loc(#loc146)
+      %new_weight = arith.addf %arg8, %arg11 : f32 loc(#loc147)
+      %w2_over_w = arith.cmpf oeq, %new_weight, %cst : f32 loc(#loc148)
+      %w2_over_w_7 = arith.divf %arg11, %new_weight : f32 loc(#loc149)
+      %w2_over_w_8 = arith.select %w2_over_w, %cst, %w2_over_w_7 : f32 loc(#loc150)
+      %1 = arith.mulf %delta, %w2_over_w_8 : f32 loc(#loc151)
+      %2 = arith.addf %arg6, %1 : f32 loc(#loc152)
+      %3 = arith.addf %arg7, %arg10 : f32 loc(#loc153)
+      %4 = arith.mulf %delta, %delta : f32 loc(#loc154)
+      %5 = arith.mulf %4, %arg8 : f32 loc(#loc155)
+      %6 = arith.mulf %5, %w2_over_w_8 : f32 loc(#loc156)
+      %7 = arith.addf %3, %6 : f32 loc(#loc157)
+      tt.reduce.return %2, %7, %new_weight : f32, f32, f32 loc(#loc104)
+    }) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc104)
+    %tmp3 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc110)
+    %tmp7 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc111)
+    scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc112)
+      %r0_index_7 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32> loc(#loc112)
+      %r0_mask = arith.cmpi slt, %r0_index_7, %cst_5 : tensor<1x2048xi32> loc(#loc113)
+      %tmp9 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc114)
+      %tmp9_8 = tt.addptr %tmp9, %r0_index_7 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc114)
+      %tmp9_9 = tt.load %tmp9_8, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc115)
+      %tmp9_10 = arith.extf %tmp9_9 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc116)
+      %tmp12 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc117)
+      %tmp12_11 = tt.splat %tmp12 : i32 -> tensor<1x2048xi32> loc(#loc158)
+      %tmp12_12 = arith.addi %r0_index_7, %tmp12_11 : tensor<1x2048xi32> loc(#loc118)
+      %tmp12_13 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc119)
+      %tmp12_14 = tt.addptr %tmp12_13, %tmp12_12 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc119)
+      %tmp12_15 = tt.splat %xmask : i1 -> tensor<1x2048xi1> loc(#loc159)
+      %tmp12_16 = arith.andi %r0_mask, %tmp12_15 : tensor<1x2048xi1> loc(#loc120)
+      %tmp12_17 = tt.load %tmp12_14, %tmp12_16, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>> loc(#loc121)
+      %tmp12_18 = arith.extf %tmp12_17 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc122)
+      %tmp23 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc123)
+      %tmp23_19 = tt.addptr %tmp23, %r0_index_7 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc123)
+      %tmp23_20 = tt.load %tmp23_19, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc124)
+      %tmp23_21 = arith.extf %tmp23_20 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc125)
+      %tmp11 = arith.addf %tmp9_10, %cst_4 : tensor<1x2048xf32> loc(#loc126)
+      %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc127)
+      %tmp14_22 = arith.subf %tmp12_18, %tmp14 : tensor<1x2048xf32> loc(#loc127)
+      %tmp16 = arith.divf %tmp7, %cst_3 : tensor<1x1xf32> loc(#loc128)
+      %tmp18 = arith.addf %tmp16, %cst_2 : tensor<1x1xf32> loc(#loc129)
+      %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc130)
+      %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc131)
+      %tmp20_23 = arith.mulf %tmp14_22, %tmp20 : tensor<1x2048xf32> loc(#loc131)
+      %tmp22 = arith.mulf %tmp11, %tmp20_23 : tensor<1x2048xf32> loc(#loc132)
+      %tmp24 = arith.addf %tmp22, %tmp23_21 : tensor<1x2048xf32> loc(#loc133)
+      %1 = tt.splat %out_ptr2 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc69)
+      %2 = tt.addptr %1, %tmp12_12 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc69)
+      %3 = arith.truncf %tmp24 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc70)
+      tt.store %2, %3, %tmp12_16 : tensor<1x2048x!tt.ptr<bf16>> loc(#loc70)
+    } loc(#loc46)
+    tt.return loc(#loc71)
+  } loc(#loc)
+} loc(#loc)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":25:21)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":26:27)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":32:43)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":33:31)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":34:29)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:46)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:41)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:34)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:61)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:51)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:112)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":42:62)
+#loc17 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":42:51)
+#loc19 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24)
+#loc21 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34)
+#loc23 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31)
+#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":44:62)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":45:58)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":46:66)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":46:8)
+#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc35 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc37 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc38 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc39 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc40 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc41 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc42 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc43 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":48:16)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":49:16)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":51:43)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":52:31)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":53:29)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:34)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:41)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:94)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:47)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:42)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:35)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:62)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:52)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:114)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:35)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:42)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:95)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":61:23)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":63:24)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":65:24)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":67:24)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":68:32)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":69:24)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":71:24)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":72:24)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:29)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:53)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":51:4)
+#loc79 = loc("xoffset"(#loc3))
+#loc80 = loc("xmask"(#loc4))
+#loc81 = loc("r0_base"(#loc5))
+#loc82 = loc("r0_base"(#loc6))
+#loc83 = loc("tmp3_mean"(#loc7))
+#loc84 = loc("r0_index"(#loc8))
+#loc85 = loc("r0_mask"(#loc9))
+#loc86 = loc("tmp0"(#loc10))
+#loc87 = loc("tmp0"(#loc11))
+#loc88 = loc("tmp0"(#loc12))
+#loc89 = loc("tmp0"(#loc13))
+#loc90 = loc("tmp0"(#loc14))
+#loc91 = loc("tmp0"(#loc15))
+#loc92 = loc(callsite(#loc17 at #loc18))
+#loc93 = loc("new_m2"(#loc19))
+#loc94 = loc("delta"(#loc20))
+#loc95 = loc("new_weight"(#loc21))
+#loc96 = loc("new_mean"(#loc22))
+#loc97 = loc("new_mean"(#loc23))
+#loc98 = loc("new_m2"(#loc24))
+#loc99 = loc("new_m2"(#loc25))
+#loc100 = loc("new_m2"(#loc26))
+#loc101 = loc("tmp3_mean"(#loc27))
+#loc102 = loc("tmp3_m2"(#loc28))
+#loc103 = loc("tmp3_weight"(#loc29))
+#loc104 = loc(callsite(#loc31 at #loc2))
+#loc105 = loc("delta"(#loc32))
+#loc106 = loc("new_weight"(#loc33))
+#loc107 = loc("w2_over_w"(#loc34))
+#loc108 = loc("w2_over_w"(#loc35))
+#loc109 = loc("w2_over_w"(#loc36))
+#loc110 = loc("tmp3"(#loc44))
+#loc111 = loc("tmp7"(#loc45))
+#loc112 = loc("r0_index"(#loc47))
+#loc113 = loc("r0_mask"(#loc48))
+#loc114 = loc("tmp9"(#loc49))
+#loc115 = loc("tmp9"(#loc50))
+#loc116 = loc("tmp9"(#loc51))
+#loc117 = loc("tmp12"(#loc52))
+#loc118 = loc("tmp12"(#loc53))
+#loc119 = loc("tmp12"(#loc54))
+#loc120 = loc("tmp12"(#loc55))
+#loc121 = loc("tmp12"(#loc56))
+#loc122 = loc("tmp12"(#loc57))
+#loc123 = loc("tmp23"(#loc58))
+#loc124 = loc("tmp23"(#loc59))
+#loc125 = loc("tmp23"(#loc60))
+#loc126 = loc("tmp11"(#loc61))
+#loc127 = loc("tmp14"(#loc62))
+#loc128 = loc("tmp16"(#loc63))
+#loc129 = loc("tmp18"(#loc64))
+#loc130 = loc("tmp19"(#loc65))
+#loc131 = loc("tmp20"(#loc66))
+#loc132 = loc("tmp22"(#loc67))
+#loc133 = loc("tmp24"(#loc68))
+#loc134 = loc("tmp3_m2"(#loc83))
+#loc135 = loc(fused[#loc87, #loc86])
+#loc136 = loc(fused[#loc89, #loc80])
+#loc137 = loc("new_m2"(#loc93))
+#loc138 = loc(callsite(#loc94 at #loc18))
+#loc139 = loc("new_weight"(#loc95))
+#loc140 = loc(callsite(#loc96 at #loc18))
+#loc141 = loc("new_mean"(#loc97))
+#loc142 = loc(callsite(#loc98 at #loc18))
+#loc143 = loc(callsite(#loc99 at #loc18))
+#loc144 = loc("new_m2"(#loc100))
+#loc145 = loc(callsite(#loc100 at #loc18))
+#loc146 = loc(callsite(#loc105 at #loc104))
+#loc147 = loc(callsite(#loc106 at #loc104))
+#loc148 = loc(callsite(#loc107 at #loc104))
+#loc149 = loc(callsite(#loc108 at #loc104))
+#loc150 = loc(callsite(#loc109 at #loc104))
+#loc151 = loc(callsite(#loc37 at #loc104))
+#loc152 = loc(callsite(#loc38 at #loc104))
+#loc153 = loc(callsite(#loc39 at #loc104))
+#loc154 = loc(callsite(#loc40 at #loc104))
+#loc155 = loc(callsite(#loc41 at #loc104))
+#loc156 = loc(callsite(#loc42 at #loc104))
+#loc157 = loc(callsite(#loc43 at #loc104))
+#loc158 = loc(fused[#loc118, #loc117])
+#loc159 = loc(fused[#loc120, #loc80])
+#loc160 = loc("tmp3_weight"(#loc134))
+#loc161 = loc(callsite(#loc137 at #loc18))
+#loc162 = loc(callsite(#loc139 at #loc18))
+#loc163 = loc(callsite(#loc141 at #loc18))
+#loc164 = loc(callsite(#loc144 at #loc18))
diff --git a/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/__grp__triton_red_fused__fused_rms_norm_view_1.json b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/__grp__triton_red_fused__fused_rms_norm_view_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..a31d4092dcbb71d34fd65e75e337fab7cfefb010
--- /dev/null
+++ b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/__grp__triton_red_fused__fused_rms_norm_view_1.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused__fused_rms_norm_view_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.source", "triton_red_fused__fused_rms_norm_view_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.ttir", "triton_red_fused__fused_rms_norm_view_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.ttgir", "triton_red_fused__fused_rms_norm_view_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.llir", "triton_red_fused__fused_rms_norm_view_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.ptx", "triton_red_fused__fused_rms_norm_view_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.cubin", "triton_red_fused__fused_rms_norm_view_1.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.json"}}
\ No newline at end of file
diff --git a/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.cubin b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..655a79dd32b3f8867b16f43ee14dfa4de28af9d4
Binary files /dev/null and b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.cubin differ
diff --git a/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.json b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..fc6977d8f2b5c9294ec3a7cbfd3a0141532ce619
--- /dev/null
+++ b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.json
@@ -0,0 +1 @@
+{"hash": "fd632643c71597c2e2b202e5443f3086a23da537fcb2398fd43a49dcb5fa652a", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm_view_1"}
\ No newline at end of file
diff --git a/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.llir b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.llir
new file mode 100644
index 0000000000000000000000000000000000000000..e3ea5cfdd31f0e666e32da2aec1289ee2040aea8
--- /dev/null
+++ b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.llir
@@ -0,0 +1,161 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused__fused_rms_norm_view_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 {
+  %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %8 = shl i32 %7, 6, !dbg !8
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %10 = and i32 %9, 504, !dbg !9
+  %11 = lshr exact i32 %10, 3, !dbg !9
+  %12 = or disjoint i32 %11, %8, !dbg !10
+  %13 = shl nuw nsw i32 %9, 3, !dbg !11
+  %14 = and i32 %13, 56, !dbg !11
+  %15 = sdiv i32 %12, 32, !dbg !12
+  %16 = mul i32 %15, 32, !dbg !13
+  %.decomposed = sub i32 %12, %16, !dbg !13
+  %17 = shl nsw i32 %.decomposed, 7, !dbg !14
+  %18 = mul i32 %15, 12288, !dbg !15
+  %19 = or disjoint i32 %17, %14
+  %20 = add i32 %19, %18
+  %21 = sext i32 %20 to i64, !dbg !16
+  %22 = getelementptr bfloat, ptr addrspace(1) %0, i64 %21, !dbg !16
+  %23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !17
+  %24 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %22, i64 %23, i1 true) #4, !dbg !17
+  %25 = extractvalue { i32, i32, i32, i32 } %24, 0, !dbg !17
+  %26 = bitcast i32 %25 to <2 x bfloat>, !dbg !17
+  %27 = extractvalue { i32, i32, i32, i32 } %24, 1, !dbg !17
+  %28 = bitcast i32 %27 to <2 x bfloat>, !dbg !17
+  %29 = extractvalue { i32, i32, i32, i32 } %24, 2, !dbg !17
+  %30 = bitcast i32 %29 to <2 x bfloat>, !dbg !17
+  %31 = extractvalue { i32, i32, i32, i32 } %24, 3, !dbg !17
+  %32 = bitcast i32 %31 to <2 x bfloat>, !dbg !17
+  %33 = sext i32 %20 to i64, !dbg !16
+  %34 = getelementptr bfloat, ptr addrspace(1) %0, i64 %33, !dbg !16
+  %35 = getelementptr i8, ptr addrspace(1) %34, i64 128, !dbg !16
+  %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !17
+  %37 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %35, i64 %36, i1 true) #4, !dbg !17
+  %38 = extractvalue { i32, i32, i32, i32 } %37, 0, !dbg !17
+  %39 = bitcast i32 %38 to <2 x bfloat>, !dbg !17
+  %40 = extractvalue { i32, i32, i32, i32 } %37, 1, !dbg !17
+  %41 = bitcast i32 %40 to <2 x bfloat>, !dbg !17
+  %42 = extractvalue { i32, i32, i32, i32 } %37, 2, !dbg !17
+  %43 = bitcast i32 %42 to <2 x bfloat>, !dbg !17
+  %44 = extractvalue { i32, i32, i32, i32 } %37, 3, !dbg !17
+  %45 = bitcast i32 %44 to <2 x bfloat>, !dbg !17
+  %46 = fpext <2 x bfloat> %26 to <2 x float>, !dbg !18
+  %47 = fmul <2 x float> %46, %46, !dbg !19
+  %48 = fpext <2 x bfloat> %39 to <2 x float>, !dbg !18
+  %49 = fmul <2 x float> %48, %48, !dbg !19
+  %50 = fadd <2 x float> %47, %49, !dbg !20
+  %51 = fpext <2 x bfloat> %28 to <2 x float>, !dbg !18
+  %52 = fmul <2 x float> %51, %51, !dbg !19
+  %53 = fpext <2 x bfloat> %41 to <2 x float>, !dbg !18
+  %54 = fmul <2 x float> %53, %53, !dbg !19
+  %55 = fadd <2 x float> %52, %54, !dbg !20
+  %56 = fpext <2 x bfloat> %30 to <2 x float>, !dbg !18
+  %57 = fmul <2 x float> %56, %56, !dbg !19
+  %58 = fpext <2 x bfloat> %43 to <2 x float>, !dbg !18
+  %59 = fmul <2 x float> %58, %58, !dbg !19
+  %60 = fadd <2 x float> %57, %59, !dbg !20
+  %61 = fpext <2 x bfloat> %32 to <2 x float>, !dbg !18
+  %62 = fmul <2 x float> %61, %61, !dbg !19
+  %63 = fpext <2 x bfloat> %45 to <2 x float>, !dbg !18
+  %64 = fmul <2 x float> %63, %63, !dbg !19
+  %65 = fadd <2 x float> %62, %64, !dbg !20
+  %66 = and i32 %9, 63, !dbg !9
+  %67 = or disjoint i32 %8, %66, !dbg !10
+  %shift = shufflevector <2 x float> %50, <2 x float> poison, <2 x i32> <i32 1, i32 poison>, !dbg !21
+  %foldExtExtBinop = fadd <2 x float> %50, %shift, !dbg !21
+  %foldExtExtBinop9 = fadd <2 x float> %55, %foldExtExtBinop, !dbg !21
+  %shift11 = shufflevector <2 x float> %55, <2 x float> poison, <2 x i32> <i32 1, i32 poison>, !dbg !21
+  %foldExtExtBinop12 = fadd <2 x float> %shift11, %foldExtExtBinop9, !dbg !21
+  %foldExtExtBinop14 = fadd <2 x float> %60, %foldExtExtBinop12, !dbg !21
+  %shift16 = shufflevector <2 x float> %60, <2 x float> poison, <2 x i32> <i32 1, i32 poison>, !dbg !21
+  %foldExtExtBinop17 = fadd <2 x float> %shift16, %foldExtExtBinop14, !dbg !21
+  %foldExtExtBinop19 = fadd <2 x float> %65, %foldExtExtBinop17, !dbg !21
+  %shift21 = shufflevector <2 x float> %65, <2 x float> poison, <2 x i32> <i32 1, i32 poison>, !dbg !21
+  %foldExtExtBinop22 = fadd <2 x float> %shift21, %foldExtExtBinop19, !dbg !21
+  %68 = extractelement <2 x float> %foldExtExtBinop22, i64 0, !dbg !21
+  %69 = bitcast float %68 to i32, !dbg !24
+  %70 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %69, i32 4, i32 31), !dbg !24
+  %71 = bitcast i32 %70 to float, !dbg !24
+  %72 = fadd float %68, %71, !dbg !21
+  %73 = bitcast float %72 to i32, !dbg !24
+  %74 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %73, i32 2, i32 31), !dbg !24
+  %75 = bitcast i32 %74 to float, !dbg !24
+  %76 = fadd float %72, %75, !dbg !21
+  %77 = bitcast float %76 to i32, !dbg !24
+  %78 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %77, i32 1, i32 31), !dbg !24
+  %79 = bitcast i32 %78 to float, !dbg !24
+  %80 = fadd float %76, %79, !dbg !21
+  %81 = lshr exact i32 %10, 1, !dbg !27
+  %82 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %81, !dbg !27
+  store float %80, ptr addrspace(3) %82, align 4, !dbg !27
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !27
+  %83 = shl nuw nsw i32 %66, 2, !dbg !27
+  %84 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %83, !dbg !27
+  %85 = load i32, ptr addrspace(3) %84, align 4, !dbg !27
+  %86 = sext i32 %67 to i64, !dbg !28
+  %87 = getelementptr float, ptr addrspace(1) %1, i64 %86, !dbg !28
+  %88 = and i32 %9, 448, !dbg !29
+  %89 = icmp eq i32 %88, 0, !dbg !29
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %85, ptr addrspace(1) %87, i1 %89) #4, !dbg !29
+  ret void, !dbg !30
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
+
+attributes #0 = { nounwind "nvvm.reqntid"="512" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { convergent nocallback nounwind }
+attributes #4 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm_view_1", linkageName: "triton_red_fused__fused_rms_norm_view_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 23, column: 28, scope: !4)
+!8 = !DILocation(line: 23, column: 33, scope: !4)
+!9 = !DILocation(line: 24, column: 44, scope: !4)
+!10 = !DILocation(line: 24, column: 23, scope: !4)
+!11 = !DILocation(line: 26, column: 37, scope: !4)
+!12 = !DILocation(line: 29, column: 19, scope: !4)
+!13 = !DILocation(line: 28, column: 19, scope: !4)
+!14 = !DILocation(line: 38, column: 45, scope: !4)
+!15 = !DILocation(line: 38, column: 56, scope: !4)
+!16 = !DILocation(line: 38, column: 34, scope: !4)
+!17 = !DILocation(line: 38, column: 61, scope: !4)
+!18 = !DILocation(line: 38, column: 115, scope: !4)
+!19 = !DILocation(line: 40, column: 22, scope: !4)
+!20 = !DILocation(line: 42, column: 23, scope: !4)
+!21 = !DILocation(line: 263, column: 15, scope: !22, inlinedAt: !24)
+!22 = distinct !DILexicalBlockFile(scope: !4, file: !23, discriminator: 0)
+!23 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!24 = !DILocation(line: 293, column: 36, scope: !22, inlinedAt: !25)
+!25 = !DILocation(line: 44, column: 25, scope: !26)
+!26 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0)
+!27 = !DILocation(line: 44, column: 28, scope: !4)
+!28 = !DILocation(line: 45, column: 25, scope: !4)
+!29 = !DILocation(line: 45, column: 36, scope: !4)
+!30 = !DILocation(line: 45, column: 4, scope: !4)
diff --git a/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.ptx b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..fd61ffb64cd6bdd9aba3f2c2cce15adcc77f5dc3
--- /dev/null
+++ b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.ptx
@@ -0,0 +1,557 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused__fused_rms_norm_view_1 // -- Begin function triton_red_fused__fused_rms_norm_view_1
+.extern .shared .align 16 .b8 global_smem[];
+                                        // @triton_red_fused__fused_rms_norm_view_1
+.visible .entry triton_red_fused__fused_rms_norm_view_1(
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_1,
+	.param .u32 triton_red_fused__fused_rms_norm_view_1_param_2,
+	.param .u32 triton_red_fused__fused_rms_norm_view_1_param_3,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_4,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_5
+)
+.reqntid 512
+{
+	.reg .pred 	%p<3>;
+	.reg .b16 	%rs<17>;
+	.reg .b32 	%r<81>;
+	.reg .b64 	%rd<8>;
+	.loc	1 18 0                          // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd6, [triton_red_fused__fused_rms_norm_view_1_param_0];
+	ld.param.b64 	%rd7, [triton_red_fused__fused_rms_norm_view_1_param_1];
+$L__tmp0:
+	.loc	1 23 28                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:23:28
+	mov.u32 	%r11, %ctaid.x;
+	.loc	1 23 33                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:23:33
+	shl.b32 	%r12, %r11, 6;
+	.loc	1 24 44                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:44
+	mov.u32 	%r13, %tid.x;
+	and.b32 	%r14, %r13, 504;
+	bfe.u32 	%r15, %r13, 3, 6;
+	.loc	1 24 23                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:23
+	or.b32 	%r16, %r15, %r12;
+	.loc	1 26 37                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:26:37
+	shl.b32 	%r17, %r13, 3;
+	and.b32 	%r18, %r17, 56;
+	.loc	1 29 19                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:29:19
+	bfe.s32 	%r19, %r11, 25, 1;
+	shr.u32 	%r20, %r19, 27;
+	add.s32 	%r21, %r16, %r20;
+	shr.u32 	%r22, %r21, 5;
+	.loc	1 28 19                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:28:19
+	and.b32 	%r23, %r21, 33554400;
+	sub.s32 	%r24, %r16, %r23;
+	.loc	1 38 45                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:45
+	shl.b32 	%r25, %r24, 7;
+	or.b32 	%r26, %r25, %r18;
+	mad.lo.s32 	%r27, %r22, 12288, %r26;
+	.loc	1 38 34                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:34
+	mad.wide.s32 	%rd1, %r27, 2, %rd6;
+	.loc	1 38 61                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:61
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r5, 0;
+	mov.pred 	%p1, -1;
+	// begin inline asm
+	mov.u32 %r1, %r5;
+	mov.u32 %r2, %r5;
+	mov.u32 %r3, %r5;
+	mov.u32 %r4, %r5;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	.loc	1 38 34                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:34
+	add.s64 	%rd3, %rd1, 128;
+	.loc	1 38 61                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:61
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd4, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r6, %r5;
+	mov.u32 %r7, %r5;
+	mov.u32 %r8, %r5;
+	mov.u32 %r9, %r5;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd3 + 0 ], %rd4;
+	// end inline asm
+	.loc	1 38 115                        // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:115
+	mov.b32 	{%rs1, %rs2}, %r1;
+	cvt.f32.bf16 	%r28, %rs1;
+	cvt.f32.bf16 	%r29, %rs2;
+	mov.b32 	{%rs3, %rs4}, %r6;
+	cvt.f32.bf16 	%r30, %rs4;
+	cvt.f32.bf16 	%r31, %rs3;
+	.loc	1 40 22                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:40:22
+	mul.f32 	%r32, %r31, %r31;
+	mul.f32 	%r33, %r30, %r30;
+	.loc	1 42 23                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:42:23
+	fma.rn.f32 	%r34, %r29, %r29, %r33;
+	fma.rn.f32 	%r35, %r28, %r28, %r32;
+	.loc	1 38 115                        // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:115
+	mov.b32 	{%rs5, %rs6}, %r2;
+	cvt.f32.bf16 	%r36, %rs5;
+	cvt.f32.bf16 	%r37, %rs6;
+	mov.b32 	{%rs7, %rs8}, %r7;
+	cvt.f32.bf16 	%r38, %rs8;
+	cvt.f32.bf16 	%r39, %rs7;
+	.loc	1 40 22                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:40:22
+	mul.f32 	%r40, %r39, %r39;
+	mul.f32 	%r41, %r38, %r38;
+	.loc	1 42 23                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:42:23
+	fma.rn.f32 	%r42, %r37, %r37, %r41;
+	fma.rn.f32 	%r43, %r36, %r36, %r40;
+	.loc	1 38 115                        // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:115
+	mov.b32 	{%rs9, %rs10}, %r3;
+	cvt.f32.bf16 	%r44, %rs9;
+	cvt.f32.bf16 	%r45, %rs10;
+	mov.b32 	{%rs11, %rs12}, %r8;
+	cvt.f32.bf16 	%r46, %rs12;
+	cvt.f32.bf16 	%r47, %rs11;
+	.loc	1 40 22                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:40:22
+	mul.f32 	%r48, %r47, %r47;
+	mul.f32 	%r49, %r46, %r46;
+	.loc	1 42 23                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:42:23
+	fma.rn.f32 	%r50, %r45, %r45, %r49;
+	fma.rn.f32 	%r51, %r44, %r44, %r48;
+	.loc	1 38 115                        // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:115
+	mov.b32 	{%rs13, %rs14}, %r4;
+	cvt.f32.bf16 	%r52, %rs13;
+	cvt.f32.bf16 	%r53, %rs14;
+	mov.b32 	{%rs15, %rs16}, %r9;
+	cvt.f32.bf16 	%r54, %rs16;
+	cvt.f32.bf16 	%r55, %rs15;
+	.loc	1 40 22                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:40:22
+	mul.f32 	%r56, %r55, %r55;
+	mul.f32 	%r57, %r54, %r54;
+	.loc	1 42 23                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:42:23
+	fma.rn.f32 	%r58, %r53, %r53, %r57;
+	fma.rn.f32 	%r59, %r52, %r52, %r56;
+	.loc	1 24 44                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:44
+	and.b32 	%r60, %r13, 63;
+	.loc	1 24 23                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:23
+	or.b32 	%r61, %r12, %r60;
+$L__tmp1:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	add.f32 	%r62, %r35, %r34;
+	add.f32 	%r63, %r43, %r62;
+	add.f32 	%r64, %r42, %r63;
+	add.f32 	%r65, %r51, %r64;
+	add.f32 	%r66, %r50, %r65;
+	add.f32 	%r67, %r59, %r66;
+	add.f32 	%r68, %r58, %r67;
+$L__tmp2:
+	.loc	2 293 36                        // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ]
+	shfl.sync.bfly.b32 	%r69, %r68, 4, 31, -1;
+$L__tmp3:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	add.f32 	%r70, %r68, %r69;
+$L__tmp4:
+	.loc	2 293 36                        // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ]
+	shfl.sync.bfly.b32 	%r71, %r70, 2, 31, -1;
+$L__tmp5:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	add.f32 	%r72, %r70, %r71;
+$L__tmp6:
+	.loc	2 293 36                        // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ]
+	shfl.sync.bfly.b32 	%r73, %r72, 1, 31, -1;
+$L__tmp7:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	add.f32 	%r74, %r72, %r73;
+$L__tmp8:
+	.loc	1 44 28                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:28
+	shr.u32 	%r75, %r14, 1;
+	mov.b32 	%r76, global_smem;
+	add.s32 	%r77, %r76, %r75;
+	st.shared.b32 	[%r77], %r74;
+	bar.sync 	0;
+	shl.b32 	%r78, %r60, 2;
+	add.s32 	%r79, %r76, %r78;
+	ld.shared.b32 	%r10, [%r79];
+	.loc	1 45 25                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:25
+	mad.wide.s32 	%rd5, %r61, 4, %rd7;
+	.loc	1 45 36                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:36
+	and.b32 	%r80, %r13, 448;
+	setp.eq.b32 	%p2, %r80, 0;
+	// begin inline asm
+	@%p2 st.global.b32 [ %rd5 + 0 ], { %r10 };
+	// end inline asm
+	.loc	1 45 4                          // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:4
+	ret;
+$L__tmp9:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 5                                   // DW_FORM_data2
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 339                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x14c DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 113
+.b8 105
+.b8 116
+.b8 120
+.b8 53
+.b8 104
+.b8 119
+.b8 117
+.b8 112
+.b8 107
+.b8 98
+.b8 106
+.b8 109
+.b8 99
+.b8 115
+.b8 111
+.b8 121
+.b8 107
+.b8 113
+.b8 101
+.b8 112
+.b8 122
+.b8 113
+.b8 99
+.b8 55
+.b8 122
+.b8 99
+.b8 120
+.b8 106
+.b8 99
+.b8 98
+.b8 53
+.b8 97
+.b8 99
+.b8 113
+.b8 107
+.b8 105
+.b8 55
+.b8 122
+.b8 99
+.b8 115
+.b8 106
+.b8 105
+.b8 102
+.b8 114
+.b8 110
+.b8 114
+.b8 122
+.b8 99
+.b8 114
+.b8 114
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 113
+.b8 105
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x2a DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 114
+.b8 109
+.b8 115
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 118
+.b8 105
+.b8 101
+.b8 119
+.b8 95
+.b8 49
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x10e:0x48 DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x123:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp8                           // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 44                                  // DW_AT_call_line
+.b8 25                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x13b:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp8                           // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.source b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.source
new file mode 100644
index 0000000000000000000000000000000000000000..529a4488127bba20b5fc66767666a84ea578ac2b
--- /dev/null
+++ b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.source
@@ -0,0 +1,167 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0)
+#loc35 = loc(unknown)
+#loc38 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0)
+#loc42 = loc("in_ptr0"(#loc))
+#loc43 = loc("out_ptr0"(#loc))
+#loc44 = loc("xnumel"(#loc))
+#loc45 = loc("r0_numel"(#loc))
+#loc74 = loc("input"(#loc33))
+#loc75 = loc("a"(#loc38))
+#loc76 = loc("b"(#loc38))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 65536 : i32 loc(#loc46)
+    %r0_numel_1 = arith.constant 128 : i32 loc(#loc47)
+    %xoffset = tt.get_program_id x : i32 loc(#loc48)
+    %xoffset_2 = arith.constant 64 : i32 loc(#loc49)
+    %xoffset_3 = arith.constant 64 : i32 loc(#loc49)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc49)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc50)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc51)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc52)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc52)
+    %xmask = arith.constant true loc(#loc53)
+    %xmask_8 = arith.constant dense<true> : tensor<64x64xi1> loc(#loc53)
+    %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc54)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc55)
+    %x0 = arith.constant 32 : i32 loc(#loc56)
+    %x0_10 = arith.constant 32 : i32 loc(#loc56)
+    %x0_11 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc56)
+    %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc56)
+    %x1 = arith.constant 32 : i32 loc(#loc57)
+    %x1_13 = arith.constant 32 : i32 loc(#loc57)
+    %x1_14 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc57)
+    %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc57)
+    %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc58)
+    %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc58)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc14)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc14)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14)
+    %2 = arith.bitcast %c64_i32 : i32 to i32 loc(#loc14)
+    %3 = ub.poison : i32 loc(#loc14)
+    %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_19 = %_tmp4_16) -> (tensor<64x64xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc60)
+      %r0_index_20 = arith.addi %r0_index, %r0_base_9 : tensor<1x64xi32> loc(#loc60)
+      %r0_mask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc61)
+      %r0_mask_21 = arith.cmpi slt, %r0_index_20, %r0_mask : tensor<1x64xi32> loc(#loc61)
+      %tmp0 = arith.constant 128 : i32 loc(#loc62)
+      %tmp0_22 = arith.constant 128 : i32 loc(#loc62)
+      %tmp0_23 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc62)
+      %tmp0_24 = arith.muli %tmp0_23, %x0_12 : tensor<64x1xi32> loc(#loc62)
+      %tmp0_25 = tt.broadcast %r0_index_20 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc63)
+      %tmp0_26 = tt.broadcast %tmp0_24 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc63)
+      %tmp0_27 = arith.addi %tmp0_25, %tmp0_26 : tensor<64x64xi32> loc(#loc63)
+      %tmp0_28 = arith.constant 12288 : i32 loc(#loc64)
+      %tmp0_29 = arith.constant 12288 : i32 loc(#loc64)
+      %tmp0_30 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc64)
+      %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<64x1xi32> loc(#loc64)
+      %tmp0_32 = tt.broadcast %tmp0_31 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc65)
+      %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<64x64xi32> loc(#loc65)
+      %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc66)
+      %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc66)
+      %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc67)
+      %tmp0_37 = tt.broadcast %r0_mask_21 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc67)
+      %tmp0_38 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc67)
+      %tmp0_39 = arith.truncf %tmp0_38 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc67)
+      %tmp0_40 = tt.load %tmp0_35, %tmp0_37, %tmp0_39 evictionPolicy = evict_first : tensor<64x64x!tt.ptr<bf16>> loc(#loc67)
+      %tmp0_41 = arith.extf %tmp0_40 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc68)
+      %tmp2 = arith.mulf %tmp0_41, %tmp0_41 : tensor<64x64xf32> loc(#loc69)
+      %tmp5 = arith.addf %_tmp4_19, %tmp2 : tensor<64x64xf32> loc(#loc70)
+      %_tmp4_42 = tt.broadcast %r0_mask_21 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc71)
+      %_tmp4_43 = arith.select %_tmp4_42, %tmp5, %_tmp4_19 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc71)
+      scf.yield %_tmp4_43 : tensor<64x64xf32> loc(#loc27)
+    } loc(#loc59)
+    %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc72)
+    %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc73)
+    %4 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc30)
+    %5 = tt.addptr %4, %xindex_7 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc30)
+    tt.store %5, %tmp4_18 : tensor<64x1x!tt.ptr<f32>> loc(#loc31)
+    tt.return loc(#loc32)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.sum__fp32S64_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x64xf32> loc("input"(#loc33))) -> tensor<64xf32> attributes {noinline = false} {
+    %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc34)
+      tt.reduce.return %2 : f32 loc(#loc34)
+    }) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc34)
+    tt.return %0 : tensor<64xf32> loc(#loc36)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<64xf32> loc(#loc37)
+    tt.return %1 : tensor<64xf32> loc(#loc37)
+  } loc(#loc33)
+  tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc38)), %b: f32 loc("b"(#loc38))) -> f32 attributes {noinline = false} {
+    %0 = arith.addf %a, %b : f32 loc(#loc39)
+    tt.return %0 : f32 loc(#loc40)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : f32 loc(#loc41)
+    tt.return %1 : f32 loc(#loc41)
+  } loc(#loc38)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":25:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":30:43)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:8)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11)
+#loc37 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4)
+#loc39 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc40 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11)
+#loc41 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4)
+#loc46 = loc("xnumel"(#loc1))
+#loc47 = loc("r0_numel"(#loc2))
+#loc48 = loc("xoffset"(#loc3))
+#loc49 = loc("xoffset"(#loc4))
+#loc50 = loc("xindex"(#loc5))
+#loc51 = loc("xindex"(#loc6))
+#loc52 = loc("xindex"(#loc7))
+#loc53 = loc("xmask"(#loc8))
+#loc54 = loc("r0_base"(#loc9))
+#loc55 = loc("r0_base"(#loc10))
+#loc56 = loc("x0"(#loc11))
+#loc57 = loc("x1"(#loc12))
+#loc58 = loc("_tmp4"(#loc13))
+#loc59 = loc("_tmp4"(#loc14))
+#loc60 = loc("r0_index"(#loc15))
+#loc61 = loc("r0_mask"(#loc16))
+#loc62 = loc("tmp0"(#loc17))
+#loc63 = loc("tmp0"(#loc18))
+#loc64 = loc("tmp0"(#loc19))
+#loc65 = loc("tmp0"(#loc20))
+#loc66 = loc("tmp0"(#loc21))
+#loc67 = loc("tmp0"(#loc22))
+#loc68 = loc("tmp0"(#loc23))
+#loc69 = loc("tmp2"(#loc24))
+#loc70 = loc("tmp5"(#loc25))
+#loc71 = loc("_tmp4"(#loc26))
+#loc72 = loc("tmp4"(#loc28))
+#loc73 = loc("tmp4"(#loc29))
diff --git a/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.ttgir b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..36102b4bece7ffab66468907d370b1e032cf03c9
--- /dev/null
+++ b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.ttgir
@@ -0,0 +1,120 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [16, 1], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 8], order = [0, 1]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0)
+#loc1 = loc(unknown)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25)
+#loc30 = loc("in_ptr0"(#loc))
+#loc31 = loc("out_ptr0"(#loc))
+#loc32 = loc("xnumel"(#loc))
+#loc33 = loc("r0_numel"(#loc))
+#loc54 = loc("tmp4"(#loc24))
+#loc57 = loc(callsite(#loc1 at #loc54))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<128> : tensor<1x64xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<12288> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x64xbf16, #blocked> loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc34)
+    %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc35)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc36)
+    %xindex_6 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc36)
+    %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc36)
+    %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc36)
+    %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked> loc(#loc37)
+    %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc37)
+    %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<64x1xi32, #blocked> loc(#loc37)
+    %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<64x1xi32, #blocked1> loc(#loc37)
+    %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc38)
+    %r0_base_13 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> loc(#loc38)
+    %x0 = arith.remsi %xindex_11, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc39)
+    %x1 = arith.divsi %xindex_11, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc40)
+    %tmp0 = arith.muli %x0, %cst_0 : tensor<64x1xi32, #blocked> loc(#loc41)
+    %tmp0_14 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc42)
+    %tmp0_15 = arith.muli %x1, %cst_1 : tensor<64x1xi32, #blocked> loc(#loc43)
+    %tmp0_16 = tt.broadcast %tmp0_15 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc44)
+    %tmp0_17 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>, #blocked> loc(#loc45)
+    %_tmp4 = scf.for %_tmp4_20 = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%arg5 = %cst_4) -> (tensor<64x64xf32, #blocked>)  : i32 {
+      %r0_index = tt.splat %_tmp4_20 : i32 -> tensor<1x64xi32, #blocked> loc(#loc47)
+      %r0_index_21 = arith.addi %r0_index, %r0_base_13 : tensor<1x64xi32, #blocked> loc(#loc47)
+      %r0_mask = arith.cmpi slt, %r0_index_21, %cst : tensor<1x64xi32, #blocked> loc(#loc48)
+      %tmp0_22 = tt.broadcast %r0_index_21 : tensor<1x64xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc42)
+      %tmp0_23 = arith.addi %tmp0_22, %tmp0_14 : tensor<64x64xi32, #blocked> loc(#loc42)
+      %tmp0_24 = arith.addi %tmp0_23, %tmp0_16 : tensor<64x64xi32, #blocked> loc(#loc44)
+      %tmp0_25 = tt.addptr %tmp0_17, %tmp0_24 : tensor<64x64x!tt.ptr<bf16>, #blocked>, tensor<64x64xi32, #blocked> loc(#loc45)
+      %tmp0_26 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc49)
+      %tmp0_27 = tt.load %tmp0_25, %tmp0_26, %cst_3 evictionPolicy = evict_first : tensor<64x64x!tt.ptr<bf16>, #blocked> loc(#loc49)
+      %tmp0_28 = arith.extf %tmp0_27 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc50)
+      %tmp2 = arith.mulf %tmp0_28, %tmp0_28 : tensor<64x64xf32, #blocked> loc(#loc51)
+      %tmp5 = arith.addf %arg5, %tmp2 : tensor<64x64xf32, #blocked> loc(#loc52)
+      %_tmp4_29 = arith.select %tmp0_26, %tmp5, %arg5 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked> loc(#loc53)
+      scf.yield %_tmp4_29 : tensor<64x64xf32, #blocked> loc(#loc22)
+    } loc(#loc46)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_20: f32 loc(callsite(#loc1 at #loc54)), %tmp4_21: f32 loc(callsite(#loc1 at #loc54))):
+      %tmp4_22 = arith.addf %tmp4_20, %tmp4_21 : f32 loc(#loc58)
+      tt.reduce.return %tmp4_22 : f32 loc(#loc56)
+    }) : (tensor<64x64xf32, #blocked>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc56)
+    %tmp4_18 = ttg.convert_layout %tmp4 : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc55)
+    %tmp4_19 = tt.expand_dims %tmp4_18 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc55)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>, #blocked1> loc(#loc27)
+    %1 = tt.addptr %0, %xindex_12 : tensor<64x1x!tt.ptr<f32>, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc27)
+    tt.store %1, %tmp4_19 : tensor<64x1x!tt.ptr<f32>, #blocked1> loc(#loc28)
+    tt.return loc(#loc29)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:8)
+#loc23 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4)
+#loc34 = loc("xoffset"(#loc2))
+#loc35 = loc("xoffset"(#loc3))
+#loc36 = loc("xindex"(#loc4))
+#loc37 = loc("xindex"(#loc5))
+#loc38 = loc("r0_base"(#loc6))
+#loc39 = loc("x0"(#loc7))
+#loc40 = loc("x1"(#loc8))
+#loc41 = loc("tmp0"(#loc9))
+#loc42 = loc("tmp0"(#loc10))
+#loc43 = loc("tmp0"(#loc11))
+#loc44 = loc("tmp0"(#loc12))
+#loc45 = loc("tmp0"(#loc13))
+#loc46 = loc("_tmp4"(#loc14))
+#loc47 = loc("r0_index"(#loc15))
+#loc48 = loc("r0_mask"(#loc16))
+#loc49 = loc("tmp0"(#loc17))
+#loc50 = loc("tmp0"(#loc18))
+#loc51 = loc("tmp2"(#loc19))
+#loc52 = loc("tmp5"(#loc20))
+#loc53 = loc("_tmp4"(#loc21))
+#loc55 = loc("tmp4"(#loc26))
+#loc56 = loc(callsite(#loc23 at #loc54))
+#loc58 = loc(callsite(#loc25 at #loc56))
diff --git a/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.ttir b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..d64e870d3c119a1a0afa92abf6aa75b33eb07d99
--- /dev/null
+++ b/triton/7VRSMQ6HCWL4FYVSALSUIPZQQ2RD3JJX7SZDTD6UHJE5ZNP2MUVA/triton_red_fused__fused_rms_norm_view_1.ttir
@@ -0,0 +1,114 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0)
+#loc1 = loc(unknown)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25)
+#loc31 = loc("in_ptr0"(#loc))
+#loc32 = loc("out_ptr0"(#loc))
+#loc33 = loc("xnumel"(#loc))
+#loc34 = loc("r0_numel"(#loc))
+#loc56 = loc("tmp4"(#loc25))
+#loc59 = loc(callsite(#loc1 at #loc56))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<64x64xbf16> loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc2)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc2)
+    %cst_0 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc1)
+    %cst_1 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1)
+    %cst_2 = arith.constant dense<128> : tensor<1x64xi32> loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc35)
+    %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc36)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc37)
+    %xindex_6 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc38)
+    %xindex_7 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32> loc(#loc39)
+    %xindex_8 = arith.addi %xindex_7, %xindex_6 : tensor<64x1xi32> loc(#loc39)
+    %r0_base = tt.expand_dims %xindex {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc40)
+    %x0 = arith.remsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc41)
+    %x1 = arith.divsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc42)
+    %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%_tmp4_10 = %cst_3) -> (tensor<64x64xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc44)
+      %r0_index_11 = arith.addi %r0_index, %r0_base : tensor<1x64xi32> loc(#loc44)
+      %r0_mask = arith.cmpi slt, %r0_index_11, %cst_2 : tensor<1x64xi32> loc(#loc45)
+      %tmp0 = arith.muli %x0, %cst_1 : tensor<64x1xi32> loc(#loc46)
+      %tmp0_12 = tt.broadcast %r0_index_11 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc47)
+      %tmp0_13 = tt.broadcast %tmp0 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc47)
+      %tmp0_14 = arith.addi %tmp0_12, %tmp0_13 : tensor<64x64xi32> loc(#loc47)
+      %tmp0_15 = arith.muli %x1, %cst_0 : tensor<64x1xi32> loc(#loc48)
+      %tmp0_16 = tt.broadcast %tmp0_15 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc49)
+      %tmp0_17 = arith.addi %tmp0_14, %tmp0_16 : tensor<64x64xi32> loc(#loc49)
+      %tmp0_18 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc50)
+      %tmp0_19 = tt.addptr %tmp0_18, %tmp0_17 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc50)
+      %tmp0_20 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc51)
+      %tmp0_21 = tt.load %tmp0_19, %tmp0_20, %cst evictionPolicy = evict_first : tensor<64x64x!tt.ptr<bf16>> loc(#loc51)
+      %tmp0_22 = arith.extf %tmp0_21 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc52)
+      %tmp2 = arith.mulf %tmp0_22, %tmp0_22 : tensor<64x64xf32> loc(#loc53)
+      %tmp5 = arith.addf %_tmp4_10, %tmp2 : tensor<64x64xf32> loc(#loc54)
+      %_tmp4_23 = arith.select %tmp0_20, %tmp5, %_tmp4_10 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc55)
+      scf.yield %_tmp4_23 : tensor<64x64xf32> loc(#loc23)
+    } loc(#loc43)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_10: f32 loc(callsite(#loc1 at #loc56)), %tmp4_11: f32 loc(callsite(#loc1 at #loc56))):
+      %tmp4_12 = arith.addf %tmp4_10, %tmp4_11 : f32 loc(#loc60)
+      tt.reduce.return %tmp4_12 : f32 loc(#loc58)
+    }) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc58)
+    %tmp4_9 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc57)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc28)
+    %1 = tt.addptr %0, %xindex_8 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc28)
+    tt.store %1, %tmp4_9 : tensor<64x1x!tt.ptr<f32>> loc(#loc29)
+    tt.return loc(#loc30)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":32:43)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":33:31)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:8)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc26 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4)
+#loc35 = loc("xoffset"(#loc3))
+#loc36 = loc("xoffset"(#loc4))
+#loc37 = loc("xindex"(#loc5))
+#loc38 = loc("xindex"(#loc6))
+#loc39 = loc("xindex"(#loc7))
+#loc40 = loc("r0_base"(#loc8))
+#loc41 = loc("x0"(#loc9))
+#loc42 = loc("x1"(#loc10))
+#loc43 = loc("_tmp4"(#loc2))
+#loc44 = loc("r0_index"(#loc11))
+#loc45 = loc("r0_mask"(#loc12))
+#loc46 = loc("tmp0"(#loc13))
+#loc47 = loc("tmp0"(#loc14))
+#loc48 = loc("tmp0"(#loc15))
+#loc49 = loc("tmp0"(#loc16))
+#loc50 = loc("tmp0"(#loc17))
+#loc51 = loc("tmp0"(#loc18))
+#loc52 = loc("tmp0"(#loc19))
+#loc53 = loc("tmp2"(#loc20))
+#loc54 = loc("tmp5"(#loc21))
+#loc55 = loc("_tmp4"(#loc22))
+#loc57 = loc("tmp4"(#loc27))
+#loc58 = loc(callsite(#loc24 at #loc56))
+#loc60 = loc(callsite(#loc26 at #loc58))
diff --git a/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/__grp__triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/__grp__triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..0c0622ad43d9b5c2f62e6ae3993c7449a33e3595
--- /dev/null
+++ b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/__grp__triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.source", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttir", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttgir", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.llir", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ptx", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.cubin": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.cubin", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json"}}
\ No newline at end of file
diff --git a/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.cubin b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..da34a1a49cdf21cf19a41dff638a12c20a832309
Binary files /dev/null and b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.cubin differ
diff --git a/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..0238f8479df5f7e880d3fdcc2fa1d012ee7880bd
--- /dev/null
+++ b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.json
@@ -0,0 +1 @@
+{"hash": "fe4407d29e040f0e8efc23458aee0cbcf1535e9850d07966b0e7421f96d30e0d", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0"}
\ No newline at end of file
diff --git a/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.llir b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..8ef87e7332aadae0020d3ec548f4433f63c315ee
--- /dev/null
+++ b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.llir
@@ -0,0 +1,71 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 {
+  %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %7 = shl i32 %6, 10, !dbg !8
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %9 = shl nuw nsw i32 %8, 3, !dbg !9
+  %10 = and i32 %9, 1016, !dbg !9
+  %11 = or disjoint i32 %10, %7, !dbg !10
+  %12 = sdiv i32 %11, 128, !dbg !11
+  %13 = mul i32 %12, 128, !dbg !12
+  %.decomposed = sub i32 %11, %13, !dbg !12
+  %14 = srem i32 %12, 2304, !dbg !13
+  %15 = sdiv i32 %11, 294912, !dbg !14
+  %16 = shl nsw i32 %15, 7, !dbg !15
+  %17 = add nsw i32 %16, %.decomposed, !dbg !16
+  %18 = shl nsw i32 %14, 12, !dbg !17
+  %19 = add nsw i32 %17, %18, !dbg !18
+  %20 = sext i32 %19 to i64, !dbg !19
+  %21 = getelementptr bfloat, ptr addrspace(1) %0, i64 %20, !dbg !19
+  %22 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %21) #2, !dbg !20
+  %23 = extractvalue { i32, i32, i32, i32 } %22, 0, !dbg !20
+  %24 = extractvalue { i32, i32, i32, i32 } %22, 1, !dbg !20
+  %25 = extractvalue { i32, i32, i32, i32 } %22, 2, !dbg !20
+  %26 = extractvalue { i32, i32, i32, i32 } %22, 3, !dbg !20
+  %27 = sext i32 %11 to i64, !dbg !21
+  %28 = getelementptr bfloat, ptr addrspace(1) %1, i64 %27, !dbg !21
+  tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %23, i32 %24, i32 %25, i32 %26, ptr addrspace(1) %28) #2, !dbg !22
+  ret void, !dbg !23
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0", linkageName: "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 20, column: 28, scope: !4)
+!8 = !DILocation(line: 20, column: 33, scope: !4)
+!9 = !DILocation(line: 21, column: 36, scope: !4)
+!10 = !DILocation(line: 21, column: 23, scope: !4)
+!11 = !DILocation(line: 24, column: 21, scope: !4)
+!12 = !DILocation(line: 23, column: 19, scope: !4)
+!13 = !DILocation(line: 24, column: 28, scope: !4)
+!14 = !DILocation(line: 25, column: 19, scope: !4)
+!15 = !DILocation(line: 27, column: 39, scope: !4)
+!16 = !DILocation(line: 27, column: 35, scope: !4)
+!17 = !DILocation(line: 27, column: 49, scope: !4)
+!18 = !DILocation(line: 27, column: 44, scope: !4)
+!19 = !DILocation(line: 27, column: 30, scope: !4)
+!20 = !DILocation(line: 27, column: 54, scope: !4)
+!21 = !DILocation(line: 28, column: 25, scope: !4)
+!22 = !DILocation(line: 28, column: 36, scope: !4)
+!23 = !DILocation(line: 28, column: 4, scope: !4)
diff --git a/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ptx b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..2d5ab26549fe9d8870cfeed0aedfd9d6465e2dc3
--- /dev/null
+++ b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ptx
@@ -0,0 +1,332 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0 // -- Begin function triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0
+                                        // @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0
+.visible .entry triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0(
+	.param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_1,
+	.param .u32 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_4
+)
+.reqntid 128
+{
+	.reg .b32 	%r<31>;
+	.reg .b64 	%rd<5>;
+	.loc	1 18 0                          // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd3, [triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_0];
+	ld.param.b64 	%rd4, [triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0_param_1];
+$L__tmp0:
+	.loc	1 20 28                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:20:28
+	mov.u32 	%r5, %ctaid.x;
+	.loc	1 20 33                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:20:33
+	shl.b32 	%r6, %r5, 10;
+	.loc	1 21 36                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:21:36
+	mov.u32 	%r7, %tid.x;
+	shl.b32 	%r8, %r7, 3;
+	and.b32 	%r9, %r8, 1016;
+	.loc	1 21 23                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:21:23
+	or.b32 	%r10, %r9, %r6;
+	.loc	1 24 21                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:24:21
+	bfe.s32 	%r11, %r5, 21, 1;
+	shr.u32 	%r12, %r11, 25;
+	add.s32 	%r13, %r10, %r12;
+	shr.s32 	%r14, %r13, 7;
+	.loc	1 23 19                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:23:19
+	and.b32 	%r15, %r13, -128;
+	sub.s32 	%r16, %r10, %r15;
+	.loc	1 24 28                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:24:28
+	mul.hi.s32 	%r17, %r14, 954437177;
+	shr.u32 	%r18, %r17, 31;
+	shr.u32 	%r19, %r17, 9;
+	add.s32 	%r20, %r19, %r18;
+	mul.lo.s32 	%r21, %r20, 2304;
+	sub.s32 	%r22, %r14, %r21;
+	.loc	1 25 19                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:25:19
+	mul.hi.s32 	%r23, %r10, 954437177;
+	shr.u32 	%r24, %r23, 31;
+	shr.s32 	%r25, %r23, 16;
+	add.s32 	%r26, %r25, %r24;
+	.loc	1 27 39                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:27:39
+	shl.b32 	%r27, %r26, 7;
+	.loc	1 27 35                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:27:35
+	add.s32 	%r28, %r27, %r16;
+	.loc	1 27 49                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:27:49
+	shl.b32 	%r29, %r22, 12;
+	.loc	1 27 44                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:27:44
+	add.s32 	%r30, %r28, %r29;
+	.loc	1 27 30                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:27:30
+	mad.wide.s32 	%rd1, %r30, 2, %rd3;
+	.loc	1 27 54                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:27:54
+	// begin inline asm
+	mov.u32 %r1, 0x0;
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	mov.u32 %r4, 0x0;
+	ld.global.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ];
+	// end inline asm
+	.loc	1 28 25                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:28:25
+	mad.wide.s32 	%rd2, %r10, 2, %rd4;
+	.loc	1 28 36                         // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:28:36
+	// begin inline asm
+	st.global.v4.b32 [ %rd2 + 0 ], { %r1, %r2, %r3, %r4 };
+	// end inline asm
+	.loc	1 28 4                          // ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py:28:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 101
+.b8 105
+.b8 112
+.b8 106
+.b8 120
+.b8 97
+.b8 117
+.b8 115
+.b8 97
+.b8 117
+.b8 122
+.b8 108
+.b8 52
+.b8 109
+.b8 99
+.b8 99
+.b8 50
+.b8 51
+.b8 51
+.b8 102
+.b8 112
+.b8 101
+.b8 117
+.b8 98
+.b8 102
+.b8 115
+.b8 51
+.b8 117
+.b8 107
+.b8 53
+.b8 110
+.b8 105
+.b8 53
+.b8 98
+.b8 106
+.b8 113
+.b8 98
+.b8 108
+.b8 50
+.b8 113
+.b8 119
+.b8 116
+.b8 111
+.b8 119
+.b8 106
+.b8 119
+.b8 114
+.b8 108
+.b8 55
+.b8 99
+.b8 100
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 101
+.b8 105
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.source b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..1f405f12ca51bc6e06093fd082ab80276f47b60b
--- /dev/null
+++ b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.source
@@ -0,0 +1,90 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":18:0)
+#loc21 = loc("in_ptr0"(#loc))
+#loc22 = loc("out_ptr0"(#loc))
+#loc23 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 9437184 : i32 loc(#loc24)
+    %xoffset = tt.get_program_id x : i32 loc(#loc25)
+    %xoffset_1 = arith.constant 1024 : i32 loc(#loc26)
+    %xoffset_2 = arith.constant 1024 : i32 loc(#loc26)
+    %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc26)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc27)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc28)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc28)
+    %xmask = arith.constant true loc(#loc29)
+    %xmask_6 = arith.constant dense<true> : tensor<1024xi1> loc(#loc29)
+    %x0 = arith.constant 128 : i32 loc(#loc30)
+    %x0_7 = arith.constant 128 : i32 loc(#loc30)
+    %x0_8 = arith.constant dense<128> : tensor<1024xi32> loc(#loc30)
+    %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<1024xi32> loc(#loc30)
+    %x1 = arith.constant 128 : i32 loc(#loc31)
+    %x1_10 = arith.constant 128 : i32 loc(#loc31)
+    %x1_11 = arith.constant dense<128> : tensor<1024xi32> loc(#loc31)
+    %x1_12 = arith.divsi %xindex_5, %x1_11 : tensor<1024xi32> loc(#loc31)
+    %x1_13 = arith.constant 2304 : i32 loc(#loc32)
+    %x1_14 = arith.constant 2304 : i32 loc(#loc32)
+    %x1_15 = arith.constant dense<2304> : tensor<1024xi32> loc(#loc32)
+    %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<1024xi32> loc(#loc32)
+    %x2 = arith.constant 294912 : i32 loc(#loc33)
+    %x2_17 = arith.constant 294912 : i32 loc(#loc33)
+    %x2_18 = arith.constant dense<294912> : tensor<1024xi32> loc(#loc33)
+    %x2_19 = arith.divsi %xindex_5, %x2_18 : tensor<1024xi32> loc(#loc33)
+    %tmp0 = arith.constant 128 : i32 loc(#loc34)
+    %tmp0_20 = arith.constant 128 : i32 loc(#loc34)
+    %tmp0_21 = arith.constant dense<128> : tensor<1024xi32> loc(#loc34)
+    %tmp0_22 = arith.muli %tmp0_21, %x2_19 : tensor<1024xi32> loc(#loc34)
+    %tmp0_23 = arith.addi %x0_9, %tmp0_22 : tensor<1024xi32> loc(#loc35)
+    %tmp0_24 = arith.constant 4096 : i32 loc(#loc36)
+    %tmp0_25 = arith.constant 4096 : i32 loc(#loc36)
+    %tmp0_26 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc36)
+    %tmp0_27 = arith.muli %tmp0_26, %x1_16 : tensor<1024xi32> loc(#loc36)
+    %tmp0_28 = arith.addi %tmp0_23, %tmp0_27 : tensor<1024xi32> loc(#loc37)
+    %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc38)
+    %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc38)
+    %tmp0_31 = tt.load %tmp0_30 : tensor<1024x!tt.ptr<bf16>> loc(#loc39)
+    %tmp0_32 = arith.extf %tmp0_31 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc40)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc18)
+    %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc18)
+    %2 = arith.truncf %tmp0_32 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc19)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>> loc(#loc19)
+    tt.return loc(#loc20)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":22:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":23:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":24:21)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":24:28)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":25:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:39)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:35)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:49)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:44)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:30)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:54)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:63)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:25)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:36)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:4)
+#loc24 = loc("xnumel"(#loc1))
+#loc25 = loc("xoffset"(#loc2))
+#loc26 = loc("xoffset"(#loc3))
+#loc27 = loc("xindex"(#loc4))
+#loc28 = loc("xindex"(#loc5))
+#loc29 = loc("xmask"(#loc6))
+#loc30 = loc("x0"(#loc7))
+#loc31 = loc("x1"(#loc8))
+#loc32 = loc("x1"(#loc9))
+#loc33 = loc("x2"(#loc10))
+#loc34 = loc("tmp0"(#loc11))
+#loc35 = loc("tmp0"(#loc12))
+#loc36 = loc("tmp0"(#loc13))
+#loc37 = loc("tmp0"(#loc14))
+#loc38 = loc("tmp0"(#loc15))
+#loc39 = loc("tmp0"(#loc16))
+#loc40 = loc("tmp0"(#loc17))
diff --git a/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttgir b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..7031b76872929bb56852e7f1e9c3b24ec9ea06a7
--- /dev/null
+++ b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttgir
@@ -0,0 +1,66 @@
+#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":18:0)
+#loc19 = loc("in_ptr0"(#loc))
+#loc20 = loc("out_ptr0"(#loc))
+#loc21 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<128> : tensor<1024xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<2304> : tensor<1024xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<294912> : tensor<1024xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<4096> : tensor<1024xi32, #blocked> loc(#loc1)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc22)
+    %xoffset_3 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc23)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc24)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32, #blocked> loc(#loc25)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32, #blocked> loc(#loc25)
+    %x0 = arith.remsi %xindex_5, %cst : tensor<1024xi32, #blocked> loc(#loc26)
+    %x1 = arith.divsi %xindex_5, %cst : tensor<1024xi32, #blocked> loc(#loc27)
+    %x1_6 = arith.remsi %x1, %cst_0 : tensor<1024xi32, #blocked> loc(#loc28)
+    %x2 = arith.divsi %xindex_5, %cst_1 : tensor<1024xi32, #blocked> loc(#loc29)
+    %tmp0 = arith.muli %x2, %cst : tensor<1024xi32, #blocked> loc(#loc30)
+    %tmp0_7 = arith.addi %x0, %tmp0 : tensor<1024xi32, #blocked> loc(#loc31)
+    %tmp0_8 = arith.muli %x1_6, %cst_2 : tensor<1024xi32, #blocked> loc(#loc32)
+    %tmp0_9 = arith.addi %tmp0_7, %tmp0_8 : tensor<1024xi32, #blocked> loc(#loc33)
+    %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc34)
+    %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc34)
+    %tmp0_12 = tt.load %tmp0_11 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc35)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc16)
+    %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc16)
+    tt.store %1, %tmp0_12 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc17)
+    tt.return loc(#loc18)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":23:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":24:21)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":24:28)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":25:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:39)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:49)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:44)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:54)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:25)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:36)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:4)
+#loc22 = loc("xoffset"(#loc2))
+#loc23 = loc("xoffset"(#loc3))
+#loc24 = loc("xindex"(#loc4))
+#loc25 = loc("xindex"(#loc5))
+#loc26 = loc("x0"(#loc6))
+#loc27 = loc("x1"(#loc7))
+#loc28 = loc("x1"(#loc8))
+#loc29 = loc("x2"(#loc9))
+#loc30 = loc("tmp0"(#loc10))
+#loc31 = loc("tmp0"(#loc11))
+#loc32 = loc("tmp0"(#loc12))
+#loc33 = loc("tmp0"(#loc13))
+#loc34 = loc("tmp0"(#loc14))
+#loc35 = loc("tmp0"(#loc15))
diff --git a/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttir b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..4fe97caa03918e88f85e6f7e93683d6486703a3b
--- /dev/null
+++ b/triton/7ZCAPUU6AQHQ5DX4ENCYV3QMXTYVGXUYKDIHSZVQ45BB7FWTBYGQ/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0.ttir
@@ -0,0 +1,65 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":18:0)
+#loc19 = loc("in_ptr0"(#loc))
+#loc20 = loc("out_ptr0"(#loc))
+#loc21 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %tmp0 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc22)
+    %x2 = arith.constant dense<294912> : tensor<1024xi32> loc(#loc23)
+    %x1 = arith.constant dense<2304> : tensor<1024xi32> loc(#loc24)
+    %cst = arith.constant dense<128> : tensor<1024xi32> loc(#loc4)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc4)
+    %xoffset = tt.get_program_id x : i32 loc(#loc25)
+    %xoffset_0 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc26)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc27)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<1024xi32> loc(#loc28)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<1024xi32> loc(#loc28)
+    %x0 = arith.remsi %xindex_2, %cst : tensor<1024xi32> loc(#loc29)
+    %x1_3 = arith.divsi %xindex_2, %cst : tensor<1024xi32> loc(#loc30)
+    %x1_4 = arith.remsi %x1_3, %x1 : tensor<1024xi32> loc(#loc24)
+    %x2_5 = arith.divsi %xindex_2, %x2 : tensor<1024xi32> loc(#loc23)
+    %tmp0_6 = arith.muli %x2_5, %cst : tensor<1024xi32> loc(#loc31)
+    %tmp0_7 = arith.addi %x0, %tmp0_6 : tensor<1024xi32> loc(#loc32)
+    %tmp0_8 = arith.muli %x1_4, %tmp0 : tensor<1024xi32> loc(#loc22)
+    %tmp0_9 = arith.addi %tmp0_7, %tmp0_8 : tensor<1024xi32> loc(#loc33)
+    %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc34)
+    %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc34)
+    %tmp0_12 = tt.load %tmp0_11 : tensor<1024x!tt.ptr<bf16>> loc(#loc35)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc16)
+    %1 = tt.addptr %0, %xindex_2 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc16)
+    tt.store %1, %tmp0_12 : tensor<1024x!tt.ptr<bf16>> loc(#loc17)
+    tt.return loc(#loc18)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:49)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":25:19)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":24:28)
+#loc4 = loc(unknown)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":20:28)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":20:33)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":21:36)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":21:23)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":23:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":24:21)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:39)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:35)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:44)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":27:54)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:25)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:36)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ei/ceipjxausauzl4mcc233fpeubfs3uk5ni5bjqbl2qwtowjwrl7cd.py":28:4)
+#loc22 = loc("tmp0"(#loc1))
+#loc23 = loc("x2"(#loc2))
+#loc24 = loc("x1"(#loc3))
+#loc25 = loc("xoffset"(#loc5))
+#loc26 = loc("xoffset"(#loc6))
+#loc27 = loc("xindex"(#loc7))
+#loc28 = loc("xindex"(#loc8))
+#loc29 = loc("x0"(#loc9))
+#loc30 = loc("x1"(#loc10))
+#loc31 = loc("tmp0"(#loc11))
+#loc32 = loc("tmp0"(#loc12))
+#loc33 = loc("tmp0"(#loc13))
+#loc34 = loc("tmp0"(#loc14))
+#loc35 = loc("tmp0"(#loc15))
diff --git a/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/__grp__triton_red_fused__fused_rms_norm_view_0.json b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/__grp__triton_red_fused__fused_rms_norm_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..cd60d5946f236eb87845180c714237103291ecda
--- /dev/null
+++ b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/__grp__triton_red_fused__fused_rms_norm_view_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused__fused_rms_norm_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.source", "triton_red_fused__fused_rms_norm_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.ttir", "triton_red_fused__fused_rms_norm_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.ttgir", "triton_red_fused__fused_rms_norm_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.llir", "triton_red_fused__fused_rms_norm_view_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.ptx", "triton_red_fused__fused_rms_norm_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.cubin", "triton_red_fused__fused_rms_norm_view_0.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.json"}}
\ No newline at end of file
diff --git a/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.cubin b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..ad221bc8ff2da7aebe027994e2a35284823235d5
Binary files /dev/null and b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.cubin differ
diff --git a/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.json b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..c751eac3b3d6fab69ba84ce0a09e5c5cc800886b
--- /dev/null
+++ b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.json
@@ -0,0 +1 @@
+{"hash": "032c1ed5f78de1dcb11539cb878c06ddf1b03d2db1b1296b1d30afe556154989", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm_view_0"}
\ No newline at end of file
diff --git a/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.llir b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..5c07cf1b238e0f4f0a0639559007395130f6c1b6
--- /dev/null
+++ b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.llir
@@ -0,0 +1,161 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused__fused_rms_norm_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 {
+  %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %8 = shl i32 %7, 6, !dbg !8
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %10 = and i32 %9, 504, !dbg !9
+  %11 = lshr exact i32 %10, 3, !dbg !9
+  %12 = or disjoint i32 %11, %8, !dbg !10
+  %13 = shl nuw nsw i32 %9, 3, !dbg !11
+  %14 = and i32 %13, 56, !dbg !11
+  %15 = sdiv i32 %12, 32, !dbg !12
+  %16 = mul i32 %15, 32, !dbg !13
+  %.decomposed = sub i32 %12, %16, !dbg !13
+  %17 = shl nsw i32 %.decomposed, 7, !dbg !14
+  %18 = mul i32 %15, 12288, !dbg !15
+  %19 = or disjoint i32 %17, %14
+  %20 = add i32 %19, %18
+  %21 = sext i32 %20 to i64, !dbg !16
+  %22 = getelementptr bfloat, ptr addrspace(1) %0, i64 %21, !dbg !16
+  %23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !17
+  %24 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %22, i64 %23, i1 true) #4, !dbg !17
+  %25 = extractvalue { i32, i32, i32, i32 } %24, 0, !dbg !17
+  %26 = bitcast i32 %25 to <2 x bfloat>, !dbg !17
+  %27 = extractvalue { i32, i32, i32, i32 } %24, 1, !dbg !17
+  %28 = bitcast i32 %27 to <2 x bfloat>, !dbg !17
+  %29 = extractvalue { i32, i32, i32, i32 } %24, 2, !dbg !17
+  %30 = bitcast i32 %29 to <2 x bfloat>, !dbg !17
+  %31 = extractvalue { i32, i32, i32, i32 } %24, 3, !dbg !17
+  %32 = bitcast i32 %31 to <2 x bfloat>, !dbg !17
+  %33 = sext i32 %20 to i64, !dbg !16
+  %34 = getelementptr bfloat, ptr addrspace(1) %0, i64 %33, !dbg !16
+  %35 = getelementptr i8, ptr addrspace(1) %34, i64 128, !dbg !16
+  %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !17
+  %37 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %35, i64 %36, i1 true) #4, !dbg !17
+  %38 = extractvalue { i32, i32, i32, i32 } %37, 0, !dbg !17
+  %39 = bitcast i32 %38 to <2 x bfloat>, !dbg !17
+  %40 = extractvalue { i32, i32, i32, i32 } %37, 1, !dbg !17
+  %41 = bitcast i32 %40 to <2 x bfloat>, !dbg !17
+  %42 = extractvalue { i32, i32, i32, i32 } %37, 2, !dbg !17
+  %43 = bitcast i32 %42 to <2 x bfloat>, !dbg !17
+  %44 = extractvalue { i32, i32, i32, i32 } %37, 3, !dbg !17
+  %45 = bitcast i32 %44 to <2 x bfloat>, !dbg !17
+  %46 = fpext <2 x bfloat> %26 to <2 x float>, !dbg !18
+  %47 = fmul <2 x float> %46, %46, !dbg !19
+  %48 = fpext <2 x bfloat> %39 to <2 x float>, !dbg !18
+  %49 = fmul <2 x float> %48, %48, !dbg !19
+  %50 = fadd <2 x float> %47, %49, !dbg !20
+  %51 = fpext <2 x bfloat> %28 to <2 x float>, !dbg !18
+  %52 = fmul <2 x float> %51, %51, !dbg !19
+  %53 = fpext <2 x bfloat> %41 to <2 x float>, !dbg !18
+  %54 = fmul <2 x float> %53, %53, !dbg !19
+  %55 = fadd <2 x float> %52, %54, !dbg !20
+  %56 = fpext <2 x bfloat> %30 to <2 x float>, !dbg !18
+  %57 = fmul <2 x float> %56, %56, !dbg !19
+  %58 = fpext <2 x bfloat> %43 to <2 x float>, !dbg !18
+  %59 = fmul <2 x float> %58, %58, !dbg !19
+  %60 = fadd <2 x float> %57, %59, !dbg !20
+  %61 = fpext <2 x bfloat> %32 to <2 x float>, !dbg !18
+  %62 = fmul <2 x float> %61, %61, !dbg !19
+  %63 = fpext <2 x bfloat> %45 to <2 x float>, !dbg !18
+  %64 = fmul <2 x float> %63, %63, !dbg !19
+  %65 = fadd <2 x float> %62, %64, !dbg !20
+  %66 = and i32 %9, 63, !dbg !9
+  %67 = or disjoint i32 %8, %66, !dbg !10
+  %shift = shufflevector <2 x float> %50, <2 x float> poison, <2 x i32> <i32 1, i32 poison>, !dbg !21
+  %foldExtExtBinop = fadd <2 x float> %50, %shift, !dbg !21
+  %foldExtExtBinop9 = fadd <2 x float> %55, %foldExtExtBinop, !dbg !21
+  %shift11 = shufflevector <2 x float> %55, <2 x float> poison, <2 x i32> <i32 1, i32 poison>, !dbg !21
+  %foldExtExtBinop12 = fadd <2 x float> %shift11, %foldExtExtBinop9, !dbg !21
+  %foldExtExtBinop14 = fadd <2 x float> %60, %foldExtExtBinop12, !dbg !21
+  %shift16 = shufflevector <2 x float> %60, <2 x float> poison, <2 x i32> <i32 1, i32 poison>, !dbg !21
+  %foldExtExtBinop17 = fadd <2 x float> %shift16, %foldExtExtBinop14, !dbg !21
+  %foldExtExtBinop19 = fadd <2 x float> %65, %foldExtExtBinop17, !dbg !21
+  %shift21 = shufflevector <2 x float> %65, <2 x float> poison, <2 x i32> <i32 1, i32 poison>, !dbg !21
+  %foldExtExtBinop22 = fadd <2 x float> %shift21, %foldExtExtBinop19, !dbg !21
+  %68 = extractelement <2 x float> %foldExtExtBinop22, i64 0, !dbg !21
+  %69 = bitcast float %68 to i32, !dbg !24
+  %70 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %69, i32 4, i32 31), !dbg !24
+  %71 = bitcast i32 %70 to float, !dbg !24
+  %72 = fadd float %68, %71, !dbg !21
+  %73 = bitcast float %72 to i32, !dbg !24
+  %74 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %73, i32 2, i32 31), !dbg !24
+  %75 = bitcast i32 %74 to float, !dbg !24
+  %76 = fadd float %72, %75, !dbg !21
+  %77 = bitcast float %76 to i32, !dbg !24
+  %78 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %77, i32 1, i32 31), !dbg !24
+  %79 = bitcast i32 %78 to float, !dbg !24
+  %80 = fadd float %76, %79, !dbg !21
+  %81 = lshr exact i32 %10, 1, !dbg !27
+  %82 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %81, !dbg !27
+  store float %80, ptr addrspace(3) %82, align 4, !dbg !27
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !27
+  %83 = shl nuw nsw i32 %66, 2, !dbg !27
+  %84 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %83, !dbg !27
+  %85 = load i32, ptr addrspace(3) %84, align 4, !dbg !27
+  %86 = sext i32 %67 to i64, !dbg !28
+  %87 = getelementptr float, ptr addrspace(1) %1, i64 %86, !dbg !28
+  %88 = and i32 %9, 448, !dbg !29
+  %89 = icmp eq i32 %88, 0, !dbg !29
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %85, ptr addrspace(1) %87, i1 %89) #4, !dbg !29
+  ret void, !dbg !30
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
+
+attributes #0 = { nounwind "nvvm.reqntid"="512" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { convergent nocallback nounwind }
+attributes #4 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm_view_0", linkageName: "triton_red_fused__fused_rms_norm_view_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 23, column: 28, scope: !4)
+!8 = !DILocation(line: 23, column: 33, scope: !4)
+!9 = !DILocation(line: 24, column: 44, scope: !4)
+!10 = !DILocation(line: 24, column: 23, scope: !4)
+!11 = !DILocation(line: 26, column: 37, scope: !4)
+!12 = !DILocation(line: 29, column: 19, scope: !4)
+!13 = !DILocation(line: 28, column: 19, scope: !4)
+!14 = !DILocation(line: 38, column: 45, scope: !4)
+!15 = !DILocation(line: 38, column: 56, scope: !4)
+!16 = !DILocation(line: 38, column: 34, scope: !4)
+!17 = !DILocation(line: 38, column: 61, scope: !4)
+!18 = !DILocation(line: 38, column: 115, scope: !4)
+!19 = !DILocation(line: 40, column: 22, scope: !4)
+!20 = !DILocation(line: 42, column: 23, scope: !4)
+!21 = !DILocation(line: 263, column: 15, scope: !22, inlinedAt: !24)
+!22 = distinct !DILexicalBlockFile(scope: !4, file: !23, discriminator: 0)
+!23 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!24 = !DILocation(line: 293, column: 36, scope: !22, inlinedAt: !25)
+!25 = !DILocation(line: 44, column: 25, scope: !26)
+!26 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0)
+!27 = !DILocation(line: 44, column: 28, scope: !4)
+!28 = !DILocation(line: 45, column: 25, scope: !4)
+!29 = !DILocation(line: 45, column: 36, scope: !4)
+!30 = !DILocation(line: 45, column: 4, scope: !4)
diff --git a/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.ptx b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..c03d0e81c89d7deb942021b08244255d56f89b37
--- /dev/null
+++ b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.ptx
@@ -0,0 +1,557 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused__fused_rms_norm_view_0 // -- Begin function triton_red_fused__fused_rms_norm_view_0
+.extern .shared .align 16 .b8 global_smem[];
+                                        // @triton_red_fused__fused_rms_norm_view_0
+.visible .entry triton_red_fused__fused_rms_norm_view_0(
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_1,
+	.param .u32 triton_red_fused__fused_rms_norm_view_0_param_2,
+	.param .u32 triton_red_fused__fused_rms_norm_view_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_4,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_5
+)
+.reqntid 512
+{
+	.reg .pred 	%p<3>;
+	.reg .b16 	%rs<17>;
+	.reg .b32 	%r<81>;
+	.reg .b64 	%rd<8>;
+	.loc	1 18 0                          // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd6, [triton_red_fused__fused_rms_norm_view_0_param_0];
+	ld.param.b64 	%rd7, [triton_red_fused__fused_rms_norm_view_0_param_1];
+$L__tmp0:
+	.loc	1 23 28                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:23:28
+	mov.u32 	%r11, %ctaid.x;
+	.loc	1 23 33                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:23:33
+	shl.b32 	%r12, %r11, 6;
+	.loc	1 24 44                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:44
+	mov.u32 	%r13, %tid.x;
+	and.b32 	%r14, %r13, 504;
+	bfe.u32 	%r15, %r13, 3, 6;
+	.loc	1 24 23                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:23
+	or.b32 	%r16, %r15, %r12;
+	.loc	1 26 37                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:26:37
+	shl.b32 	%r17, %r13, 3;
+	and.b32 	%r18, %r17, 56;
+	.loc	1 29 19                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:29:19
+	bfe.s32 	%r19, %r11, 25, 1;
+	shr.u32 	%r20, %r19, 27;
+	add.s32 	%r21, %r16, %r20;
+	shr.u32 	%r22, %r21, 5;
+	.loc	1 28 19                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:28:19
+	and.b32 	%r23, %r21, 33554400;
+	sub.s32 	%r24, %r16, %r23;
+	.loc	1 38 45                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:45
+	shl.b32 	%r25, %r24, 7;
+	or.b32 	%r26, %r25, %r18;
+	mad.lo.s32 	%r27, %r22, 12288, %r26;
+	.loc	1 38 34                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:34
+	mad.wide.s32 	%rd1, %r27, 2, %rd6;
+	.loc	1 38 61                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:61
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r5, 0;
+	mov.pred 	%p1, -1;
+	// begin inline asm
+	mov.u32 %r1, %r5;
+	mov.u32 %r2, %r5;
+	mov.u32 %r3, %r5;
+	mov.u32 %r4, %r5;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	.loc	1 38 34                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:34
+	add.s64 	%rd3, %rd1, 128;
+	.loc	1 38 61                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:61
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd4, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r6, %r5;
+	mov.u32 %r7, %r5;
+	mov.u32 %r8, %r5;
+	mov.u32 %r9, %r5;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd3 + 0 ], %rd4;
+	// end inline asm
+	.loc	1 38 115                        // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115
+	mov.b32 	{%rs1, %rs2}, %r1;
+	cvt.f32.bf16 	%r28, %rs1;
+	cvt.f32.bf16 	%r29, %rs2;
+	mov.b32 	{%rs3, %rs4}, %r6;
+	cvt.f32.bf16 	%r30, %rs4;
+	cvt.f32.bf16 	%r31, %rs3;
+	.loc	1 40 22                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:40:22
+	mul.f32 	%r32, %r31, %r31;
+	mul.f32 	%r33, %r30, %r30;
+	.loc	1 42 23                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:42:23
+	fma.rn.f32 	%r34, %r29, %r29, %r33;
+	fma.rn.f32 	%r35, %r28, %r28, %r32;
+	.loc	1 38 115                        // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115
+	mov.b32 	{%rs5, %rs6}, %r2;
+	cvt.f32.bf16 	%r36, %rs5;
+	cvt.f32.bf16 	%r37, %rs6;
+	mov.b32 	{%rs7, %rs8}, %r7;
+	cvt.f32.bf16 	%r38, %rs8;
+	cvt.f32.bf16 	%r39, %rs7;
+	.loc	1 40 22                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:40:22
+	mul.f32 	%r40, %r39, %r39;
+	mul.f32 	%r41, %r38, %r38;
+	.loc	1 42 23                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:42:23
+	fma.rn.f32 	%r42, %r37, %r37, %r41;
+	fma.rn.f32 	%r43, %r36, %r36, %r40;
+	.loc	1 38 115                        // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115
+	mov.b32 	{%rs9, %rs10}, %r3;
+	cvt.f32.bf16 	%r44, %rs9;
+	cvt.f32.bf16 	%r45, %rs10;
+	mov.b32 	{%rs11, %rs12}, %r8;
+	cvt.f32.bf16 	%r46, %rs12;
+	cvt.f32.bf16 	%r47, %rs11;
+	.loc	1 40 22                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:40:22
+	mul.f32 	%r48, %r47, %r47;
+	mul.f32 	%r49, %r46, %r46;
+	.loc	1 42 23                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:42:23
+	fma.rn.f32 	%r50, %r45, %r45, %r49;
+	fma.rn.f32 	%r51, %r44, %r44, %r48;
+	.loc	1 38 115                        // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115
+	mov.b32 	{%rs13, %rs14}, %r4;
+	cvt.f32.bf16 	%r52, %rs13;
+	cvt.f32.bf16 	%r53, %rs14;
+	mov.b32 	{%rs15, %rs16}, %r9;
+	cvt.f32.bf16 	%r54, %rs16;
+	cvt.f32.bf16 	%r55, %rs15;
+	.loc	1 40 22                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:40:22
+	mul.f32 	%r56, %r55, %r55;
+	mul.f32 	%r57, %r54, %r54;
+	.loc	1 42 23                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:42:23
+	fma.rn.f32 	%r58, %r53, %r53, %r57;
+	fma.rn.f32 	%r59, %r52, %r52, %r56;
+	.loc	1 24 44                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:44
+	and.b32 	%r60, %r13, 63;
+	.loc	1 24 23                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:23
+	or.b32 	%r61, %r12, %r60;
+$L__tmp1:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	add.f32 	%r62, %r35, %r34;
+	add.f32 	%r63, %r43, %r62;
+	add.f32 	%r64, %r42, %r63;
+	add.f32 	%r65, %r51, %r64;
+	add.f32 	%r66, %r50, %r65;
+	add.f32 	%r67, %r59, %r66;
+	add.f32 	%r68, %r58, %r67;
+$L__tmp2:
+	.loc	2 293 36                        // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ]
+	shfl.sync.bfly.b32 	%r69, %r68, 4, 31, -1;
+$L__tmp3:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	add.f32 	%r70, %r68, %r69;
+$L__tmp4:
+	.loc	2 293 36                        // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ]
+	shfl.sync.bfly.b32 	%r71, %r70, 2, 31, -1;
+$L__tmp5:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	add.f32 	%r72, %r70, %r71;
+$L__tmp6:
+	.loc	2 293 36                        // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ]
+	shfl.sync.bfly.b32 	%r73, %r72, 1, 31, -1;
+$L__tmp7:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	add.f32 	%r74, %r72, %r73;
+$L__tmp8:
+	.loc	1 44 28                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:28
+	shr.u32 	%r75, %r14, 1;
+	mov.b32 	%r76, global_smem;
+	add.s32 	%r77, %r76, %r75;
+	st.shared.b32 	[%r77], %r74;
+	bar.sync 	0;
+	shl.b32 	%r78, %r60, 2;
+	add.s32 	%r79, %r76, %r78;
+	ld.shared.b32 	%r10, [%r79];
+	.loc	1 45 25                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:25
+	mad.wide.s32 	%rd5, %r61, 4, %rd7;
+	.loc	1 45 36                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:36
+	and.b32 	%r80, %r13, 448;
+	setp.eq.b32 	%p2, %r80, 0;
+	// begin inline asm
+	@%p2 st.global.b32 [ %rd5 + 0 ], { %r10 };
+	// end inline asm
+	.loc	1 45 4                          // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:4
+	ret;
+$L__tmp9:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 5                                   // DW_FORM_data2
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 339                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x14c DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 119
+.b8 118
+.b8 121
+.b8 116
+.b8 52
+.b8 50
+.b8 55
+.b8 51
+.b8 105
+.b8 117
+.b8 51
+.b8 51
+.b8 109
+.b8 112
+.b8 101
+.b8 101
+.b8 55
+.b8 104
+.b8 98
+.b8 101
+.b8 116
+.b8 53
+.b8 106
+.b8 53
+.b8 101
+.b8 113
+.b8 52
+.b8 52
+.b8 100
+.b8 54
+.b8 102
+.b8 115
+.b8 104
+.b8 103
+.b8 119
+.b8 107
+.b8 121
+.b8 120
+.b8 107
+.b8 110
+.b8 53
+.b8 50
+.b8 103
+.b8 103
+.b8 103
+.b8 107
+.b8 105
+.b8 113
+.b8 104
+.b8 106
+.b8 53
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 119
+.b8 118
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x2a DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 114
+.b8 109
+.b8 115
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 118
+.b8 105
+.b8 101
+.b8 119
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x10e:0x48 DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x123:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp8                           // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 44                                  // DW_AT_call_line
+.b8 25                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x13b:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp8                           // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.source b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..2a7953b6cf9c33c70bd81db34013741eeb793d5f
--- /dev/null
+++ b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.source
@@ -0,0 +1,167 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0)
+#loc35 = loc(unknown)
+#loc38 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0)
+#loc42 = loc("in_ptr0"(#loc))
+#loc43 = loc("out_ptr0"(#loc))
+#loc44 = loc("xnumel"(#loc))
+#loc45 = loc("r0_numel"(#loc))
+#loc74 = loc("input"(#loc33))
+#loc75 = loc("a"(#loc38))
+#loc76 = loc("b"(#loc38))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 8192 : i32 loc(#loc46)
+    %r0_numel_1 = arith.constant 128 : i32 loc(#loc47)
+    %xoffset = tt.get_program_id x : i32 loc(#loc48)
+    %xoffset_2 = arith.constant 64 : i32 loc(#loc49)
+    %xoffset_3 = arith.constant 64 : i32 loc(#loc49)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc49)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc50)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc51)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc52)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc52)
+    %xmask = arith.constant true loc(#loc53)
+    %xmask_8 = arith.constant dense<true> : tensor<64x64xi1> loc(#loc53)
+    %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc54)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc55)
+    %x0 = arith.constant 32 : i32 loc(#loc56)
+    %x0_10 = arith.constant 32 : i32 loc(#loc56)
+    %x0_11 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc56)
+    %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc56)
+    %x1 = arith.constant 32 : i32 loc(#loc57)
+    %x1_13 = arith.constant 32 : i32 loc(#loc57)
+    %x1_14 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc57)
+    %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc57)
+    %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc58)
+    %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc58)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc14)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc14)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14)
+    %2 = arith.bitcast %c64_i32 : i32 to i32 loc(#loc14)
+    %3 = ub.poison : i32 loc(#loc14)
+    %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_19 = %_tmp4_16) -> (tensor<64x64xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc60)
+      %r0_index_20 = arith.addi %r0_index, %r0_base_9 : tensor<1x64xi32> loc(#loc60)
+      %r0_mask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc61)
+      %r0_mask_21 = arith.cmpi slt, %r0_index_20, %r0_mask : tensor<1x64xi32> loc(#loc61)
+      %tmp0 = arith.constant 128 : i32 loc(#loc62)
+      %tmp0_22 = arith.constant 128 : i32 loc(#loc62)
+      %tmp0_23 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc62)
+      %tmp0_24 = arith.muli %tmp0_23, %x0_12 : tensor<64x1xi32> loc(#loc62)
+      %tmp0_25 = tt.broadcast %r0_index_20 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc63)
+      %tmp0_26 = tt.broadcast %tmp0_24 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc63)
+      %tmp0_27 = arith.addi %tmp0_25, %tmp0_26 : tensor<64x64xi32> loc(#loc63)
+      %tmp0_28 = arith.constant 12288 : i32 loc(#loc64)
+      %tmp0_29 = arith.constant 12288 : i32 loc(#loc64)
+      %tmp0_30 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc64)
+      %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<64x1xi32> loc(#loc64)
+      %tmp0_32 = tt.broadcast %tmp0_31 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc65)
+      %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<64x64xi32> loc(#loc65)
+      %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc66)
+      %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc66)
+      %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc67)
+      %tmp0_37 = tt.broadcast %r0_mask_21 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc67)
+      %tmp0_38 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc67)
+      %tmp0_39 = arith.truncf %tmp0_38 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc67)
+      %tmp0_40 = tt.load %tmp0_35, %tmp0_37, %tmp0_39 evictionPolicy = evict_first : tensor<64x64x!tt.ptr<bf16>> loc(#loc67)
+      %tmp0_41 = arith.extf %tmp0_40 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc68)
+      %tmp2 = arith.mulf %tmp0_41, %tmp0_41 : tensor<64x64xf32> loc(#loc69)
+      %tmp5 = arith.addf %_tmp4_19, %tmp2 : tensor<64x64xf32> loc(#loc70)
+      %_tmp4_42 = tt.broadcast %r0_mask_21 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc71)
+      %_tmp4_43 = arith.select %_tmp4_42, %tmp5, %_tmp4_19 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc71)
+      scf.yield %_tmp4_43 : tensor<64x64xf32> loc(#loc27)
+    } loc(#loc59)
+    %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc72)
+    %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc73)
+    %4 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc30)
+    %5 = tt.addptr %4, %xindex_7 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc30)
+    tt.store %5, %tmp4_18 : tensor<64x1x!tt.ptr<f32>> loc(#loc31)
+    tt.return loc(#loc32)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.sum__fp32S64_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x64xf32> loc("input"(#loc33))) -> tensor<64xf32> attributes {noinline = false} {
+    %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc34)
+      tt.reduce.return %2 : f32 loc(#loc34)
+    }) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc34)
+    tt.return %0 : tensor<64xf32> loc(#loc36)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<64xf32> loc(#loc37)
+    tt.return %1 : tensor<64xf32> loc(#loc37)
+  } loc(#loc33)
+  tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc38)), %b: f32 loc("b"(#loc38))) -> f32 attributes {noinline = false} {
+    %0 = arith.addf %a, %b : f32 loc(#loc39)
+    tt.return %0 : f32 loc(#loc40)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : f32 loc(#loc41)
+    tt.return %1 : f32 loc(#loc41)
+  } loc(#loc38)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":25:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":30:43)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:8)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11)
+#loc37 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4)
+#loc39 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc40 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11)
+#loc41 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4)
+#loc46 = loc("xnumel"(#loc1))
+#loc47 = loc("r0_numel"(#loc2))
+#loc48 = loc("xoffset"(#loc3))
+#loc49 = loc("xoffset"(#loc4))
+#loc50 = loc("xindex"(#loc5))
+#loc51 = loc("xindex"(#loc6))
+#loc52 = loc("xindex"(#loc7))
+#loc53 = loc("xmask"(#loc8))
+#loc54 = loc("r0_base"(#loc9))
+#loc55 = loc("r0_base"(#loc10))
+#loc56 = loc("x0"(#loc11))
+#loc57 = loc("x1"(#loc12))
+#loc58 = loc("_tmp4"(#loc13))
+#loc59 = loc("_tmp4"(#loc14))
+#loc60 = loc("r0_index"(#loc15))
+#loc61 = loc("r0_mask"(#loc16))
+#loc62 = loc("tmp0"(#loc17))
+#loc63 = loc("tmp0"(#loc18))
+#loc64 = loc("tmp0"(#loc19))
+#loc65 = loc("tmp0"(#loc20))
+#loc66 = loc("tmp0"(#loc21))
+#loc67 = loc("tmp0"(#loc22))
+#loc68 = loc("tmp0"(#loc23))
+#loc69 = loc("tmp2"(#loc24))
+#loc70 = loc("tmp5"(#loc25))
+#loc71 = loc("_tmp4"(#loc26))
+#loc72 = loc("tmp4"(#loc28))
+#loc73 = loc("tmp4"(#loc29))
diff --git a/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.ttgir b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..255ea11b76b6462c29eaa54e156207acf664e4c7
--- /dev/null
+++ b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.ttgir
@@ -0,0 +1,120 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [16, 1], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 8], order = [0, 1]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0)
+#loc1 = loc(unknown)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25)
+#loc30 = loc("in_ptr0"(#loc))
+#loc31 = loc("out_ptr0"(#loc))
+#loc32 = loc("xnumel"(#loc))
+#loc33 = loc("r0_numel"(#loc))
+#loc54 = loc("tmp4"(#loc24))
+#loc57 = loc(callsite(#loc1 at #loc54))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<128> : tensor<1x64xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<12288> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x64xbf16, #blocked> loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc34)
+    %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc35)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc36)
+    %xindex_6 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc36)
+    %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc36)
+    %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc36)
+    %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked> loc(#loc37)
+    %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc37)
+    %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<64x1xi32, #blocked> loc(#loc37)
+    %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<64x1xi32, #blocked1> loc(#loc37)
+    %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc38)
+    %r0_base_13 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> loc(#loc38)
+    %x0 = arith.remsi %xindex_11, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc39)
+    %x1 = arith.divsi %xindex_11, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc40)
+    %tmp0 = arith.muli %x0, %cst_0 : tensor<64x1xi32, #blocked> loc(#loc41)
+    %tmp0_14 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc42)
+    %tmp0_15 = arith.muli %x1, %cst_1 : tensor<64x1xi32, #blocked> loc(#loc43)
+    %tmp0_16 = tt.broadcast %tmp0_15 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc44)
+    %tmp0_17 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>, #blocked> loc(#loc45)
+    %_tmp4 = scf.for %_tmp4_20 = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%arg5 = %cst_4) -> (tensor<64x64xf32, #blocked>)  : i32 {
+      %r0_index = tt.splat %_tmp4_20 : i32 -> tensor<1x64xi32, #blocked> loc(#loc47)
+      %r0_index_21 = arith.addi %r0_index, %r0_base_13 : tensor<1x64xi32, #blocked> loc(#loc47)
+      %r0_mask = arith.cmpi slt, %r0_index_21, %cst : tensor<1x64xi32, #blocked> loc(#loc48)
+      %tmp0_22 = tt.broadcast %r0_index_21 : tensor<1x64xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc42)
+      %tmp0_23 = arith.addi %tmp0_22, %tmp0_14 : tensor<64x64xi32, #blocked> loc(#loc42)
+      %tmp0_24 = arith.addi %tmp0_23, %tmp0_16 : tensor<64x64xi32, #blocked> loc(#loc44)
+      %tmp0_25 = tt.addptr %tmp0_17, %tmp0_24 : tensor<64x64x!tt.ptr<bf16>, #blocked>, tensor<64x64xi32, #blocked> loc(#loc45)
+      %tmp0_26 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc49)
+      %tmp0_27 = tt.load %tmp0_25, %tmp0_26, %cst_3 evictionPolicy = evict_first : tensor<64x64x!tt.ptr<bf16>, #blocked> loc(#loc49)
+      %tmp0_28 = arith.extf %tmp0_27 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc50)
+      %tmp2 = arith.mulf %tmp0_28, %tmp0_28 : tensor<64x64xf32, #blocked> loc(#loc51)
+      %tmp5 = arith.addf %arg5, %tmp2 : tensor<64x64xf32, #blocked> loc(#loc52)
+      %_tmp4_29 = arith.select %tmp0_26, %tmp5, %arg5 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked> loc(#loc53)
+      scf.yield %_tmp4_29 : tensor<64x64xf32, #blocked> loc(#loc22)
+    } loc(#loc46)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_20: f32 loc(callsite(#loc1 at #loc54)), %tmp4_21: f32 loc(callsite(#loc1 at #loc54))):
+      %tmp4_22 = arith.addf %tmp4_20, %tmp4_21 : f32 loc(#loc58)
+      tt.reduce.return %tmp4_22 : f32 loc(#loc56)
+    }) : (tensor<64x64xf32, #blocked>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc56)
+    %tmp4_18 = ttg.convert_layout %tmp4 : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc55)
+    %tmp4_19 = tt.expand_dims %tmp4_18 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc55)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>, #blocked1> loc(#loc27)
+    %1 = tt.addptr %0, %xindex_12 : tensor<64x1x!tt.ptr<f32>, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc27)
+    tt.store %1, %tmp4_19 : tensor<64x1x!tt.ptr<f32>, #blocked1> loc(#loc28)
+    tt.return loc(#loc29)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:8)
+#loc23 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4)
+#loc34 = loc("xoffset"(#loc2))
+#loc35 = loc("xoffset"(#loc3))
+#loc36 = loc("xindex"(#loc4))
+#loc37 = loc("xindex"(#loc5))
+#loc38 = loc("r0_base"(#loc6))
+#loc39 = loc("x0"(#loc7))
+#loc40 = loc("x1"(#loc8))
+#loc41 = loc("tmp0"(#loc9))
+#loc42 = loc("tmp0"(#loc10))
+#loc43 = loc("tmp0"(#loc11))
+#loc44 = loc("tmp0"(#loc12))
+#loc45 = loc("tmp0"(#loc13))
+#loc46 = loc("_tmp4"(#loc14))
+#loc47 = loc("r0_index"(#loc15))
+#loc48 = loc("r0_mask"(#loc16))
+#loc49 = loc("tmp0"(#loc17))
+#loc50 = loc("tmp0"(#loc18))
+#loc51 = loc("tmp2"(#loc19))
+#loc52 = loc("tmp5"(#loc20))
+#loc53 = loc("_tmp4"(#loc21))
+#loc55 = loc("tmp4"(#loc26))
+#loc56 = loc(callsite(#loc23 at #loc54))
+#loc58 = loc(callsite(#loc25 at #loc56))
diff --git a/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.ttir b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..909ab5dbd9c02d30c2f1beef4288872b0157a567
--- /dev/null
+++ b/triton/AMWB5VPXRXQ5ZMIVHHFYPDAG3XY3APJNWGYSS2Y5GCX6KVQVJGEQ/triton_red_fused__fused_rms_norm_view_0.ttir
@@ -0,0 +1,114 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0)
+#loc1 = loc(unknown)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25)
+#loc31 = loc("in_ptr0"(#loc))
+#loc32 = loc("out_ptr0"(#loc))
+#loc33 = loc("xnumel"(#loc))
+#loc34 = loc("r0_numel"(#loc))
+#loc56 = loc("tmp4"(#loc25))
+#loc59 = loc(callsite(#loc1 at #loc56))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<64x64xbf16> loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc2)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc2)
+    %cst_0 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc1)
+    %cst_1 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1)
+    %cst_2 = arith.constant dense<128> : tensor<1x64xi32> loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc35)
+    %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc36)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc37)
+    %xindex_6 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc38)
+    %xindex_7 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32> loc(#loc39)
+    %xindex_8 = arith.addi %xindex_7, %xindex_6 : tensor<64x1xi32> loc(#loc39)
+    %r0_base = tt.expand_dims %xindex {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc40)
+    %x0 = arith.remsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc41)
+    %x1 = arith.divsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc42)
+    %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%_tmp4_10 = %cst_3) -> (tensor<64x64xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc44)
+      %r0_index_11 = arith.addi %r0_index, %r0_base : tensor<1x64xi32> loc(#loc44)
+      %r0_mask = arith.cmpi slt, %r0_index_11, %cst_2 : tensor<1x64xi32> loc(#loc45)
+      %tmp0 = arith.muli %x0, %cst_1 : tensor<64x1xi32> loc(#loc46)
+      %tmp0_12 = tt.broadcast %r0_index_11 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc47)
+      %tmp0_13 = tt.broadcast %tmp0 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc47)
+      %tmp0_14 = arith.addi %tmp0_12, %tmp0_13 : tensor<64x64xi32> loc(#loc47)
+      %tmp0_15 = arith.muli %x1, %cst_0 : tensor<64x1xi32> loc(#loc48)
+      %tmp0_16 = tt.broadcast %tmp0_15 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc49)
+      %tmp0_17 = arith.addi %tmp0_14, %tmp0_16 : tensor<64x64xi32> loc(#loc49)
+      %tmp0_18 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc50)
+      %tmp0_19 = tt.addptr %tmp0_18, %tmp0_17 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc50)
+      %tmp0_20 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc51)
+      %tmp0_21 = tt.load %tmp0_19, %tmp0_20, %cst evictionPolicy = evict_first : tensor<64x64x!tt.ptr<bf16>> loc(#loc51)
+      %tmp0_22 = arith.extf %tmp0_21 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc52)
+      %tmp2 = arith.mulf %tmp0_22, %tmp0_22 : tensor<64x64xf32> loc(#loc53)
+      %tmp5 = arith.addf %_tmp4_10, %tmp2 : tensor<64x64xf32> loc(#loc54)
+      %_tmp4_23 = arith.select %tmp0_20, %tmp5, %_tmp4_10 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc55)
+      scf.yield %_tmp4_23 : tensor<64x64xf32> loc(#loc23)
+    } loc(#loc43)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_10: f32 loc(callsite(#loc1 at #loc56)), %tmp4_11: f32 loc(callsite(#loc1 at #loc56))):
+      %tmp4_12 = arith.addf %tmp4_10, %tmp4_11 : f32 loc(#loc60)
+      tt.reduce.return %tmp4_12 : f32 loc(#loc58)
+    }) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc58)
+    %tmp4_9 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc57)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc28)
+    %1 = tt.addptr %0, %xindex_8 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc28)
+    tt.store %1, %tmp4_9 : tensor<64x1x!tt.ptr<f32>> loc(#loc29)
+    tt.return loc(#loc30)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":32:43)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":33:31)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:8)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc26 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4)
+#loc35 = loc("xoffset"(#loc3))
+#loc36 = loc("xoffset"(#loc4))
+#loc37 = loc("xindex"(#loc5))
+#loc38 = loc("xindex"(#loc6))
+#loc39 = loc("xindex"(#loc7))
+#loc40 = loc("r0_base"(#loc8))
+#loc41 = loc("x0"(#loc9))
+#loc42 = loc("x1"(#loc10))
+#loc43 = loc("_tmp4"(#loc2))
+#loc44 = loc("r0_index"(#loc11))
+#loc45 = loc("r0_mask"(#loc12))
+#loc46 = loc("tmp0"(#loc13))
+#loc47 = loc("tmp0"(#loc14))
+#loc48 = loc("tmp0"(#loc15))
+#loc49 = loc("tmp0"(#loc16))
+#loc50 = loc("tmp0"(#loc17))
+#loc51 = loc("tmp0"(#loc18))
+#loc52 = loc("tmp0"(#loc19))
+#loc53 = loc("tmp2"(#loc20))
+#loc54 = loc("tmp5"(#loc21))
+#loc55 = loc("_tmp4"(#loc22))
+#loc57 = loc("tmp4"(#loc27))
+#loc58 = loc(callsite(#loc24 at #loc56))
+#loc60 = loc(callsite(#loc26 at #loc58))
diff --git a/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..4b2bbd2d1a6fccfd3d25ff93d6b5a32b48f6f085
--- /dev/null
+++ b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json"}}
\ No newline at end of file
diff --git a/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..d6fd60642c7725f162e1ad21c3e7dd4beba88666
Binary files /dev/null and b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin differ
diff --git a/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..67b189f832459795e1bef3a08aa031f33f933750
--- /dev/null
+++ b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
@@ -0,0 +1 @@
+{"hash": "043651654ec2bab0baff8ac3035fb3ffe2a4719f91333c269be422e8492d5bfa", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 1024, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0"}
\ No newline at end of file
diff --git a/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..7a26bef63ab64db75114165032a7066765f0a56f
--- /dev/null
+++ b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir
@@ -0,0 +1,908 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 {
+  %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8
+  %13 = shl nuw i32 %12, 1, !dbg !9
+  %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %15 = and i32 %14, 32, !dbg !10
+  %.not = icmp eq i32 %15, 0, !dbg !10
+  %.lobit = lshr exact i32 %15, 5, !dbg !10
+  %16 = and i32 %14, 1, !dbg !10
+  %.not1 = icmp eq i32 %16, 0, !dbg !10
+  %17 = or disjoint i32 %.lobit, %13, !dbg !11
+  %18 = or disjoint i32 %13, %16, !dbg !11
+  %19 = and i32 %14, 31, !dbg !12
+  %20 = shl nuw nsw i32 %19, 2, !dbg !12
+  %21 = shl nuw nsw i32 %14, 1, !dbg !12
+  %22 = and i32 %21, 126, !dbg !12
+  %23 = and i32 %14, 62, !dbg !12
+  %24 = lshr i32 %14, 1, !dbg !12
+  %25 = sdiv i32 %17, 32, !dbg !13
+  %26 = mul i32 %25, 32, !dbg !14
+  %.decomposed = sub i32 %17, %26, !dbg !14
+  %27 = sdiv i32 %18, 32, !dbg !13
+  %28 = or disjoint i32 %20, 4096, !dbg !15
+  %29 = shl nsw i32 %.decomposed, 7, !dbg !16
+  %30 = add nsw i32 %28, %29, !dbg !17
+  %31 = mul i32 %25, 36864, !dbg !18
+  %32 = add i32 %30, %31, !dbg !19
+  %33 = sext i32 %32 to i64, !dbg !20
+  %34 = getelementptr bfloat, ptr addrspace(1) %2, i64 %33, !dbg !20
+  %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !21
+  %36 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %34, i64 %35, i1 true) #6, !dbg !21
+  %37 = extractvalue { i32, i32 } %36, 0, !dbg !21
+  %38 = bitcast i32 %37 to <2 x bfloat>, !dbg !21
+  %39 = extractvalue { i32, i32 } %36, 1, !dbg !21
+  %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !21
+  %41 = extractelement <2 x bfloat> %38, i64 0, !dbg !21
+  %42 = extractelement <2 x bfloat> %38, i64 1, !dbg !21
+  %43 = extractelement <2 x bfloat> %40, i64 0, !dbg !21
+  %44 = extractelement <2 x bfloat> %40, i64 1, !dbg !21
+  %45 = fpext bfloat %41 to float, !dbg !22
+  %46 = fpext bfloat %42 to float, !dbg !22
+  %47 = fpext bfloat %43 to float, !dbg !22
+  %48 = fpext bfloat %44 to float, !dbg !22
+  %49 = or disjoint i32 %29, %20, !dbg !23
+  %50 = add i32 %49, %31, !dbg !24
+  %51 = sext i32 %50 to i64, !dbg !25
+  %52 = getelementptr bfloat, ptr addrspace(1) %2, i64 %51, !dbg !25
+  %53 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !26
+  %54 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %52, i64 %53, i1 true) #6, !dbg !26
+  %55 = extractvalue { i32, i32 } %54, 0, !dbg !26
+  %56 = bitcast i32 %55 to <2 x bfloat>, !dbg !26
+  %57 = extractvalue { i32, i32 } %54, 1, !dbg !26
+  %58 = bitcast i32 %57 to <2 x bfloat>, !dbg !26
+  %59 = extractelement <2 x bfloat> %56, i64 0, !dbg !26
+  %60 = extractelement <2 x bfloat> %56, i64 1, !dbg !26
+  %61 = extractelement <2 x bfloat> %58, i64 0, !dbg !26
+  %62 = extractelement <2 x bfloat> %58, i64 1, !dbg !26
+  %63 = fpext bfloat %59 to float, !dbg !27
+  %64 = fpext bfloat %60 to float, !dbg !27
+  %65 = fpext bfloat %61 to float, !dbg !27
+  %66 = fpext bfloat %62 to float, !dbg !27
+  %67 = fmul float %45, %45, !dbg !28
+  %68 = fmul float %46, %46, !dbg !28
+  %69 = fmul float %47, %47, !dbg !28
+  %70 = fmul float %48, %48, !dbg !28
+  %71 = fmul float %63, %63, !dbg !29
+  %72 = fmul float %64, %64, !dbg !29
+  %73 = fmul float %65, %65, !dbg !29
+  %74 = fmul float %66, %66, !dbg !29
+  %75 = fadd float %67, %68, !dbg !30
+  %76 = fadd float %69, %75, !dbg !30
+  %77 = fadd float %70, %76, !dbg !30
+  %78 = bitcast float %77 to i32, !dbg !33
+  %79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 16, i32 31), !dbg !33
+  %80 = bitcast i32 %79 to float, !dbg !33
+  %81 = fadd float %77, %80, !dbg !30
+  %82 = bitcast float %81 to i32, !dbg !33
+  %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 8, i32 31), !dbg !33
+  %84 = bitcast i32 %83 to float, !dbg !33
+  %85 = fadd float %81, %84, !dbg !30
+  %86 = bitcast float %85 to i32, !dbg !33
+  %87 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 4, i32 31), !dbg !33
+  %88 = bitcast i32 %87 to float, !dbg !33
+  %89 = fadd float %85, %88, !dbg !30
+  %90 = bitcast float %89 to i32, !dbg !33
+  %91 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %90, i32 2, i32 31), !dbg !33
+  %92 = bitcast i32 %91 to float, !dbg !33
+  %93 = fadd float %89, %92, !dbg !30
+  %94 = bitcast float %93 to i32, !dbg !33
+  %95 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %94, i32 1, i32 31), !dbg !33
+  %96 = bitcast i32 %95 to float, !dbg !33
+  %97 = fadd float %93, %96, !dbg !30
+  %98 = fadd float %71, %72, !dbg !36
+  %99 = fadd float %73, %98, !dbg !36
+  %100 = fadd float %74, %99, !dbg !36
+  %101 = bitcast float %100 to i32, !dbg !37
+  %102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 16, i32 31), !dbg !37
+  %103 = bitcast i32 %102 to float, !dbg !37
+  %104 = fadd float %100, %103, !dbg !36
+  %105 = bitcast float %104 to i32, !dbg !37
+  %106 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %105, i32 8, i32 31), !dbg !37
+  %107 = bitcast i32 %106 to float, !dbg !37
+  %108 = fadd float %104, %107, !dbg !36
+  %109 = bitcast float %108 to i32, !dbg !37
+  %110 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %109, i32 4, i32 31), !dbg !37
+  %111 = bitcast i32 %110 to float, !dbg !37
+  %112 = fadd float %108, %111, !dbg !36
+  %113 = bitcast float %112 to i32, !dbg !37
+  %114 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %113, i32 2, i32 31), !dbg !37
+  %115 = bitcast i32 %114 to float, !dbg !37
+  %116 = fadd float %112, %115, !dbg !36
+  %117 = bitcast float %116 to i32, !dbg !37
+  %118 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %117, i32 1, i32 31), !dbg !37
+  %119 = bitcast i32 %118 to float, !dbg !37
+  %120 = fadd float %116, %119, !dbg !36
+  %121 = and i32 %24, 1, !dbg !39
+  %122 = zext nneg i32 %22 to i64, !dbg !40
+  %123 = getelementptr bfloat, ptr addrspace(1) %3, i64 %122, !dbg !40
+  %124 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !41
+  %125 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %123, i64 %124, i1 true) #6, !dbg !41
+  %126 = bitcast i32 %125 to <2 x bfloat>, !dbg !41
+  %127 = fpext <2 x bfloat> %126 to <2 x float>, !dbg !42
+  %128 = shl i32 %25, 7, !dbg !43
+  %129 = or disjoint i32 %128, %20, !dbg !44
+  %130 = sext i32 %129 to i64, !dbg !45
+  %131 = getelementptr float, ptr addrspace(1) %4, i64 %130, !dbg !45
+  %132 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !46
+  %133 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %131, i64 %132, i1 true) #6, !dbg !46
+  %134 = extractvalue { i32, i32, i32, i32 } %133, 0, !dbg !46
+  %135 = extractvalue { i32, i32, i32, i32 } %133, 1, !dbg !46
+  %136 = extractvalue { i32, i32, i32, i32 } %133, 2, !dbg !46
+  %137 = extractvalue { i32, i32, i32, i32 } %133, 3, !dbg !46
+  %138 = bitcast i32 %134 to float, !dbg !46
+  %139 = bitcast i32 %135 to float, !dbg !46
+  %140 = bitcast i32 %136 to float, !dbg !46
+  %141 = bitcast i32 %137 to float, !dbg !46
+  %142 = shl nuw nsw i32 %14, 4, !dbg !46
+  %143 = and i32 %142, 112, !dbg !46
+  %144 = and i32 %14, 24, !dbg !46
+  %145 = lshr exact i32 %144, 1, !dbg !46
+  %146 = select i1 %.not, i32 0, i32 192, !dbg !46
+  %147 = or disjoint i32 %143, %145, !dbg !46
+  %148 = xor i32 %147, %146, !dbg !46
+  %149 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %148, !dbg !46
+  %150 = insertelement <1 x i32> poison, i32 %134, i64 0, !dbg !46
+  store <1 x i32> %150, ptr addrspace(3) %149, align 4, !dbg !46
+  %151 = xor i32 %148, 260, !dbg !46
+  %152 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %151, !dbg !46
+  %153 = insertelement <1 x i32> poison, i32 %135, i64 0, !dbg !46
+  store <1 x i32> %153, ptr addrspace(3) %152, align 4, !dbg !46
+  %154 = xor i32 %148, 520, !dbg !46
+  %155 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %154, !dbg !46
+  %156 = insertelement <1 x i32> poison, i32 %136, i64 0, !dbg !46
+  store <1 x i32> %156, ptr addrspace(3) %155, align 4, !dbg !46
+  %157 = xor i32 %148, 780, !dbg !46
+  %158 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %157, !dbg !46
+  %159 = insertelement <1 x i32> poison, i32 %137, i64 0, !dbg !46
+  store <1 x i32> %159, ptr addrspace(3) %158, align 4, !dbg !46
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46
+  %160 = shl nuw nsw i32 %14, 7, !dbg !46
+  %161 = and i32 %160, 768, !dbg !46
+  %162 = shl nuw nsw i32 %23, 1, !dbg !46
+  %163 = select i1 %.not1, i32 0, i32 192, !dbg !46
+  %164 = xor i32 %163, %162, !dbg !46
+  %165 = or disjoint i32 %164, %161, !dbg !46
+  %166 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %165, !dbg !46
+  %167 = load float, ptr addrspace(3) %166, align 4, !dbg !46
+  %168 = xor i32 %165, 4, !dbg !46
+  %169 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %168, !dbg !46
+  %170 = load float, ptr addrspace(3) %169, align 4, !dbg !46
+  %171 = xor i32 %165, 8, !dbg !46
+  %172 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %171, !dbg !46
+  %173 = load float, ptr addrspace(3) %172, align 4, !dbg !46
+  %174 = xor i32 %165, 12, !dbg !46
+  %175 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %174, !dbg !46
+  %176 = load float, ptr addrspace(3) %175, align 4, !dbg !46
+  %177 = getelementptr float, ptr addrspace(1) %5, i64 %130, !dbg !47
+  %178 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !48
+  %179 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %177, i64 %178, i1 true) #6, !dbg !48
+  %180 = extractvalue { i32, i32, i32, i32 } %179, 0, !dbg !48
+  %181 = extractvalue { i32, i32, i32, i32 } %179, 1, !dbg !48
+  %182 = extractvalue { i32, i32, i32, i32 } %179, 2, !dbg !48
+  %183 = extractvalue { i32, i32, i32, i32 } %179, 3, !dbg !48
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48
+  %184 = insertelement <1 x i32> poison, i32 %180, i64 0, !dbg !48
+  store <1 x i32> %184, ptr addrspace(3) %149, align 4, !dbg !48
+  %185 = insertelement <1 x i32> poison, i32 %181, i64 0, !dbg !48
+  store <1 x i32> %185, ptr addrspace(3) %152, align 4, !dbg !48
+  %186 = insertelement <1 x i32> poison, i32 %182, i64 0, !dbg !48
+  store <1 x i32> %186, ptr addrspace(3) %155, align 4, !dbg !48
+  %187 = insertelement <1 x i32> poison, i32 %183, i64 0, !dbg !48
+  store <1 x i32> %187, ptr addrspace(3) %158, align 4, !dbg !48
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48
+  %188 = load float, ptr addrspace(3) %166, align 4, !dbg !48
+  %189 = load float, ptr addrspace(3) %169, align 4, !dbg !48
+  %190 = load float, ptr addrspace(3) %172, align 4, !dbg !48
+  %191 = load float, ptr addrspace(3) %175, align 4, !dbg !48
+  %192 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !49
+  %193 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %34, i64 %192, i1 true) #6, !dbg !49
+  %194 = getelementptr bfloat, ptr addrspace(1) %6, i64 %122, !dbg !50
+  %195 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !51
+  %196 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %194, i64 %195, i1 true) #6, !dbg !51
+  %197 = icmp eq i32 %121, 0, !dbg !52
+  %198 = and i32 %24, 30, !dbg !53
+  %199 = or disjoint i32 %198, 32, !dbg !53
+  %200 = or disjoint i32 %198, 64, !dbg !53
+  %201 = or disjoint i32 %198, 96, !dbg !53
+  %202 = or disjoint i32 %198, 1, !dbg !54
+  %203 = or disjoint i32 %198, 33, !dbg !54
+  %204 = or disjoint i32 %198, 65, !dbg !54
+  %205 = or disjoint i32 %198, 97, !dbg !54
+  %206 = shl i32 %18, 7, !dbg !55
+  %207 = shl i32 %27, 15, !dbg !55
+  %208 = add i32 %207, %206, !dbg !55
+  %209 = or disjoint i32 %208, %202, !dbg !56
+  %210 = or disjoint i32 %208, %203, !dbg !56
+  %211 = or disjoint i32 %208, %204, !dbg !56
+  %212 = or disjoint i32 %208, %205, !dbg !56
+  %213 = sext i32 %209 to i64, !dbg !57
+  %214 = getelementptr bfloat, ptr addrspace(1) %2, i64 %213, !dbg !57
+  %215 = sext i32 %210 to i64, !dbg !57
+  %216 = getelementptr bfloat, ptr addrspace(1) %2, i64 %215, !dbg !57
+  %217 = sext i32 %211 to i64, !dbg !57
+  %218 = getelementptr bfloat, ptr addrspace(1) %2, i64 %217, !dbg !57
+  %219 = sext i32 %212 to i64, !dbg !57
+  %220 = getelementptr bfloat, ptr addrspace(1) %2, i64 %219, !dbg !57
+  %221 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58
+  %222 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %214, i64 %221, i1 %197) #6, !dbg !58
+  %223 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58
+  %224 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %216, i64 %223, i1 %197) #6, !dbg !58
+  %225 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58
+  %226 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %218, i64 %225, i1 %197) #6, !dbg !58
+  %227 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58
+  %228 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %220, i64 %227, i1 %197) #6, !dbg !58
+  %229 = tail call float @llvm.nvvm.div.full(float %120, float 1.280000e+02), !dbg !59
+  %230 = fadd float %229, 0x3EB0C6F7A0000000, !dbg !60
+  %231 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61
+  %.not.i = icmp eq i32 %231, 0, !dbg !61
+  br i1 %.not.i, label %234, label %232, !dbg !61
+
+232:                                              ; preds = %11
+  %233 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %230), !dbg !61
+  br label %__nv_rsqrtf.exit, !dbg !61
+
+234:                                              ; preds = %11
+  %235 = tail call float @llvm.nvvm.rsqrt.approx.f(float %230), !dbg !61
+  br label %__nv_rsqrtf.exit, !dbg !61
+
+__nv_rsqrtf.exit:                                 ; preds = %232, %234
+  %.0.i = phi float [ %233, %232 ], [ %235, %234 ], !dbg !61
+  %236 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61
+  %237 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61
+  %238 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61
+  %.not.i15 = icmp eq i32 %238, 0, !dbg !61
+  br i1 %.not.i15, label %241, label %239, !dbg !61
+
+239:                                              ; preds = %__nv_rsqrtf.exit
+  %240 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %230), !dbg !61
+  br label %__nv_rsqrtf.exit17, !dbg !61
+
+241:                                              ; preds = %__nv_rsqrtf.exit
+  %242 = tail call float @llvm.nvvm.rsqrt.approx.f(float %230), !dbg !61
+  br label %__nv_rsqrtf.exit17, !dbg !61
+
+__nv_rsqrtf.exit17:                               ; preds = %239, %241
+  %.0.i16 = phi float [ %240, %239 ], [ %242, %241 ], !dbg !61
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !62
+  %243 = lshr exact i32 %15, 3, !dbg !62
+  %244 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %243, !dbg !62
+  store float %.0.i, ptr addrspace(3) %244, align 4, !dbg !62
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !62
+  %245 = shl nuw nsw i32 %16, 2, !dbg !62
+  %246 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %245, !dbg !62
+  %247 = load float, ptr addrspace(3) %246, align 4, !dbg !62
+  %248 = zext nneg i32 %202 to i64, !dbg !63
+  %249 = getelementptr bfloat, ptr addrspace(1) %3, i64 %248, !dbg !63
+  %250 = zext nneg i32 %203 to i64, !dbg !63
+  %251 = getelementptr bfloat, ptr addrspace(1) %3, i64 %250, !dbg !63
+  %252 = zext nneg i32 %204 to i64, !dbg !63
+  %253 = getelementptr bfloat, ptr addrspace(1) %3, i64 %252, !dbg !63
+  %254 = zext nneg i32 %205 to i64, !dbg !63
+  %255 = getelementptr bfloat, ptr addrspace(1) %3, i64 %254, !dbg !63
+  %256 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !64
+  %257 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %249, i64 %256, i1 %197) #6, !dbg !64
+  %258 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !64
+  %259 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %251, i64 %258, i1 %197) #6, !dbg !64
+  %260 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !64
+  %261 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %253, i64 %260, i1 %197) #6, !dbg !64
+  %262 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !64
+  %263 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %255, i64 %262, i1 %197) #6, !dbg !64
+  %264 = icmp ne i32 %121, 0, !dbg !65
+  %265 = or disjoint i32 %208, %198, !dbg !66
+  %266 = or disjoint i32 %208, %199, !dbg !66
+  %267 = or disjoint i32 %208, %200, !dbg !66
+  %268 = or disjoint i32 %208, %201, !dbg !66
+  %269 = sext i32 %265 to i64, !dbg !67
+  %270 = getelementptr bfloat, ptr addrspace(1) %2, i64 %269, !dbg !67
+  %271 = sext i32 %266 to i64, !dbg !67
+  %272 = getelementptr bfloat, ptr addrspace(1) %2, i64 %271, !dbg !67
+  %273 = sext i32 %267 to i64, !dbg !67
+  %274 = getelementptr bfloat, ptr addrspace(1) %2, i64 %273, !dbg !67
+  %275 = sext i32 %268 to i64, !dbg !67
+  %276 = getelementptr bfloat, ptr addrspace(1) %2, i64 %275, !dbg !67
+  %277 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !68
+  %278 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %270, i64 %277, i1 %264) #6, !dbg !68
+  %279 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !68
+  %280 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %272, i64 %279, i1 %264) #6, !dbg !68
+  %281 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !68
+  %282 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %274, i64 %281, i1 %264) #6, !dbg !68
+  %283 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !68
+  %284 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %276, i64 %283, i1 %264) #6, !dbg !68
+  %285 = zext nneg i32 %198 to i64, !dbg !69
+  %286 = getelementptr bfloat, ptr addrspace(1) %3, i64 %285, !dbg !69
+  %287 = zext nneg i32 %199 to i64, !dbg !69
+  %288 = getelementptr bfloat, ptr addrspace(1) %3, i64 %287, !dbg !69
+  %289 = zext nneg i32 %200 to i64, !dbg !69
+  %290 = getelementptr bfloat, ptr addrspace(1) %3, i64 %289, !dbg !69
+  %291 = zext nneg i32 %201 to i64, !dbg !69
+  %292 = getelementptr bfloat, ptr addrspace(1) %3, i64 %291, !dbg !69
+  %293 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !70
+  %294 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %286, i64 %293, i1 %264) #6, !dbg !70
+  %295 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !70
+  %296 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %288, i64 %295, i1 %264) #6, !dbg !70
+  %297 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !70
+  %298 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %290, i64 %297, i1 %264) #6, !dbg !70
+  %299 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !70
+  %300 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %292, i64 %299, i1 %264) #6, !dbg !70
+  %301 = fmul float %.0.i16, %63, !dbg !71
+  %302 = fmul float %.0.i16, %64, !dbg !71
+  %303 = fmul float %.0.i16, %65, !dbg !71
+  %304 = fmul float %.0.i16, %66, !dbg !71
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !72
+  %305 = shl nuw nsw i32 %23, 2, !dbg !72
+  %306 = select i1 %.not1, i32 0, i32 320, !dbg !72
+  %307 = xor i32 %306, %305, !dbg !72
+  %308 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %307, !dbg !72
+  store <2 x float> %127, ptr addrspace(3) %308, align 8, !dbg !72
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !72
+  %309 = shl nuw nsw i32 %19, 3, !dbg !72
+  %310 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %309, !dbg !72
+  %311 = load float, ptr addrspace(3) %310, align 8, !dbg !72
+  %312 = getelementptr inbounds nuw i8, ptr addrspace(3) %310, i32 4, !dbg !72
+  %313 = load float, ptr addrspace(3) %312, align 4, !dbg !72
+  %314 = xor i32 %309, 320, !dbg !72
+  %315 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %314, !dbg !72
+  %316 = load float, ptr addrspace(3) %315, align 8, !dbg !72
+  %317 = getelementptr inbounds nuw i8, ptr addrspace(3) %315, i32 4, !dbg !72
+  %318 = load float, ptr addrspace(3) %317, align 4, !dbg !72
+  %319 = fmul float %301, %311, !dbg !72
+  %320 = fmul float %302, %313, !dbg !72
+  %321 = fmul float %303, %316, !dbg !72
+  %322 = fmul float %304, %318, !dbg !72
+  %323 = fmul float %319, %138, !dbg !73
+  %324 = fmul float %320, %139, !dbg !73
+  %325 = fmul float %321, %140, !dbg !73
+  %326 = fmul float %322, %141, !dbg !73
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !73
+  store float %323, ptr addrspace(3) %149, align 4, !dbg !73
+  store float %324, ptr addrspace(3) %152, align 4, !dbg !73
+  store float %325, ptr addrspace(3) %155, align 4, !dbg !73
+  store float %326, ptr addrspace(3) %158, align 4, !dbg !73
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !73
+  %327 = load float, ptr addrspace(3) %166, align 4, !dbg !73
+  %328 = load float, ptr addrspace(3) %169, align 4, !dbg !73
+  %329 = load float, ptr addrspace(3) %172, align 4, !dbg !73
+  %330 = load float, ptr addrspace(3) %175, align 4, !dbg !73
+  %331 = add i32 %208, 4097, !dbg !74
+  %332 = or disjoint i32 %331, %198, !dbg !75
+  %333 = add i32 %208, 4129, !dbg !74
+  %334 = or disjoint i32 %333, %198, !dbg !75
+  %335 = add i32 %208, 4161, !dbg !74
+  %336 = or disjoint i32 %335, %198, !dbg !75
+  %337 = add i32 %208, 4193, !dbg !74
+  %338 = or disjoint i32 %337, %198, !dbg !75
+  %339 = sext i32 %332 to i64, !dbg !76
+  %340 = getelementptr bfloat, ptr addrspace(1) %2, i64 %339, !dbg !76
+  %341 = sext i32 %334 to i64, !dbg !76
+  %342 = getelementptr bfloat, ptr addrspace(1) %2, i64 %341, !dbg !76
+  %343 = sext i32 %336 to i64, !dbg !76
+  %344 = getelementptr bfloat, ptr addrspace(1) %2, i64 %343, !dbg !76
+  %345 = sext i32 %338 to i64, !dbg !76
+  %346 = getelementptr bfloat, ptr addrspace(1) %2, i64 %345, !dbg !76
+  %347 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !77
+  %348 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %340, i64 %347, i1 %197) #6, !dbg !77
+  %349 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !77
+  %350 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %342, i64 %349, i1 %197) #6, !dbg !77
+  %351 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !77
+  %352 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %344, i64 %351, i1 %197) #6, !dbg !77
+  %353 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !77
+  %354 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %346, i64 %353, i1 %197) #6, !dbg !77
+  %355 = tail call float @llvm.nvvm.div.full(float %97, float 1.280000e+02), !dbg !78
+  %356 = fadd float %355, 0x3EB0C6F7A0000000, !dbg !79
+  %357 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !80
+  %.not.i18 = icmp eq i32 %357, 0, !dbg !80
+  br i1 %.not.i18, label %360, label %358, !dbg !80
+
+358:                                              ; preds = %__nv_rsqrtf.exit17
+  %359 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %356), !dbg !80
+  br label %__nv_rsqrtf.exit20, !dbg !80
+
+360:                                              ; preds = %__nv_rsqrtf.exit17
+  %361 = tail call float @llvm.nvvm.rsqrt.approx.f(float %356), !dbg !80
+  br label %__nv_rsqrtf.exit20, !dbg !80
+
+__nv_rsqrtf.exit20:                               ; preds = %358, %360
+  %.0.i19 = phi float [ %359, %358 ], [ %361, %360 ], !dbg !80
+  %362 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !80
+  %363 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !80
+  %364 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !80
+  %.not.i27 = icmp eq i32 %364, 0, !dbg !80
+  br i1 %.not.i27, label %367, label %365, !dbg !80
+
+365:                                              ; preds = %__nv_rsqrtf.exit20
+  %366 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %356), !dbg !80
+  br label %__nv_rsqrtf.exit29, !dbg !80
+
+367:                                              ; preds = %__nv_rsqrtf.exit20
+  %368 = tail call float @llvm.nvvm.rsqrt.approx.f(float %356), !dbg !80
+  br label %__nv_rsqrtf.exit29, !dbg !80
+
+__nv_rsqrtf.exit29:                               ; preds = %365, %367
+  %.0.i28 = phi float [ %366, %365 ], [ %368, %367 ], !dbg !80
+  %369 = bitcast i16 %354 to bfloat, !dbg !77
+  %370 = fpext bfloat %369 to float, !dbg !81
+  %371 = bitcast i16 %352 to bfloat, !dbg !77
+  %372 = fpext bfloat %371 to float, !dbg !81
+  %373 = bitcast i16 %350 to bfloat, !dbg !77
+  %374 = fpext bfloat %373 to float, !dbg !81
+  %375 = bitcast i16 %348 to bfloat, !dbg !77
+  %376 = fpext bfloat %375 to float, !dbg !81
+  %377 = bitcast i16 %228 to bfloat, !dbg !58
+  %378 = fpext bfloat %377 to float, !dbg !82
+  %379 = fmul float %247, %378, !dbg !62
+  %380 = bitcast i16 %263 to bfloat, !dbg !64
+  %381 = fpext bfloat %380 to float, !dbg !83
+  %382 = fmul float %379, %381, !dbg !84
+  %383 = fsub float 0.000000e+00, %382, !dbg !85
+  %384 = bitcast i16 %284 to bfloat, !dbg !68
+  %385 = fpext bfloat %384 to float, !dbg !86
+  %386 = fmul float %247, %385, !dbg !87
+  %387 = bitcast i16 %300 to bfloat, !dbg !70
+  %388 = fpext bfloat %387 to float, !dbg !88
+  %389 = fmul float %386, %388, !dbg !89
+  %390 = select i1 %197, float %383, float %389, !dbg !90
+  %391 = fmul float %191, %390, !dbg !91
+  %392 = fadd float %391, %330, !dbg !92
+  %393 = bitcast i16 %226 to bfloat, !dbg !58
+  %394 = fpext bfloat %393 to float, !dbg !82
+  %395 = fmul float %247, %394, !dbg !62
+  %396 = bitcast i16 %261 to bfloat, !dbg !64
+  %397 = fpext bfloat %396 to float, !dbg !83
+  %398 = fmul float %395, %397, !dbg !84
+  %399 = fsub float 0.000000e+00, %398, !dbg !85
+  %400 = bitcast i16 %282 to bfloat, !dbg !68
+  %401 = fpext bfloat %400 to float, !dbg !86
+  %402 = fmul float %247, %401, !dbg !87
+  %403 = bitcast i16 %298 to bfloat, !dbg !70
+  %404 = fpext bfloat %403 to float, !dbg !88
+  %405 = fmul float %402, %404, !dbg !89
+  %406 = select i1 %197, float %399, float %405, !dbg !90
+  %407 = fmul float %190, %406, !dbg !91
+  %408 = fadd float %407, %329, !dbg !92
+  %409 = bitcast i16 %224 to bfloat, !dbg !58
+  %410 = fpext bfloat %409 to float, !dbg !82
+  %411 = fmul float %247, %410, !dbg !62
+  %412 = bitcast i16 %259 to bfloat, !dbg !64
+  %413 = fpext bfloat %412 to float, !dbg !83
+  %414 = fmul float %411, %413, !dbg !84
+  %415 = fsub float 0.000000e+00, %414, !dbg !85
+  %416 = bitcast i16 %280 to bfloat, !dbg !68
+  %417 = fpext bfloat %416 to float, !dbg !86
+  %418 = fmul float %247, %417, !dbg !87
+  %419 = bitcast i16 %296 to bfloat, !dbg !70
+  %420 = fpext bfloat %419 to float, !dbg !88
+  %421 = fmul float %418, %420, !dbg !89
+  %422 = select i1 %197, float %415, float %421, !dbg !90
+  %423 = fmul float %189, %422, !dbg !91
+  %424 = fadd float %423, %328, !dbg !92
+  %425 = bitcast i16 %222 to bfloat, !dbg !58
+  %426 = fpext bfloat %425 to float, !dbg !82
+  %427 = fmul float %247, %426, !dbg !62
+  %428 = bitcast i16 %257 to bfloat, !dbg !64
+  %429 = fpext bfloat %428 to float, !dbg !83
+  %430 = fmul float %427, %429, !dbg !84
+  %431 = fsub float 0.000000e+00, %430, !dbg !85
+  %432 = bitcast i16 %278 to bfloat, !dbg !68
+  %433 = fpext bfloat %432 to float, !dbg !86
+  %434 = fmul float %247, %433, !dbg !87
+  %435 = bitcast i16 %294 to bfloat, !dbg !70
+  %436 = fpext bfloat %435 to float, !dbg !88
+  %437 = fmul float %434, %436, !dbg !89
+  %438 = select i1 %197, float %431, float %437, !dbg !90
+  %439 = fmul float %188, %438, !dbg !91
+  %440 = fadd float %439, %327, !dbg !92
+  %441 = bitcast i32 %196 to <2 x bfloat>, !dbg !51
+  %442 = extractelement <2 x bfloat> %441, i64 1, !dbg !51
+  %443 = fpext bfloat %442 to float, !dbg !93
+  %444 = extractelement <2 x bfloat> %441, i64 0, !dbg !51
+  %445 = fpext bfloat %444 to float, !dbg !93
+  %446 = extractvalue { i32, i32 } %193, 1, !dbg !49
+  %447 = bitcast i32 %446 to <2 x bfloat>, !dbg !49
+  %448 = extractelement <2 x bfloat> %447, i64 1, !dbg !49
+  %449 = fpext bfloat %448 to float, !dbg !94
+  %450 = extractelement <2 x bfloat> %447, i64 0, !dbg !49
+  %451 = fpext bfloat %450 to float, !dbg !94
+  %452 = extractvalue { i32, i32 } %193, 0, !dbg !49
+  %453 = bitcast i32 %452 to <2 x bfloat>, !dbg !49
+  %454 = extractelement <2 x bfloat> %453, i64 1, !dbg !49
+  %455 = fpext bfloat %454 to float, !dbg !94
+  %456 = extractelement <2 x bfloat> %453, i64 0, !dbg !49
+  %457 = fpext bfloat %456 to float, !dbg !94
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !95
+  store float %.0.i19, ptr addrspace(3) %244, align 4, !dbg !95
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !95
+  %458 = load float, ptr addrspace(3) %246, align 4, !dbg !95
+  %459 = fmul float %458, %376, !dbg !95
+  %460 = fmul float %458, %374, !dbg !95
+  %461 = fmul float %458, %372, !dbg !95
+  %462 = fmul float %458, %370, !dbg !95
+  %463 = getelementptr bfloat, ptr addrspace(1) %6, i64 %248, !dbg !96
+  %464 = getelementptr bfloat, ptr addrspace(1) %6, i64 %250, !dbg !96
+  %465 = getelementptr bfloat, ptr addrspace(1) %6, i64 %252, !dbg !96
+  %466 = getelementptr bfloat, ptr addrspace(1) %6, i64 %254, !dbg !96
+  %467 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !97
+  %468 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %463, i64 %467, i1 %197) #6, !dbg !97
+  %469 = bitcast i16 %468 to bfloat, !dbg !97
+  %470 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !97
+  %471 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %464, i64 %470, i1 %197) #6, !dbg !97
+  %472 = bitcast i16 %471 to bfloat, !dbg !97
+  %473 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !97
+  %474 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %465, i64 %473, i1 %197) #6, !dbg !97
+  %475 = bitcast i16 %474 to bfloat, !dbg !97
+  %476 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !97
+  %477 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %466, i64 %476, i1 %197) #6, !dbg !97
+  %478 = bitcast i16 %477 to bfloat, !dbg !97
+  %479 = fpext bfloat %469 to float, !dbg !98
+  %480 = fpext bfloat %472 to float, !dbg !98
+  %481 = fpext bfloat %475 to float, !dbg !98
+  %482 = fpext bfloat %478 to float, !dbg !98
+  %483 = fmul float %459, %479, !dbg !99
+  %484 = fmul float %460, %480, !dbg !99
+  %485 = fmul float %461, %481, !dbg !99
+  %486 = fmul float %462, %482, !dbg !99
+  %487 = fsub float 0.000000e+00, %483, !dbg !100
+  %488 = fsub float 0.000000e+00, %484, !dbg !100
+  %489 = fsub float 0.000000e+00, %485, !dbg !100
+  %490 = fsub float 0.000000e+00, %486, !dbg !100
+  %491 = add i32 %208, 4096, !dbg !101
+  %492 = or disjoint i32 %491, %198, !dbg !102
+  %493 = add i32 %208, 4128, !dbg !101
+  %494 = or disjoint i32 %493, %198, !dbg !102
+  %495 = add i32 %208, 4160, !dbg !101
+  %496 = or disjoint i32 %495, %198, !dbg !102
+  %497 = add i32 %208, 4192, !dbg !101
+  %498 = or disjoint i32 %497, %198, !dbg !102
+  %499 = sext i32 %492 to i64, !dbg !103
+  %500 = getelementptr bfloat, ptr addrspace(1) %2, i64 %499, !dbg !103
+  %501 = sext i32 %494 to i64, !dbg !103
+  %502 = getelementptr bfloat, ptr addrspace(1) %2, i64 %501, !dbg !103
+  %503 = sext i32 %496 to i64, !dbg !103
+  %504 = getelementptr bfloat, ptr addrspace(1) %2, i64 %503, !dbg !103
+  %505 = sext i32 %498 to i64, !dbg !103
+  %506 = getelementptr bfloat, ptr addrspace(1) %2, i64 %505, !dbg !103
+  %507 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !104
+  %508 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %500, i64 %507, i1 %264) #6, !dbg !104
+  %509 = bitcast i16 %508 to bfloat, !dbg !104
+  %510 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !104
+  %511 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %502, i64 %510, i1 %264) #6, !dbg !104
+  %512 = bitcast i16 %511 to bfloat, !dbg !104
+  %513 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !104
+  %514 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %504, i64 %513, i1 %264) #6, !dbg !104
+  %515 = bitcast i16 %514 to bfloat, !dbg !104
+  %516 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !104
+  %517 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %506, i64 %516, i1 %264) #6, !dbg !104
+  %518 = bitcast i16 %517 to bfloat, !dbg !104
+  %519 = fpext bfloat %509 to float, !dbg !105
+  %520 = fpext bfloat %512 to float, !dbg !105
+  %521 = fpext bfloat %515 to float, !dbg !105
+  %522 = fpext bfloat %518 to float, !dbg !105
+  %523 = fmul float %458, %519, !dbg !106
+  %524 = fmul float %458, %520, !dbg !106
+  %525 = fmul float %458, %521, !dbg !106
+  %526 = fmul float %458, %522, !dbg !106
+  %527 = getelementptr bfloat, ptr addrspace(1) %6, i64 %285, !dbg !107
+  %528 = getelementptr bfloat, ptr addrspace(1) %6, i64 %287, !dbg !107
+  %529 = getelementptr bfloat, ptr addrspace(1) %6, i64 %289, !dbg !107
+  %530 = getelementptr bfloat, ptr addrspace(1) %6, i64 %291, !dbg !107
+  %531 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !108
+  %532 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %527, i64 %531, i1 %264) #6, !dbg !108
+  %533 = bitcast i16 %532 to bfloat, !dbg !108
+  %534 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !108
+  %535 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %528, i64 %534, i1 %264) #6, !dbg !108
+  %536 = bitcast i16 %535 to bfloat, !dbg !108
+  %537 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !108
+  %538 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %529, i64 %537, i1 %264) #6, !dbg !108
+  %539 = bitcast i16 %538 to bfloat, !dbg !108
+  %540 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !108
+  %541 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %530, i64 %540, i1 %264) #6, !dbg !108
+  %542 = bitcast i16 %541 to bfloat, !dbg !108
+  %543 = fpext bfloat %533 to float, !dbg !109
+  %544 = fpext bfloat %536 to float, !dbg !109
+  %545 = fpext bfloat %539 to float, !dbg !109
+  %546 = fpext bfloat %542 to float, !dbg !109
+  %547 = fmul float %523, %543, !dbg !110
+  %548 = fmul float %524, %544, !dbg !110
+  %549 = fmul float %525, %545, !dbg !110
+  %550 = fmul float %526, %546, !dbg !110
+  %551 = select i1 %197, float %487, float %547, !dbg !90
+  %552 = select i1 %197, float %488, float %548, !dbg !90
+  %553 = select i1 %197, float %489, float %549, !dbg !90
+  %554 = select i1 %197, float %490, float %550, !dbg !90
+  %555 = fmul float %.0.i28, %457, !dbg !111
+  %556 = fmul float %.0.i28, %455, !dbg !111
+  %557 = fmul float %.0.i28, %451, !dbg !111
+  %558 = fmul float %.0.i28, %449, !dbg !111
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !111
+  store float %555, ptr addrspace(3) %149, align 4, !dbg !111
+  store float %556, ptr addrspace(3) %152, align 4, !dbg !111
+  store float %557, ptr addrspace(3) %155, align 4, !dbg !111
+  store float %558, ptr addrspace(3) %158, align 4, !dbg !111
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !111
+  %559 = load float, ptr addrspace(3) %166, align 4, !dbg !111
+  %560 = load float, ptr addrspace(3) %169, align 4, !dbg !111
+  %561 = load float, ptr addrspace(3) %172, align 4, !dbg !111
+  %562 = load float, ptr addrspace(3) %175, align 4, !dbg !111
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !112
+  %563 = shl nuw nsw i32 %14, 3, !dbg !112
+  %564 = and i32 %563, 120, !dbg !112
+  %565 = lshr i32 %14, 2, !dbg !112
+  %566 = and i32 %565, 4, !dbg !112
+  %567 = shl nuw nsw i32 %15, 2, !dbg !112
+  %568 = or disjoint i32 %566, %567, !dbg !112
+  %569 = or disjoint i32 %568, %564, !dbg !112
+  %570 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %569, !dbg !112
+  store float %445, ptr addrspace(3) %570, align 4, !dbg !112
+  %571 = xor i32 %569, 320, !dbg !112
+  %572 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %571, !dbg !112
+  store float %443, ptr addrspace(3) %572, align 4, !dbg !112
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !112
+  %573 = and i32 %21, 120, !dbg !112
+  %574 = and i32 %14, 2, !dbg !112
+  %575 = icmp eq i32 %574, 0, !dbg !112
+  %576 = select i1 %575, i32 0, i32 320, !dbg !112
+  %577 = xor i32 %576, %573, !dbg !112
+  %578 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %577, !dbg !112
+  %579 = load float, ptr addrspace(3) %578, align 8, !dbg !112
+  %580 = getelementptr inbounds nuw i8, ptr addrspace(3) %578, i32 4, !dbg !112
+  %581 = load float, ptr addrspace(3) %580, align 4, !dbg !112
+  %582 = getelementptr inbounds nuw i8, ptr addrspace(3) %578, i32 128, !dbg !112
+  %583 = load float, ptr addrspace(3) %582, align 8, !dbg !112
+  %584 = getelementptr inbounds nuw i8, ptr addrspace(3) %578, i32 132, !dbg !112
+  %585 = load float, ptr addrspace(3) %584, align 4, !dbg !112
+  %586 = fmul float %559, %579, !dbg !113
+  %587 = fmul float %560, %581, !dbg !113
+  %588 = fmul float %561, %583, !dbg !113
+  %589 = fmul float %562, %585, !dbg !113
+  %590 = fmul float %167, %586, !dbg !112
+  %591 = fmul float %170, %587, !dbg !112
+  %592 = fmul float %173, %588, !dbg !112
+  %593 = fmul float %176, %589, !dbg !112
+  %594 = fmul float %188, %551, !dbg !114
+  %595 = fmul float %189, %552, !dbg !114
+  %596 = fmul float %190, %553, !dbg !114
+  %597 = fmul float %191, %554, !dbg !114
+  %598 = fadd float %594, %590, !dbg !115
+  %599 = fadd float %595, %591, !dbg !115
+  %600 = fadd float %596, %592, !dbg !115
+  %601 = fadd float %597, %593, !dbg !115
+  %602 = shl i32 %17, 7, !dbg !116
+  %603 = or disjoint i32 %602, %20, !dbg !117
+  %604 = sext i32 %603 to i64, !dbg !118
+  %605 = getelementptr bfloat, ptr addrspace(1) %0, i64 %604, !dbg !118
+  %606 = fptrunc float %440 to bfloat, !dbg !119
+  %607 = fptrunc float %424 to bfloat, !dbg !119
+  %608 = fptrunc float %408 to bfloat, !dbg !119
+  %609 = fptrunc float %392 to bfloat, !dbg !119
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119
+  %610 = and i32 %14, 3, !dbg !119
+  %611 = and i32 %14, 28, !dbg !119
+  %612 = lshr exact i32 %15, 4, !dbg !119
+  %613 = mul nuw nsw i32 %610, 160, !dbg !119
+  %614 = or disjoint i32 %613, %612, !dbg !119
+  %615 = or disjoint i32 %614, %611, !dbg !119
+  %616 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %615, !dbg !119
+  store bfloat %606, ptr addrspace(3) %616, align 2, !dbg !119
+  %617 = xor i32 %615, 32, !dbg !119
+  %618 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %617, !dbg !119
+  store bfloat %607, ptr addrspace(3) %618, align 2, !dbg !119
+  %619 = xor i32 %615, 64, !dbg !119
+  %620 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %619, !dbg !119
+  store bfloat %608, ptr addrspace(3) %620, align 2, !dbg !119
+  %621 = xor i32 %615, 96, !dbg !119
+  %622 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %621, !dbg !119
+  store bfloat %609, ptr addrspace(3) %622, align 2, !dbg !119
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119
+  %623 = shl nuw nsw i32 %610, 3, !dbg !119
+  %624 = shl nuw nsw i32 %144, 2, !dbg !119
+  %625 = and i32 %24, 2, !dbg !119
+  %626 = select i1 %.not, i32 0, i32 160, !dbg !119
+  %627 = or disjoint i32 %623, %624, !dbg !119
+  %628 = xor i32 %627, %626, !dbg !119
+  %629 = or disjoint i32 %628, %625, !dbg !119
+  %630 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %629, !dbg !119
+  %631 = load bfloat, ptr addrspace(3) %630, align 2, !dbg !119
+  %632 = getelementptr inbounds nuw i8, ptr addrspace(3) %630, i32 4, !dbg !119
+  %633 = load bfloat, ptr addrspace(3) %632, align 2, !dbg !119
+  %634 = xor i32 %629, 320, !dbg !119
+  %635 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %634, !dbg !119
+  %636 = load bfloat, ptr addrspace(3) %635, align 2, !dbg !119
+  %637 = getelementptr inbounds nuw i8, ptr addrspace(3) %635, i32 4, !dbg !119
+  %638 = load bfloat, ptr addrspace(3) %637, align 2, !dbg !119
+  %639 = insertelement <2 x bfloat> poison, bfloat %631, i64 0, !dbg !119
+  %640 = insertelement <2 x bfloat> %639, bfloat %636, i64 1, !dbg !119
+  %641 = bitcast <2 x bfloat> %640 to i32, !dbg !119
+  %642 = insertelement <2 x bfloat> poison, bfloat %633, i64 0, !dbg !119
+  %643 = insertelement <2 x bfloat> %642, bfloat %638, i64 1, !dbg !119
+  %644 = bitcast <2 x bfloat> %643 to i32, !dbg !119
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %641, i32 %644, ptr addrspace(1) %605, i1 true) #6, !dbg !119
+  %645 = getelementptr bfloat, ptr addrspace(1) %1, i64 %604, !dbg !120
+  %646 = fptrunc float %598 to bfloat, !dbg !121
+  %647 = fptrunc float %599 to bfloat, !dbg !121
+  %648 = fptrunc float %600 to bfloat, !dbg !121
+  %649 = fptrunc float %601 to bfloat, !dbg !121
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !121
+  store bfloat %646, ptr addrspace(3) %616, align 2, !dbg !121
+  store bfloat %647, ptr addrspace(3) %618, align 2, !dbg !121
+  store bfloat %648, ptr addrspace(3) %620, align 2, !dbg !121
+  store bfloat %649, ptr addrspace(3) %622, align 2, !dbg !121
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !121
+  %650 = load bfloat, ptr addrspace(3) %630, align 2, !dbg !121
+  %651 = load bfloat, ptr addrspace(3) %632, align 2, !dbg !121
+  %652 = load bfloat, ptr addrspace(3) %635, align 2, !dbg !121
+  %653 = load bfloat, ptr addrspace(3) %637, align 2, !dbg !121
+  %654 = insertelement <2 x bfloat> poison, bfloat %650, i64 0, !dbg !121
+  %655 = insertelement <2 x bfloat> %654, bfloat %652, i64 1, !dbg !121
+  %656 = bitcast <2 x bfloat> %655 to i32, !dbg !121
+  %657 = insertelement <2 x bfloat> poison, bfloat %651, i64 0, !dbg !121
+  %658 = insertelement <2 x bfloat> %657, bfloat %653, i64 1, !dbg !121
+  %659 = bitcast <2 x bfloat> %658 to i32, !dbg !121
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %656, i32 %659, ptr addrspace(1) %645, i1 true) #6, !dbg !121
+  ret void, !dbg !122
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #4
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #4
+
+attributes #0 = { nounwind "nvvm.reqntid"="64" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { convergent nocallback nounwind }
+attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!5 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", linkageName: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 23, column: 28, scope: !5)
+!9 = !DILocation(line: 23, column: 33, scope: !5)
+!10 = !DILocation(line: 24, column: 44, scope: !5)
+!11 = !DILocation(line: 24, column: 23, scope: !5)
+!12 = !DILocation(line: 26, column: 37, scope: !5)
+!13 = !DILocation(line: 29, column: 19, scope: !5)
+!14 = !DILocation(line: 28, column: 19, scope: !5)
+!15 = !DILocation(line: 39, column: 41, scope: !5)
+!16 = !DILocation(line: 39, column: 52, scope: !5)
+!17 = !DILocation(line: 39, column: 48, scope: !5)
+!18 = !DILocation(line: 39, column: 63, scope: !5)
+!19 = !DILocation(line: 39, column: 57, scope: !5)
+!20 = !DILocation(line: 39, column: 34, scope: !5)
+!21 = !DILocation(line: 39, column: 68, scope: !5)
+!22 = !DILocation(line: 39, column: 121, scope: !5)
+!23 = !DILocation(line: 40, column: 41, scope: !5)
+!24 = !DILocation(line: 40, column: 50, scope: !5)
+!25 = !DILocation(line: 40, column: 34, scope: !5)
+!26 = !DILocation(line: 40, column: 61, scope: !5)
+!27 = !DILocation(line: 40, column: 114, scope: !5)
+!28 = !DILocation(line: 42, column: 22, scope: !5)
+!29 = !DILocation(line: 47, column: 22, scope: !5)
+!30 = !DILocation(line: 263, column: 15, scope: !31, inlinedAt: !33)
+!31 = distinct !DILexicalBlockFile(scope: !5, file: !32, discriminator: 0)
+!32 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!33 = !DILocation(line: 293, column: 36, scope: !31, inlinedAt: !34)
+!34 = !DILocation(line: 51, column: 25, scope: !35)
+!35 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0)
+!36 = !DILocation(line: 263, column: 15, scope: !31, inlinedAt: !37)
+!37 = !DILocation(line: 293, column: 36, scope: !31, inlinedAt: !38)
+!38 = !DILocation(line: 52, column: 27, scope: !35)
+!39 = !DILocation(line: 58, column: 27, scope: !5)
+!40 = !DILocation(line: 62, column: 35, scope: !5)
+!41 = !DILocation(line: 62, column: 42, scope: !5)
+!42 = !DILocation(line: 62, column: 95, scope: !5)
+!43 = !DILocation(line: 63, column: 46, scope: !5)
+!44 = !DILocation(line: 63, column: 42, scope: !5)
+!45 = !DILocation(line: 63, column: 35, scope: !5)
+!46 = !DILocation(line: 63, column: 51, scope: !5)
+!47 = !DILocation(line: 64, column: 35, scope: !5)
+!48 = !DILocation(line: 64, column: 51, scope: !5)
+!49 = !DILocation(line: 65, column: 69, scope: !5)
+!50 = !DILocation(line: 66, column: 36, scope: !5)
+!51 = !DILocation(line: 66, column: 43, scope: !5)
+!52 = !DILocation(line: 71, column: 24, scope: !5)
+!53 = !DILocation(line: 72, column: 41, scope: !5)
+!54 = !DILocation(line: 72, column: 39, scope: !5)
+!55 = !DILocation(line: 72, column: 48, scope: !5)
+!56 = !DILocation(line: 72, column: 57, scope: !5)
+!57 = !DILocation(line: 72, column: 35, scope: !5)
+!58 = !DILocation(line: 72, column: 68, scope: !5)
+!59 = !DILocation(line: 75, column: 25, scope: !5)
+!60 = !DILocation(line: 77, column: 24, scope: !5)
+!61 = !DILocation(line: 78, column: 32, scope: !5)
+!62 = !DILocation(line: 79, column: 24, scope: !5)
+!63 = !DILocation(line: 80, column: 35, scope: !5)
+!64 = !DILocation(line: 80, column: 85, scope: !5)
+!65 = !DILocation(line: 87, column: 25, scope: !5)
+!66 = !DILocation(line: 90, column: 53, scope: !5)
+!67 = !DILocation(line: 90, column: 35, scope: !5)
+!68 = !DILocation(line: 90, column: 64, scope: !5)
+!69 = !DILocation(line: 98, column: 35, scope: !5)
+!70 = !DILocation(line: 98, column: 81, scope: !5)
+!71 = !DILocation(line: 111, column: 24, scope: !5)
+!72 = !DILocation(line: 113, column: 24, scope: !5)
+!73 = !DILocation(line: 116, column: 24, scope: !5)
+!74 = !DILocation(line: 121, column: 51, scope: !5)
+!75 = !DILocation(line: 121, column: 60, scope: !5)
+!76 = !DILocation(line: 121, column: 35, scope: !5)
+!77 = !DILocation(line: 121, column: 71, scope: !5)
+!78 = !DILocation(line: 123, column: 24, scope: !5)
+!79 = !DILocation(line: 124, column: 24, scope: !5)
+!80 = !DILocation(line: 125, column: 32, scope: !5)
+!81 = !DILocation(line: 121, column: 132, scope: !5)
+!82 = !DILocation(line: 72, column: 129, scope: !5)
+!83 = !DILocation(line: 80, column: 146, scope: !5)
+!84 = !DILocation(line: 82, column: 24, scope: !5)
+!85 = !DILocation(line: 84, column: 17, scope: !5)
+!86 = !DILocation(line: 90, column: 125, scope: !5)
+!87 = !DILocation(line: 97, column: 24, scope: !5)
+!88 = !DILocation(line: 98, column: 142, scope: !5)
+!89 = !DILocation(line: 100, column: 24, scope: !5)
+!90 = !DILocation(line: 0, scope: !5)
+!91 = !DILocation(line: 118, column: 24, scope: !5)
+!92 = !DILocation(line: 119, column: 24, scope: !5)
+!93 = !DILocation(line: 66, column: 96, scope: !5)
+!94 = !DILocation(line: 65, column: 123, scope: !5)
+!95 = !DILocation(line: 126, column: 24, scope: !5)
+!96 = !DILocation(line: 127, column: 35, scope: !5)
+!97 = !DILocation(line: 127, column: 85, scope: !5)
+!98 = !DILocation(line: 127, column: 146, scope: !5)
+!99 = !DILocation(line: 129, column: 24, scope: !5)
+!100 = !DILocation(line: 131, column: 17, scope: !5)
+!101 = !DILocation(line: 134, column: 51, scope: !5)
+!102 = !DILocation(line: 134, column: 60, scope: !5)
+!103 = !DILocation(line: 134, column: 35, scope: !5)
+!104 = !DILocation(line: 134, column: 71, scope: !5)
+!105 = !DILocation(line: 134, column: 132, scope: !5)
+!106 = !DILocation(line: 139, column: 24, scope: !5)
+!107 = !DILocation(line: 140, column: 35, scope: !5)
+!108 = !DILocation(line: 140, column: 81, scope: !5)
+!109 = !DILocation(line: 140, column: 142, scope: !5)
+!110 = !DILocation(line: 142, column: 24, scope: !5)
+!111 = !DILocation(line: 151, column: 25, scope: !5)
+!112 = !DILocation(line: 156, column: 26, scope: !5)
+!113 = !DILocation(line: 153, column: 26, scope: !5)
+!114 = !DILocation(line: 158, column: 26, scope: !5)
+!115 = !DILocation(line: 159, column: 26, scope: !5)
+!116 = !DILocation(line: 161, column: 43, scope: !5)
+!117 = !DILocation(line: 161, column: 39, scope: !5)
+!118 = !DILocation(line: 161, column: 32, scope: !5)
+!119 = !DILocation(line: 161, column: 55, scope: !5)
+!120 = !DILocation(line: 162, column: 32, scope: !5)
+!121 = !DILocation(line: 162, column: 56, scope: !5)
+!122 = !DILocation(line: 53, column: 4, scope: !5)
diff --git a/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..7aae02eb020363fadb3f82846e2d73e821f99a1b
--- /dev/null
+++ b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx
@@ -0,0 +1,1435 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 // -- Begin function triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0
+.extern .shared .align 16 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
+                                        // @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0
+.visible .entry triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6,
+	.param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_7,
+	.param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_8,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_9,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_10
+)
+.reqntid 64
+{
+	.reg .pred 	%p<4>;
+	.reg .b16 	%rs<66>;
+	.reg .b32 	%r<335>;
+	.reg .b64 	%rd<96>;
+	.loc	1 18 0                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0
+
+// %bb.0:                               // %__nv_rsqrtf.exit
+	ld.param.b64 	%rd80, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0];
+	ld.param.b64 	%rd81, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1];
+$L__tmp0:
+	.loc	1 23 28                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:28
+	mov.u32 	%r22, %ctaid.x;
+	.loc	1 23 33                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:33
+	shl.b32 	%r23, %r22, 1;
+	ld.param.b64 	%rd82, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2];
+	ld.param.b64 	%rd83, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3];
+	.loc	1 24 44                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44
+	mov.u32 	%r24, %tid.x;
+	and.b32 	%r25, %r24, 32;
+	bfe.s32 	%r26, %r24, 5, 1;
+	ld.param.b64 	%rd84, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4];
+	ld.param.b64 	%rd85, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5];
+	bfe.u32 	%r27, %r24, 5, 1;
+	ld.param.b64 	%rd86, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6];
+	and.b32 	%r28, %r24, 1;
+	neg.s32 	%r29, %r28;
+	.loc	1 24 23                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23
+	or.b32 	%r30, %r27, %r23;
+	or.b32 	%r31, %r23, %r28;
+	.loc	1 26 37                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37
+	and.b32 	%r32, %r24, 31;
+	shl.b32 	%r33, %r32, 2;
+	shl.b32 	%r34, %r24, 1;
+	and.b32 	%r35, %r34, 126;
+	and.b32 	%r36, %r24, 62;
+	shr.u32 	%r37, %r24, 1;
+	.loc	1 29 19                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19
+	bfe.s32 	%r38, %r22, 30, 1;
+	shr.u32 	%r39, %r38, 27;
+	add.s32 	%r40, %r30, %r39;
+	shr.s32 	%r41, %r40, 5;
+	.loc	1 28 19                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:28:19
+	and.b32 	%r42, %r40, 33554400;
+	sub.s32 	%r43, %r30, %r42;
+	.loc	1 29 19                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19
+	add.s32 	%r44, %r31, %r39;
+	.loc	1 39 52                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:52
+	shl.b32 	%r45, %r43, 7;
+	.loc	1 39 48                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:48
+	or.b32 	%r46, %r45, %r33;
+	mad.lo.s32 	%r47, %r41, 36864, %r46;
+	.loc	1 39 57                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:57
+	add.s32 	%r48, %r47, 4096;
+	.loc	1 39 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34
+	mad.wide.s32 	%rd1, %r48, 2, %rd82;
+	.loc	1 39 68                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r3, 0;
+	mov.pred 	%p1, -1;
+	// begin inline asm
+	mov.u32 %r1, %r3;
+	mov.u32 %r2, %r3;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	mov.b32 	{%rs34, %rs35}, %r1;
+	mov.b32 	{%rs36, %rs37}, %r2;
+	.loc	1 39 121                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:121
+	cvt.f32.bf16 	%r49, %rs34;
+	cvt.f32.bf16 	%r50, %rs35;
+	cvt.f32.bf16 	%r51, %rs36;
+	cvt.f32.bf16 	%r52, %rs37;
+	.loc	1 40 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34
+	mad.wide.s32 	%rd3, %r47, 2, %rd82;
+	.loc	1 40 61                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r4, %r3;
+	mov.u32 %r5, %r3;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r4, %r5 }, [ %rd3 + 0 ], %rd4;
+	// end inline asm
+	mov.b32 	{%rs38, %rs39}, %r4;
+	mov.b32 	{%rs40, %rs41}, %r5;
+	.loc	1 40 114                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114
+	cvt.f32.bf16 	%r53, %rs38;
+	cvt.f32.bf16 	%r54, %rs39;
+	cvt.f32.bf16 	%r55, %rs40;
+	cvt.f32.bf16 	%r56, %rs41;
+	.loc	1 42 22                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:42:22
+	mul.f32 	%r57, %r50, %r50;
+	.loc	1 47 22                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:47:22
+	mul.f32 	%r58, %r54, %r54;
+$L__tmp1:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	fma.rn.f32 	%r59, %r49, %r49, %r57;
+	fma.rn.f32 	%r60, %r51, %r51, %r59;
+	fma.rn.f32 	%r61, %r52, %r52, %r60;
+$L__tmp2:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r62, %r61, 16, 31, -1;
+$L__tmp3:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r63, %r61, %r62;
+$L__tmp4:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r64, %r63, 8, 31, -1;
+$L__tmp5:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r65, %r63, %r64;
+$L__tmp6:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r66, %r65, 4, 31, -1;
+$L__tmp7:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r67, %r65, %r66;
+$L__tmp8:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r68, %r67, 2, 31, -1;
+$L__tmp9:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r69, %r67, %r68;
+$L__tmp10:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r70, %r69, 1, 31, -1;
+$L__tmp11:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r71, %r69, %r70;
+$L__tmp12:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	fma.rn.f32 	%r72, %r53, %r53, %r58;
+	fma.rn.f32 	%r73, %r55, %r55, %r72;
+	fma.rn.f32 	%r74, %r56, %r56, %r73;
+$L__tmp13:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r75, %r74, 16, 31, -1;
+$L__tmp14:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r76, %r74, %r75;
+$L__tmp15:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r77, %r76, 8, 31, -1;
+$L__tmp16:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r78, %r76, %r77;
+$L__tmp17:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r79, %r78, 4, 31, -1;
+$L__tmp18:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r80, %r78, %r79;
+$L__tmp19:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r81, %r80, 2, 31, -1;
+$L__tmp20:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r82, %r80, %r81;
+$L__tmp21:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r83, %r82, 1, 31, -1;
+$L__tmp22:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r84, %r82, %r83;
+$L__tmp23:
+	.loc	1 62 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:35
+	mul.wide.u32 	%rd87, %r35, 2;
+	add.s64 	%rd5, %rd83, %rd87;
+	.loc	1 62 42                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:42
+	// begin inline asm
+	mov.u64 %rd6, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r6, %r3;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.b32 { %r6 }, [ %rd5 + 0 ], %rd6;
+	// end inline asm
+	.loc	1 62 95                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:95
+	mov.b32 	{%rs42, %rs43}, %r6;
+	cvt.f32.bf16 	%r85, %rs43;
+	cvt.f32.bf16 	%r86, %rs42;
+	.loc	1 63 46                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:46
+	shl.b32 	%r87, %r41, 7;
+	.loc	1 63 42                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:42
+	or.b32 	%r88, %r87, %r33;
+	.loc	1 63 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:35
+	mul.wide.s32 	%rd88, %r88, 4;
+	add.s64 	%rd7, %rd84, %rd88;
+	.loc	1 63 51                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:51
+	// begin inline asm
+	mov.u64 %rd8, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd8, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r7, %r3;
+	mov.u32 %r8, %r3;
+	mov.u32 %r9, %r3;
+	mov.u32 %r10, %r3;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r7, %r8, %r9, %r10 }, [ %rd7 + 0 ], %rd8;
+	// end inline asm
+	shl.b32 	%r89, %r24, 4;
+	and.b32 	%r90, %r89, 112;
+	and.b32 	%r91, %r24, 24;
+	shr.u32 	%r92, %r91, 1;
+	and.b32 	%r93, %r26, 192;
+	or.b32 	%r94, %r90, %r92;
+	xor.b32 	%r95, %r94, %r93;
+	mov.b32 	%r96, global_smem;
+	add.s32 	%r97, %r96, %r95;
+	st.shared.b32 	[%r97], %r7;
+	xor.b32 	%r98, %r95, 4;
+	add.s32 	%r99, %r96, %r98;
+	st.shared.b32 	[%r99+256], %r8;
+	xor.b32 	%r100, %r95, 8;
+	add.s32 	%r101, %r96, %r100;
+	st.shared.b32 	[%r101+512], %r9;
+	xor.b32 	%r102, %r95, 12;
+	add.s32 	%r103, %r96, %r102;
+	st.shared.b32 	[%r103+768], %r10;
+	bar.sync 	0;
+	shl.b32 	%r104, %r24, 7;
+	and.b32 	%r105, %r104, 768;
+	shl.b32 	%r106, %r36, 1;
+	and.b32 	%r107, %r29, 192;
+	xor.b32 	%r108, %r107, %r106;
+	or.b32 	%r109, %r108, %r105;
+	add.s32 	%r110, %r96, %r109;
+	ld.shared.b32 	%r111, [%r110];
+	xor.b32 	%r112, %r109, 4;
+	add.s32 	%r113, %r96, %r112;
+	ld.shared.b32 	%r114, [%r113];
+	xor.b32 	%r115, %r109, 8;
+	add.s32 	%r116, %r96, %r115;
+	ld.shared.b32 	%r117, [%r116];
+	xor.b32 	%r118, %r109, 12;
+	add.s32 	%r119, %r96, %r118;
+	ld.shared.b32 	%r120, [%r119];
+	.loc	1 64 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:35
+	add.s64 	%rd9, %rd85, %rd88;
+	.loc	1 64 51                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:51
+	// begin inline asm
+	mov.u64 %rd10, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd10, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r11, %r3;
+	mov.u32 %r12, %r3;
+	mov.u32 %r13, %r3;
+	mov.u32 %r14, %r3;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r11, %r12, %r13, %r14 }, [ %rd9 + 0 ], %rd10;
+	// end inline asm
+	bar.sync 	0;
+	st.shared.b32 	[%r97], %r11;
+	st.shared.b32 	[%r99+256], %r12;
+	st.shared.b32 	[%r101+512], %r13;
+	st.shared.b32 	[%r103+768], %r14;
+	bar.sync 	0;
+	ld.shared.b32 	%r121, [%r110];
+	ld.shared.b32 	%r122, [%r113];
+	ld.shared.b32 	%r123, [%r116];
+	ld.shared.b32 	%r124, [%r119];
+	.loc	1 65 69                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69
+	// begin inline asm
+	mov.u64 %rd11, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd11, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r15, %r3;
+	mov.u32 %r16, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r15, %r16 }, [ %rd1 + 0 ], %rd11;
+	// end inline asm
+	.loc	1 66 36                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:36
+	add.s64 	%rd12, %rd86, %rd87;
+	.loc	1 66 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:43
+	// begin inline asm
+	mov.u64 %rd13, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd13, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r17, %r3;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.b32 { %r17 }, [ %rd12 + 0 ], %rd13;
+	// end inline asm
+	.loc	1 71 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:71:24
+	and.b32 	%r125, %r37, 1;
+	setp.ne.b32 	%p3, %r125, 0;
+	not.pred 	%p2, %p3;
+	.loc	1 72 41                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:41
+	and.b32 	%r126, %r37, 30;
+	.loc	1 72 48                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:48
+	shl.b32 	%r127, %r31, 7;
+	shl.b32 	%r128, %r44, 10;
+	and.b32 	%r129, %r128, -32768;
+	add.s32 	%r130, %r129, %r127;
+	.loc	1 72 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35
+	cvt.s64.s32 	%rd89, %r130;
+	cvt.u64.u32 	%rd90, %r126;
+	or.b64 	%rd91, %rd89, %rd90;
+	shl.b64 	%rd92, %rd91, 1;
+	add.s64 	%rd93, %rd82, %rd92;
+	add.s64 	%rd14, %rd93, 2;
+	add.s64 	%rd16, %rd93, 66;
+	add.s64 	%rd18, %rd93, 130;
+	add.s64 	%rd20, %rd93, 194;
+	.loc	1 72 68                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:68
+	// begin inline asm
+	mov.u64 %rd15, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd15, 1.0;
+	// end inline asm
+	mov.b16 	%rs2, 0;
+	// begin inline asm
+	mov.u16 %rs1, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs1 }, [ %rd14 + 0 ], %rd15;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd17, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs3, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs3 }, [ %rd16 + 0 ], %rd17;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd19, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd19, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs4, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs4 }, [ %rd18 + 0 ], %rd19;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd21, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd21, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs5, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs5 }, [ %rd20 + 0 ], %rd21;
+	// end inline asm
+	mov.b32 	%r131, 0f43000000;
+	.loc	1 75 25                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:75:25
+	div.full.f32 	%r132, %r84, %r131;
+	.loc	1 77 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:77:24
+	add.f32 	%r133, %r132, 0f358637BD;
+	.loc	1 78 32                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:78:32
+	rsqrt.approx.ftz.f32 	%r134, %r133;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	bar.sync 	0;
+	shr.u32 	%r135, %r25, 3;
+	add.s32 	%r136, %r96, %r135;
+	st.shared.b32 	[%r136], %r134;
+	bar.sync 	0;
+	shl.b32 	%r137, %r28, 2;
+	add.s32 	%r138, %r96, %r137;
+	ld.shared.b32 	%r139, [%r138];
+	.loc	1 80 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:35
+	mul.wide.u32 	%rd94, %r126, 2;
+	add.s64 	%rd38, %rd83, %rd94;
+	add.s64 	%rd22, %rd38, 2;
+	add.s64 	%rd24, %rd38, 66;
+	add.s64 	%rd26, %rd38, 130;
+	add.s64 	%rd28, %rd38, 194;
+	.loc	1 80 85                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:85
+	// begin inline asm
+	mov.u64 %rd23, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd23, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs6, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs6 }, [ %rd22 + 0 ], %rd23;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd25, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd25, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs7, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs7 }, [ %rd24 + 0 ], %rd25;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd27, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd27, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs8, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs8 }, [ %rd26 + 0 ], %rd27;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd29, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd29, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs9, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd28 + 0 ], %rd29;
+	// end inline asm
+	.loc	1 90 53                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:53
+	or.b32 	%r140, %r130, %r126;
+	.loc	1 90 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:35
+	mad.wide.s32 	%rd30, %r140, 2, %rd82;
+	add.s64 	%rd32, %rd93, 64;
+	add.s64 	%rd34, %rd93, 128;
+	add.s64 	%rd36, %rd93, 192;
+	.loc	1 90 64                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:64
+	// begin inline asm
+	mov.u64 %rd31, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd31, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs10, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs10 }, [ %rd30 + 0 ], %rd31;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd33, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd33, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs11, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd32 + 0 ], %rd33;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd35, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd35, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs12, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd34 + 0 ], %rd35;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd37, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd37, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs13, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd36 + 0 ], %rd37;
+	// end inline asm
+	.loc	1 98 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:35
+	add.s64 	%rd40, %rd38, 64;
+	add.s64 	%rd42, %rd38, 128;
+	add.s64 	%rd44, %rd38, 192;
+	.loc	1 98 81                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:81
+	// begin inline asm
+	mov.u64 %rd39, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd39, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs14, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs14 }, [ %rd38 + 0 ], %rd39;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd41, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd41, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs15, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd40 + 0 ], %rd41;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd43, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd43, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs16, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs16 }, [ %rd42 + 0 ], %rd43;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd45, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd45, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs17, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd44 + 0 ], %rd45;
+	// end inline asm
+	.loc	1 111 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:111:24
+	mul.f32 	%r141, %r134, %r53;
+	mul.f32 	%r142, %r134, %r54;
+	mul.f32 	%r143, %r134, %r55;
+	mul.f32 	%r144, %r134, %r56;
+	.loc	1 113 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:113:24
+	bar.sync 	0;
+	shl.b32 	%r145, %r36, 2;
+	and.b32 	%r146, %r29, 320;
+	xor.b32 	%r147, %r146, %r145;
+	add.s32 	%r148, %r96, %r147;
+	st.shared.v2.b32 	[%r148], {%r86, %r85};
+	bar.sync 	0;
+	shl.b32 	%r149, %r32, 3;
+	add.s32 	%r150, %r96, %r149;
+	ld.shared.v2.b32 	{%r151, %r152}, [%r150];
+	xor.b32 	%r153, %r149, 64;
+	add.s32 	%r154, %r96, %r153;
+	ld.shared.v2.b32 	{%r155, %r156}, [%r154+256];
+	mul.f32 	%r157, %r141, %r151;
+	mul.f32 	%r158, %r142, %r152;
+	mul.f32 	%r159, %r143, %r155;
+	mul.f32 	%r160, %r144, %r156;
+	.loc	1 116 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:116:24
+	mul.f32 	%r161, %r157, %r7;
+	mul.f32 	%r162, %r158, %r8;
+	mul.f32 	%r163, %r159, %r9;
+	mul.f32 	%r164, %r160, %r10;
+	bar.sync 	0;
+	st.shared.b32 	[%r97], %r161;
+	st.shared.b32 	[%r99+256], %r162;
+	st.shared.b32 	[%r101+512], %r163;
+	st.shared.b32 	[%r103+768], %r164;
+	bar.sync 	0;
+	ld.shared.b32 	%r165, [%r110];
+	ld.shared.b32 	%r166, [%r113];
+	ld.shared.b32 	%r167, [%r116];
+	ld.shared.b32 	%r168, [%r119];
+	.loc	1 121 60                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:60
+	add.s32 	%r169, %r140, 4097;
+	add.s32 	%r170, %r140, 4129;
+	add.s32 	%r171, %r140, 4161;
+	add.s32 	%r172, %r140, 4193;
+	.loc	1 121 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:35
+	mad.wide.s32 	%rd46, %r169, 2, %rd82;
+	mad.wide.s32 	%rd48, %r170, 2, %rd82;
+	mad.wide.s32 	%rd50, %r171, 2, %rd82;
+	mad.wide.s32 	%rd52, %r172, 2, %rd82;
+	.loc	1 121 71                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:71
+	// begin inline asm
+	mov.u64 %rd47, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd47, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs18, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs18 }, [ %rd46 + 0 ], %rd47;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd49, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd49, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs19, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd48 + 0 ], %rd49;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd51, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd51, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs20, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs20 }, [ %rd50 + 0 ], %rd51;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd53, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd53, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs21, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs21 }, [ %rd52 + 0 ], %rd53;
+	// end inline asm
+	.loc	1 123 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:123:24
+	div.full.f32 	%r173, %r71, %r131;
+	.loc	1 124 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:124:24
+	add.f32 	%r174, %r173, 0f358637BD;
+	.loc	1 125 32                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:125:32
+	rsqrt.approx.ftz.f32 	%r175, %r174;
+	.loc	1 121 132                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:132
+	cvt.f32.bf16 	%r176, %rs21;
+	cvt.f32.bf16 	%r177, %rs20;
+	cvt.f32.bf16 	%r178, %rs19;
+	cvt.f32.bf16 	%r179, %rs18;
+	.loc	1 72 129                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129
+	cvt.f32.bf16 	%r180, %rs5;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	mul.f32 	%r181, %r139, %r180;
+	.loc	1 80 146                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146
+	cvt.f32.bf16 	%r182, %rs9;
+	.loc	1 84 17                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17
+	neg.f32 	%r183, %r181;
+	fma.rn.f32 	%r184, %r183, %r182, 0f00000000;
+	.loc	1 90 125                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125
+	cvt.f32.bf16 	%r185, %rs13;
+	.loc	1 97 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24
+	mul.f32 	%r186, %r139, %r185;
+	.loc	1 98 142                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142
+	cvt.f32.bf16 	%r187, %rs17;
+	.loc	1 100 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24
+	mul.f32 	%r188, %r186, %r187;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r189, %r188, %r184, %p3;
+	.loc	1 119 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24
+	fma.rn.f32 	%r190, %r124, %r189, %r168;
+	.loc	1 72 129                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129
+	cvt.f32.bf16 	%r191, %rs4;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	mul.f32 	%r192, %r139, %r191;
+	.loc	1 80 146                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146
+	cvt.f32.bf16 	%r193, %rs8;
+	.loc	1 84 17                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17
+	neg.f32 	%r194, %r192;
+	fma.rn.f32 	%r195, %r194, %r193, 0f00000000;
+	.loc	1 90 125                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125
+	cvt.f32.bf16 	%r196, %rs12;
+	.loc	1 97 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24
+	mul.f32 	%r197, %r139, %r196;
+	.loc	1 98 142                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142
+	cvt.f32.bf16 	%r198, %rs16;
+	.loc	1 100 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24
+	mul.f32 	%r199, %r197, %r198;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r200, %r199, %r195, %p3;
+	.loc	1 119 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24
+	fma.rn.f32 	%r201, %r123, %r200, %r167;
+	.loc	1 72 129                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129
+	cvt.f32.bf16 	%r202, %rs3;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	mul.f32 	%r203, %r139, %r202;
+	.loc	1 80 146                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146
+	cvt.f32.bf16 	%r204, %rs7;
+	.loc	1 84 17                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17
+	neg.f32 	%r205, %r203;
+	fma.rn.f32 	%r206, %r205, %r204, 0f00000000;
+	.loc	1 90 125                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125
+	cvt.f32.bf16 	%r207, %rs11;
+	.loc	1 97 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24
+	mul.f32 	%r208, %r139, %r207;
+	.loc	1 98 142                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142
+	cvt.f32.bf16 	%r209, %rs15;
+	.loc	1 100 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24
+	mul.f32 	%r210, %r208, %r209;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r211, %r210, %r206, %p3;
+	.loc	1 119 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24
+	fma.rn.f32 	%r212, %r122, %r211, %r166;
+	.loc	1 72 129                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129
+	cvt.f32.bf16 	%r213, %rs1;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	mul.f32 	%r214, %r139, %r213;
+	.loc	1 80 146                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146
+	cvt.f32.bf16 	%r215, %rs6;
+	.loc	1 84 17                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17
+	neg.f32 	%r216, %r214;
+	fma.rn.f32 	%r217, %r216, %r215, 0f00000000;
+	.loc	1 90 125                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125
+	cvt.f32.bf16 	%r218, %rs10;
+	.loc	1 97 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24
+	mul.f32 	%r219, %r139, %r218;
+	.loc	1 98 142                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142
+	cvt.f32.bf16 	%r220, %rs14;
+	.loc	1 100 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24
+	mul.f32 	%r221, %r219, %r220;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r222, %r221, %r217, %p3;
+	.loc	1 119 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24
+	fma.rn.f32 	%r223, %r121, %r222, %r165;
+	.loc	1 66 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:43
+	mov.b32 	{%rs44, %rs45}, %r17;
+	.loc	1 66 96                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:96
+	cvt.f32.bf16 	%r224, %rs45;
+	cvt.f32.bf16 	%r225, %rs44;
+	.loc	1 65 69                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69
+	mov.b32 	{%rs46, %rs47}, %r16;
+	.loc	1 65 123                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:123
+	cvt.f32.bf16 	%r226, %rs47;
+	cvt.f32.bf16 	%r227, %rs46;
+	.loc	1 65 69                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69
+	mov.b32 	{%rs48, %rs49}, %r15;
+	.loc	1 65 123                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:123
+	cvt.f32.bf16 	%r228, %rs49;
+	cvt.f32.bf16 	%r229, %rs48;
+	.loc	1 126 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24
+	bar.sync 	0;
+	st.shared.b32 	[%r136], %r175;
+	bar.sync 	0;
+	ld.shared.b32 	%r230, [%r138];
+	mul.f32 	%r231, %r230, %r179;
+	mul.f32 	%r232, %r230, %r178;
+	mul.f32 	%r233, %r230, %r177;
+	mul.f32 	%r234, %r230, %r176;
+	.loc	1 127 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:35
+	add.s64 	%rd70, %rd86, %rd94;
+	add.s64 	%rd54, %rd70, 2;
+	add.s64 	%rd56, %rd70, 66;
+	add.s64 	%rd58, %rd70, 130;
+	add.s64 	%rd60, %rd70, 194;
+	.loc	1 127 85                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:85
+	// begin inline asm
+	mov.u64 %rd55, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd55, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs22, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs22 }, [ %rd54 + 0 ], %rd55;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd57, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd57, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs23, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs23 }, [ %rd56 + 0 ], %rd57;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd59, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd59, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs24, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs24 }, [ %rd58 + 0 ], %rd59;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd61, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd61, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs25, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs25 }, [ %rd60 + 0 ], %rd61;
+	// end inline asm
+	.loc	1 127 146                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:146
+	cvt.f32.bf16 	%r235, %rs22;
+	cvt.f32.bf16 	%r236, %rs23;
+	cvt.f32.bf16 	%r237, %rs24;
+	cvt.f32.bf16 	%r238, %rs25;
+	.loc	1 131 17                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:131:17
+	neg.f32 	%r239, %r231;
+	fma.rn.f32 	%r240, %r239, %r235, 0f00000000;
+	neg.f32 	%r241, %r232;
+	fma.rn.f32 	%r242, %r241, %r236, 0f00000000;
+	neg.f32 	%r243, %r233;
+	fma.rn.f32 	%r244, %r243, %r237, 0f00000000;
+	neg.f32 	%r245, %r234;
+	fma.rn.f32 	%r246, %r245, %r238, 0f00000000;
+	.loc	1 134 60                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:60
+	add.s32 	%r247, %r140, 4096;
+	add.s32 	%r248, %r140, 4128;
+	add.s32 	%r249, %r140, 4160;
+	add.s32 	%r250, %r140, 4192;
+	.loc	1 134 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:35
+	mad.wide.s32 	%rd62, %r247, 2, %rd82;
+	mad.wide.s32 	%rd64, %r248, 2, %rd82;
+	mad.wide.s32 	%rd66, %r249, 2, %rd82;
+	mad.wide.s32 	%rd68, %r250, 2, %rd82;
+	.loc	1 134 71                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:71
+	// begin inline asm
+	mov.u64 %rd63, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd63, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs26, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs26 }, [ %rd62 + 0 ], %rd63;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd65, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd65, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs27, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs27 }, [ %rd64 + 0 ], %rd65;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd67, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd67, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs28, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs28 }, [ %rd66 + 0 ], %rd67;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd69, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd69, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs29, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs29 }, [ %rd68 + 0 ], %rd69;
+	// end inline asm
+	.loc	1 134 132                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:132
+	cvt.f32.bf16 	%r251, %rs26;
+	cvt.f32.bf16 	%r252, %rs27;
+	cvt.f32.bf16 	%r253, %rs28;
+	cvt.f32.bf16 	%r254, %rs29;
+	.loc	1 139 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:139:24
+	mul.f32 	%r255, %r230, %r251;
+	mul.f32 	%r256, %r230, %r252;
+	mul.f32 	%r257, %r230, %r253;
+	mul.f32 	%r258, %r230, %r254;
+	.loc	1 140 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:35
+	add.s64 	%rd72, %rd70, 64;
+	add.s64 	%rd74, %rd70, 128;
+	add.s64 	%rd76, %rd70, 192;
+	.loc	1 140 81                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:81
+	// begin inline asm
+	mov.u64 %rd71, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd71, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs30, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs30 }, [ %rd70 + 0 ], %rd71;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd73, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd73, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs31, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs31 }, [ %rd72 + 0 ], %rd73;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd75, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd75, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs32, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs32 }, [ %rd74 + 0 ], %rd75;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd77, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd77, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs33, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs33 }, [ %rd76 + 0 ], %rd77;
+	// end inline asm
+	.loc	1 140 142                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:142
+	cvt.f32.bf16 	%r259, %rs30;
+	cvt.f32.bf16 	%r260, %rs31;
+	cvt.f32.bf16 	%r261, %rs32;
+	cvt.f32.bf16 	%r262, %rs33;
+	.loc	1 142 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:142:24
+	mul.f32 	%r263, %r255, %r259;
+	mul.f32 	%r264, %r256, %r260;
+	mul.f32 	%r265, %r257, %r261;
+	mul.f32 	%r266, %r258, %r262;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r267, %r263, %r240, %p3;
+	selp.f32 	%r268, %r264, %r242, %p3;
+	selp.f32 	%r269, %r265, %r244, %p3;
+	selp.f32 	%r270, %r266, %r246, %p3;
+	.loc	1 151 25                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:151:25
+	mul.f32 	%r271, %r175, %r229;
+	mul.f32 	%r272, %r175, %r228;
+	mul.f32 	%r273, %r175, %r227;
+	mul.f32 	%r274, %r175, %r226;
+	bar.sync 	0;
+	st.shared.b32 	[%r97], %r271;
+	st.shared.b32 	[%r99+256], %r272;
+	st.shared.b32 	[%r101+512], %r273;
+	st.shared.b32 	[%r103+768], %r274;
+	bar.sync 	0;
+	ld.shared.b32 	%r275, [%r110];
+	ld.shared.b32 	%r276, [%r113];
+	ld.shared.b32 	%r277, [%r116];
+	ld.shared.b32 	%r278, [%r119];
+	.loc	1 156 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26
+	bar.sync 	0;
+	shl.b32 	%r279, %r24, 3;
+	and.b32 	%r280, %r279, 120;
+	shr.u32 	%r281, %r24, 2;
+	and.b32 	%r282, %r281, 4;
+	shl.b32 	%r283, %r25, 2;
+	or.b32 	%r284, %r282, %r283;
+	or.b32 	%r285, %r284, %r280;
+	add.s32 	%r286, %r96, %r285;
+	st.shared.b32 	[%r286], %r225;
+	xor.b32 	%r287, %r285, 64;
+	add.s32 	%r288, %r96, %r287;
+	st.shared.b32 	[%r288+256], %r224;
+	bar.sync 	0;
+	and.b32 	%r289, %r34, 120;
+	bfe.s32 	%r290, %r24, 1, 1;
+	and.b32 	%r291, %r290, 320;
+	xor.b32 	%r292, %r291, %r289;
+	add.s32 	%r293, %r96, %r292;
+	ld.shared.v2.b32 	{%r294, %r295}, [%r293];
+	ld.shared.v2.b32 	{%r296, %r297}, [%r293+128];
+	.loc	1 153 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:153:26
+	mul.f32 	%r298, %r275, %r294;
+	mul.f32 	%r299, %r276, %r295;
+	mul.f32 	%r300, %r277, %r296;
+	mul.f32 	%r301, %r278, %r297;
+	.loc	1 156 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26
+	mul.f32 	%r302, %r111, %r298;
+	mul.f32 	%r303, %r114, %r299;
+	mul.f32 	%r304, %r117, %r300;
+	mul.f32 	%r305, %r120, %r301;
+	.loc	1 159 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:159:26
+	fma.rn.f32 	%r306, %r121, %r267, %r302;
+	fma.rn.f32 	%r307, %r122, %r268, %r303;
+	fma.rn.f32 	%r308, %r123, %r269, %r304;
+	fma.rn.f32 	%r309, %r124, %r270, %r305;
+	.loc	1 161 43                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:43
+	shl.b32 	%r310, %r30, 7;
+	.loc	1 161 39                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:39
+	or.b32 	%r311, %r310, %r33;
+	.loc	1 161 32                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:32
+	mul.wide.s32 	%rd95, %r311, 2;
+	add.s64 	%rd78, %rd80, %rd95;
+	.loc	1 161 55                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:55
+	cvt.rn.bf16.f32 	%rs50, %r223;
+	cvt.rn.bf16.f32 	%rs51, %r212;
+	cvt.rn.bf16.f32 	%rs52, %r201;
+	cvt.rn.bf16.f32 	%rs53, %r190;
+	bar.sync 	0;
+	and.b32 	%r312, %r24, 3;
+	and.b32 	%r313, %r24, 28;
+	shr.u32 	%r314, %r25, 4;
+	mul.lo.s32 	%r315, %r312, 160;
+	or.b32 	%r316, %r315, %r314;
+	or.b32 	%r317, %r316, %r313;
+	add.s32 	%r318, %r96, %r317;
+	st.shared.b16 	[%r318], %rs50;
+	xor.b32 	%r319, %r317, 32;
+	add.s32 	%r320, %r96, %r319;
+	st.shared.b16 	[%r320], %rs51;
+	xor.b32 	%r321, %r317, 64;
+	add.s32 	%r322, %r96, %r321;
+	st.shared.b16 	[%r322], %rs52;
+	xor.b32 	%r323, %r317, 96;
+	add.s32 	%r324, %r96, %r323;
+	st.shared.b16 	[%r324], %rs53;
+	bar.sync 	0;
+	shl.b32 	%r325, %r312, 3;
+	shl.b32 	%r326, %r91, 2;
+	and.b32 	%r327, %r37, 2;
+	and.b32 	%r328, %r26, 160;
+	or.b32 	%r329, %r325, %r326;
+	xor.b32 	%r330, %r329, %r328;
+	or.b32 	%r331, %r330, %r327;
+	add.s32 	%r332, %r96, %r331;
+	ld.shared.b16 	%rs54, [%r332];
+	ld.shared.b16 	%rs55, [%r332+4];
+	xor.b32 	%r333, %r331, 64;
+	add.s32 	%r334, %r96, %r333;
+	ld.shared.b16 	%rs56, [%r334+256];
+	ld.shared.b16 	%rs57, [%r334+260];
+	mov.b32 	%r18, {%rs54, %rs56};
+	mov.b32 	%r19, {%rs55, %rs57};
+	// begin inline asm
+	@%p1 st.global.v2.b32 [ %rd78 + 0 ], { %r18, %r19 };
+	// end inline asm
+	.loc	1 162 32                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:32
+	add.s64 	%rd79, %rd81, %rd95;
+	.loc	1 162 56                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:56
+	cvt.rn.bf16.f32 	%rs58, %r306;
+	cvt.rn.bf16.f32 	%rs59, %r307;
+	cvt.rn.bf16.f32 	%rs60, %r308;
+	cvt.rn.bf16.f32 	%rs61, %r309;
+	bar.sync 	0;
+	st.shared.b16 	[%r318], %rs58;
+	st.shared.b16 	[%r320], %rs59;
+	st.shared.b16 	[%r322], %rs60;
+	st.shared.b16 	[%r324], %rs61;
+	bar.sync 	0;
+	ld.shared.b16 	%rs62, [%r332];
+	ld.shared.b16 	%rs63, [%r332+4];
+	ld.shared.b16 	%rs64, [%r334+256];
+	ld.shared.b16 	%rs65, [%r334+260];
+	mov.b32 	%r20, {%rs62, %rs64};
+	mov.b32 	%r21, {%rs63, %rs65};
+	// begin inline asm
+	@%p1 st.global.v2.b32 [ %rd79 + 0 ], { %r20, %r21 };
+	// end inline asm
+	.loc	1 53 4                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:4
+	ret;
+$L__tmp24:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 5                                   // DW_FORM_data2
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 456                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x1c1 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 98
+.b8 118
+.b8 113
+.b8 104
+.b8 106
+.b8 116
+.b8 121
+.b8 103
+.b8 55
+.b8 102
+.b8 118
+.b8 120
+.b8 122
+.b8 119
+.b8 116
+.b8 98
+.b8 116
+.b8 116
+.b8 52
+.b8 118
+.b8 114
+.b8 100
+.b8 107
+.b8 98
+.b8 110
+.b8 98
+.b8 54
+.b8 110
+.b8 51
+.b8 50
+.b8 102
+.b8 110
+.b8 114
+.b8 105
+.b8 106
+.b8 106
+.b8 112
+.b8 108
+.b8 51
+.b8 118
+.b8 118
+.b8 52
+.b8 99
+.b8 102
+.b8 113
+.b8 100
+.b8 52
+.b8 109
+.b8 122
+.b8 110
+.b8 114
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 98
+.b8 118
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x6d DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 114
+.b8 109
+.b8 115
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 95
+.b8 116
+.b8 111
+.b8 95
+.b8 99
+.b8 111
+.b8 112
+.b8 121
+.b8 95
+.b8 97
+.b8 100
+.b8 100
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 110
+.b8 101
+.b8 103
+.b8 95
+.b8 115
+.b8 112
+.b8 108
+.b8 105
+.b8 116
+.b8 95
+.b8 115
+.b8 112
+.b8 108
+.b8 105
+.b8 116
+.b8 95
+.b8 119
+.b8 105
+.b8 116
+.b8 104
+.b8 95
+.b8 115
+.b8 105
+.b8 122
+.b8 101
+.b8 115
+.b8 95
+.b8 115
+.b8 116
+.b8 97
+.b8 99
+.b8 107
+.b8 95
+.b8 117
+.b8 110
+.b8 98
+.b8 105
+.b8 110
+.b8 100
+.b8 95
+.b8 117
+.b8 110
+.b8 115
+.b8 113
+.b8 117
+.b8 101
+.b8 101
+.b8 122
+.b8 101
+.b8 95
+.b8 118
+.b8 105
+.b8 101
+.b8 119
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x151:0x7a DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x166:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp12                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 51                                  // DW_AT_call_line
+.b8 25                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x17e:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp12                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 4                                   // Abbrev [4] 0x198:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp12                          // DW_AT_low_pc
+.b64 $L__tmp23                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 52                                  // DW_AT_call_line
+.b8 27                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x1b0:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp12                          // DW_AT_low_pc
+.b64 $L__tmp23                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..1e72ed51c4e737348c551db3fbc792111a227cb0
--- /dev/null
+++ b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source
@@ -0,0 +1,972 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc213 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0)
+#loc215 = loc(unknown)
+#loc218 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0)
+#loc222 = loc("in_out_ptr0"(#loc))
+#loc223 = loc("in_out_ptr1"(#loc))
+#loc224 = loc("in_ptr0"(#loc))
+#loc225 = loc("in_ptr1"(#loc))
+#loc226 = loc("in_ptr2"(#loc))
+#loc227 = loc("in_ptr3"(#loc))
+#loc228 = loc("in_ptr4"(#loc))
+#loc229 = loc("xnumel"(#loc))
+#loc230 = loc("r0_numel"(#loc))
+#loc432 = loc("input"(#loc213))
+#loc433 = loc("a"(#loc218))
+#loc434 = loc("b"(#loc218))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 73728 : i32 loc(#loc231)
+    %r0_numel_1 = arith.constant 128 : i32 loc(#loc232)
+    %xoffset = tt.get_program_id x : i32 loc(#loc233)
+    %xoffset_2 = arith.constant 2 : i32 loc(#loc234)
+    %xoffset_3 = arith.constant 2 : i32 loc(#loc234)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc234)
+    %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc235)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32> -> tensor<2x1xi32> loc(#loc236)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<2x1xi32> loc(#loc237)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<2x1xi32> loc(#loc237)
+    %xmask = arith.constant true loc(#loc238)
+    %xmask_8 = arith.constant dense<true> : tensor<2x128xi1> loc(#loc238)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc239)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc240)
+    %x0 = arith.constant 32 : i32 loc(#loc241)
+    %x0_10 = arith.constant 32 : i32 loc(#loc241)
+    %x0_11 = arith.constant dense<32> : tensor<2x1xi32> loc(#loc241)
+    %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<2x1xi32> loc(#loc241)
+    %x1 = arith.constant 32 : i32 loc(#loc242)
+    %x1_13 = arith.constant 32 : i32 loc(#loc242)
+    %x1_14 = arith.constant dense<32> : tensor<2x1xi32> loc(#loc242)
+    %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<2x1xi32> loc(#loc242)
+    %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc243)
+    %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc243)
+    %_tmp10 = arith.constant 0.000000e+00 : f32 loc(#loc244)
+    %_tmp10_17 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc244)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc15)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc15)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15)
+    %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc15)
+    %3 = ub.poison : i32 loc(#loc15)
+    %_tmp10_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_16, %_tmp10_24 = %_tmp10_17) -> (tensor<2x128xf32>, tensor<2x128xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc246)
+      %r0_index_25 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc246)
+      %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc247)
+      %r0_mask_26 = arith.cmpi slt, %r0_index_25, %r0_mask : tensor<1x128xi32> loc(#loc247)
+      %tmp0 = arith.constant 4096 : i32 loc(#loc248)
+      %tmp0_27 = arith.constant 4096 : i32 loc(#loc248)
+      %tmp0_28 = arith.constant dense<4096> : tensor<1x128xi32> loc(#loc248)
+      %tmp0_29 = arith.addi %tmp0_28, %r0_index_25 : tensor<1x128xi32> loc(#loc248)
+      %tmp0_30 = arith.constant 128 : i32 loc(#loc249)
+      %tmp0_31 = arith.constant 128 : i32 loc(#loc249)
+      %tmp0_32 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc249)
+      %tmp0_33 = arith.muli %tmp0_32, %x0_12 : tensor<2x1xi32> loc(#loc249)
+      %tmp0_34 = tt.broadcast %tmp0_29 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc250)
+      %tmp0_35 = tt.broadcast %tmp0_33 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc250)
+      %tmp0_36 = arith.addi %tmp0_34, %tmp0_35 : tensor<2x128xi32> loc(#loc250)
+      %tmp0_37 = arith.constant 36864 : i32 loc(#loc251)
+      %tmp0_38 = arith.constant 36864 : i32 loc(#loc251)
+      %tmp0_39 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc251)
+      %tmp0_40 = arith.muli %tmp0_39, %x1_15 : tensor<2x1xi32> loc(#loc251)
+      %tmp0_41 = tt.broadcast %tmp0_40 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc252)
+      %tmp0_42 = arith.addi %tmp0_36, %tmp0_41 : tensor<2x128xi32> loc(#loc252)
+      %tmp0_43 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>> loc(#loc253)
+      %tmp0_44 = tt.addptr %tmp0_43, %tmp0_42 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc253)
+      %tmp0_45 = arith.constant 0.000000e+00 : f32 loc(#loc254)
+      %tmp0_46 = tt.broadcast %r0_mask_26 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc254)
+      %tmp0_47 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc254)
+      %tmp0_48 = arith.truncf %tmp0_47 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc254)
+      %tmp0_49 = tt.load %tmp0_44, %tmp0_46, %tmp0_48 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>> loc(#loc254)
+      %tmp0_50 = arith.extf %tmp0_49 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc255)
+      %tmp6 = arith.constant 128 : i32 loc(#loc256)
+      %tmp6_51 = arith.constant 128 : i32 loc(#loc256)
+      %tmp6_52 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc256)
+      %tmp6_53 = arith.muli %tmp6_52, %x0_12 : tensor<2x1xi32> loc(#loc256)
+      %tmp6_54 = tt.broadcast %r0_index_25 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc257)
+      %tmp6_55 = tt.broadcast %tmp6_53 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc257)
+      %tmp6_56 = arith.addi %tmp6_54, %tmp6_55 : tensor<2x128xi32> loc(#loc257)
+      %tmp6_57 = arith.constant 36864 : i32 loc(#loc258)
+      %tmp6_58 = arith.constant 36864 : i32 loc(#loc258)
+      %tmp6_59 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc258)
+      %tmp6_60 = arith.muli %tmp6_59, %x1_15 : tensor<2x1xi32> loc(#loc258)
+      %tmp6_61 = tt.broadcast %tmp6_60 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc259)
+      %tmp6_62 = arith.addi %tmp6_56, %tmp6_61 : tensor<2x128xi32> loc(#loc259)
+      %tmp6_63 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>> loc(#loc260)
+      %tmp6_64 = tt.addptr %tmp6_63, %tmp6_62 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc260)
+      %tmp6_65 = arith.constant 0.000000e+00 : f32 loc(#loc261)
+      %tmp6_66 = tt.broadcast %r0_mask_26 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc261)
+      %tmp6_67 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc261)
+      %tmp6_68 = arith.truncf %tmp6_67 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc261)
+      %tmp6_69 = tt.load %tmp6_64, %tmp6_66, %tmp6_68 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>> loc(#loc261)
+      %tmp6_70 = arith.extf %tmp6_69 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc262)
+      %tmp2 = arith.mulf %tmp0_50, %tmp0_50 : tensor<2x128xf32> loc(#loc263)
+      %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<2x128xf32> loc(#loc264)
+      %_tmp4_71 = tt.broadcast %r0_mask_26 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc265)
+      %_tmp4_72 = arith.select %_tmp4_71, %tmp5, %_tmp4_23 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc265)
+      %tmp8 = arith.mulf %tmp6_70, %tmp6_70 : tensor<2x128xf32> loc(#loc266)
+      %tmp11 = arith.addf %_tmp10_24, %tmp8 : tensor<2x128xf32> loc(#loc267)
+      %_tmp10_73 = tt.broadcast %r0_mask_26 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc268)
+      %_tmp10_74 = arith.select %_tmp10_73, %tmp11, %_tmp10_24 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc268)
+      scf.yield %_tmp4_72, %_tmp10_74 : tensor<2x128xf32>, tensor<2x128xf32> loc(#loc39)
+    } loc(#loc435)
+    %tmp4 = tt.call @"triton.language.standard.sum__fp32S2_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#0) : (tensor<2x128xf32>) -> tensor<2xf32> loc(#loc269)
+    %tmp4_19 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<2xf32> -> tensor<2x1xf32> loc(#loc270)
+    %tmp10 = tt.call @"triton.language.standard.sum__fp32S2_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#1) : (tensor<2x128xf32>) -> tensor<2xf32> loc(#loc271)
+    %tmp10_20 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<2xf32> -> tensor<2x1xf32> loc(#loc272)
+    %c0_i32_21 = arith.constant 0 : i32 loc(#loc44)
+    %c128_i32_22 = arith.constant 128 : i32 loc(#loc44)
+    %4 = arith.bitcast %c0_i32_21 : i32 to i32 loc(#loc44)
+    %5 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc44)
+    %6 = arith.bitcast %c128_i32_22 : i32 to i32 loc(#loc44)
+    %7 = ub.poison : i32 loc(#loc44)
+    scf.for %r0_offset = %4 to %5 step %6  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc273)
+      %r0_index_23 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc273)
+      %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc274)
+      %r0_mask_24 = arith.cmpi slt, %r0_index_23, %r0_mask : tensor<1x128xi32> loc(#loc274)
+      %r0_3 = arith.constant 2 : i32 loc(#loc275)
+      %r0_3_25 = arith.constant 2 : i32 loc(#loc275)
+      %r0_3_26 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc275)
+      %r0_3_27 = arith.remsi %r0_index_23, %r0_3_26 : tensor<1x128xi32> loc(#loc275)
+      %r0_4 = arith.constant 2 : i32 loc(#loc276)
+      %r0_4_28 = arith.constant 2 : i32 loc(#loc276)
+      %r0_4_29 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc276)
+      %r0_4_30 = arith.divsi %r0_index_23, %r0_4_29 : tensor<1x128xi32> loc(#loc276)
+      %tmp50 = arith.constant 128 : i32 loc(#loc277)
+      %tmp50_31 = arith.constant 128 : i32 loc(#loc277)
+      %tmp50_32 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc277)
+      %tmp50_33 = arith.muli %tmp50_32, %x0_12 : tensor<2x1xi32> loc(#loc277)
+      %tmp50_34 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc278)
+      %tmp50_35 = tt.broadcast %tmp50_33 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc278)
+      %tmp50_36 = arith.addi %tmp50_34, %tmp50_35 : tensor<2x128xi32> loc(#loc278)
+      %tmp50_37 = arith.constant 36864 : i32 loc(#loc279)
+      %tmp50_38 = arith.constant 36864 : i32 loc(#loc279)
+      %tmp50_39 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc279)
+      %tmp50_40 = arith.muli %tmp50_39, %x1_15 : tensor<2x1xi32> loc(#loc279)
+      %tmp50_41 = tt.broadcast %tmp50_40 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc280)
+      %tmp50_42 = arith.addi %tmp50_36, %tmp50_41 : tensor<2x128xi32> loc(#loc280)
+      %tmp50_43 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>> loc(#loc281)
+      %tmp50_44 = tt.addptr %tmp50_43, %tmp50_42 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc281)
+      %tmp50_45 = arith.constant 0.000000e+00 : f32 loc(#loc282)
+      %tmp50_46 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc282)
+      %tmp50_47 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc282)
+      %tmp50_48 = arith.truncf %tmp50_47 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc282)
+      %tmp50_49 = tt.load %tmp50_44, %tmp50_46, %tmp50_48 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>> loc(#loc282)
+      %tmp50_50 = arith.extf %tmp50_49 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc283)
+      %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>> loc(#loc284)
+      %tmp58_51 = tt.addptr %tmp58, %r0_index_23 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc284)
+      %tmp58_52 = arith.constant 0.000000e+00 : f32 loc(#loc285)
+      %tmp58_53 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> loc(#loc285)
+      %tmp58_54 = arith.truncf %tmp58_53 : tensor<1x128xf32> to tensor<1x128xbf16> loc(#loc285)
+      %tmp58_55 = tt.load %tmp58_51, %r0_mask_24, %tmp58_54 evictionPolicy = evict_last : tensor<1x128x!tt.ptr<bf16>> loc(#loc285)
+      %tmp58_56 = arith.extf %tmp58_55 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc286)
+      %tmp63 = arith.constant 128 : i32 loc(#loc287)
+      %tmp63_57 = arith.constant 128 : i32 loc(#loc287)
+      %tmp63_58 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc287)
+      %tmp63_59 = arith.muli %tmp63_58, %x1_15 : tensor<2x1xi32> loc(#loc287)
+      %tmp63_60 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc288)
+      %tmp63_61 = tt.broadcast %tmp63_59 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc288)
+      %tmp63_62 = arith.addi %tmp63_60, %tmp63_61 : tensor<2x128xi32> loc(#loc288)
+      %tmp63_63 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<2x128x!tt.ptr<f32>> loc(#loc289)
+      %tmp63_64 = tt.addptr %tmp63_63, %tmp63_62 : tensor<2x128x!tt.ptr<f32>>, tensor<2x128xi32> loc(#loc289)
+      %tmp63_65 = arith.constant 0.000000e+00 : f32 loc(#loc290)
+      %tmp63_66 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc290)
+      %tmp63_67 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc290)
+      %tmp63_68 = tt.load %tmp63_64, %tmp63_66, %tmp63_67 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<f32>> loc(#loc290)
+      %tmp66 = arith.constant 128 : i32 loc(#loc291)
+      %tmp66_69 = arith.constant 128 : i32 loc(#loc291)
+      %tmp66_70 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc291)
+      %tmp66_71 = arith.muli %tmp66_70, %x1_15 : tensor<2x1xi32> loc(#loc291)
+      %tmp66_72 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc292)
+      %tmp66_73 = tt.broadcast %tmp66_71 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc292)
+      %tmp66_74 = arith.addi %tmp66_72, %tmp66_73 : tensor<2x128xi32> loc(#loc292)
+      %tmp66_75 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<2x128x!tt.ptr<f32>> loc(#loc293)
+      %tmp66_76 = tt.addptr %tmp66_75, %tmp66_74 : tensor<2x128x!tt.ptr<f32>>, tensor<2x128xi32> loc(#loc293)
+      %tmp66_77 = arith.constant 0.000000e+00 : f32 loc(#loc294)
+      %tmp66_78 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc294)
+      %tmp66_79 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc294)
+      %tmp66_80 = tt.load %tmp66_76, %tmp66_78, %tmp66_79 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<f32>> loc(#loc294)
+      %tmp96 = arith.constant 4096 : i32 loc(#loc295)
+      %tmp96_81 = arith.constant 4096 : i32 loc(#loc295)
+      %tmp96_82 = arith.constant dense<4096> : tensor<1x128xi32> loc(#loc295)
+      %tmp96_83 = arith.addi %tmp96_82, %r0_index_23 : tensor<1x128xi32> loc(#loc295)
+      %tmp96_84 = arith.constant 128 : i32 loc(#loc296)
+      %tmp96_85 = arith.constant 128 : i32 loc(#loc296)
+      %tmp96_86 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc296)
+      %tmp96_87 = arith.muli %tmp96_86, %x0_12 : tensor<2x1xi32> loc(#loc296)
+      %tmp96_88 = tt.broadcast %tmp96_83 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc297)
+      %tmp96_89 = tt.broadcast %tmp96_87 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc297)
+      %tmp96_90 = arith.addi %tmp96_88, %tmp96_89 : tensor<2x128xi32> loc(#loc297)
+      %tmp96_91 = arith.constant 36864 : i32 loc(#loc298)
+      %tmp96_92 = arith.constant 36864 : i32 loc(#loc298)
+      %tmp96_93 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc298)
+      %tmp96_94 = arith.muli %tmp96_93, %x1_15 : tensor<2x1xi32> loc(#loc298)
+      %tmp96_95 = tt.broadcast %tmp96_94 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc299)
+      %tmp96_96 = arith.addi %tmp96_90, %tmp96_95 : tensor<2x128xi32> loc(#loc299)
+      %tmp96_97 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>> loc(#loc300)
+      %tmp96_98 = tt.addptr %tmp96_97, %tmp96_96 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc300)
+      %tmp96_99 = arith.constant 0.000000e+00 : f32 loc(#loc301)
+      %tmp96_100 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc301)
+      %tmp96_101 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc301)
+      %tmp96_102 = arith.truncf %tmp96_101 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc301)
+      %tmp96_103 = tt.load %tmp96_98, %tmp96_100, %tmp96_102 evictionPolicy = evict_first : tensor<2x128x!tt.ptr<bf16>> loc(#loc301)
+      %tmp96_104 = arith.extf %tmp96_103 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc302)
+      %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>> loc(#loc303)
+      %tmp102_105 = tt.addptr %tmp102, %r0_index_23 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc303)
+      %tmp102_106 = arith.constant 0.000000e+00 : f32 loc(#loc304)
+      %tmp102_107 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> loc(#loc304)
+      %tmp102_108 = arith.truncf %tmp102_107 : tensor<1x128xf32> to tensor<1x128xbf16> loc(#loc304)
+      %tmp102_109 = tt.load %tmp102_105, %r0_mask_24, %tmp102_108 evictionPolicy = evict_last : tensor<1x128x!tt.ptr<bf16>> loc(#loc304)
+      %tmp102_110 = arith.extf %tmp102_109 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc305)
+      %tmp13 = arith.constant 0 : i64 loc(#loc306)
+      %tmp13_111 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc306)
+      %tmp14 = arith.extsi %r0_3_27 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc307)
+      %tmp14_112 = arith.constant dense<0> : tensor<1x128xi64> loc(#loc307)
+      %tmp14_113 = arith.cmpi sge, %tmp14, %tmp14_112 : tensor<1x128xi64> loc(#loc307)
+      %tmp15 = arith.constant 1 : i64 loc(#loc308)
+      %tmp15_114 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc308)
+      %tmp16 = arith.extsi %r0_3_27 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc309)
+      %tmp16_115 = arith.constant dense<1> : tensor<1x128xi64> loc(#loc309)
+      %tmp16_116 = arith.cmpi slt, %tmp16, %tmp16_115 : tensor<1x128xi64> loc(#loc309)
+      %tmp17 = arith.constant 2 : i32 loc(#loc310)
+      %tmp17_117 = arith.constant 2 : i32 loc(#loc310)
+      %tmp17_118 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc310)
+      %tmp17_119 = arith.muli %tmp17_118, %r0_4_30 : tensor<1x128xi32> loc(#loc310)
+      %tmp17_120 = arith.constant 1 : i32 loc(#loc311)
+      %tmp17_121 = arith.constant 1 : i32 loc(#loc311)
+      %tmp17_122 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc311)
+      %tmp17_123 = arith.addi %tmp17_122, %tmp17_119 : tensor<1x128xi32> loc(#loc311)
+      %tmp17_124 = arith.constant 128 : i32 loc(#loc312)
+      %tmp17_125 = arith.constant 128 : i32 loc(#loc312)
+      %tmp17_126 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc312)
+      %tmp17_127 = arith.muli %tmp17_126, %x0_12 : tensor<2x1xi32> loc(#loc312)
+      %tmp17_128 = tt.broadcast %tmp17_123 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc313)
+      %tmp17_129 = tt.broadcast %tmp17_127 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc313)
+      %tmp17_130 = arith.addi %tmp17_128, %tmp17_129 : tensor<2x128xi32> loc(#loc313)
+      %tmp17_131 = arith.constant 36864 : i32 loc(#loc314)
+      %tmp17_132 = arith.constant 36864 : i32 loc(#loc314)
+      %tmp17_133 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc314)
+      %tmp17_134 = arith.muli %tmp17_133, %x1_15 : tensor<2x1xi32> loc(#loc314)
+      %tmp17_135 = tt.broadcast %tmp17_134 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc315)
+      %tmp17_136 = arith.addi %tmp17_130, %tmp17_135 : tensor<2x128xi32> loc(#loc315)
+      %tmp17_137 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>> loc(#loc316)
+      %tmp17_138 = tt.addptr %tmp17_137, %tmp17_136 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc316)
+      %tmp17_139 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x128xi1> loc(#loc317)
+      %tmp17_140 = arith.constant 0.000000e+00 : f32 loc(#loc318)
+      %tmp17_141 = tt.broadcast %tmp17_139 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc318)
+      %tmp17_142 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc318)
+      %tmp17_143 = arith.truncf %tmp17_142 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc318)
+      %tmp17_144 = tt.load %tmp17_138, %tmp17_141, %tmp17_143 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>> loc(#loc318)
+      %tmp17_145 = arith.extf %tmp17_144 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc319)
+      %tmp19 = arith.constant 1.280000e+02 : f32 loc(#loc320)
+      %tmp20 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc321)
+      %tmp20_146 = arith.divf %tmp10_20, %tmp20 : tensor<2x1xf32> loc(#loc321)
+      %tmp21 = arith.constant 9.99999997E-7 : f32 loc(#loc322)
+      %tmp22 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc323)
+      %tmp22_147 = arith.addf %tmp20_146, %tmp22 : tensor<2x1xf32> loc(#loc323)
+      %tmp23 = tt.extern_elementwise %tmp22_147 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc324)
+      %tmp24 = tt.broadcast %tmp23 : tensor<2x1xf32> -> tensor<2x128xf32> loc(#loc325)
+      %tmp24_148 = arith.mulf %tmp17_145, %tmp24 : tensor<2x128xf32> loc(#loc325)
+      %tmp25 = arith.constant 2 : i32 loc(#loc326)
+      %tmp25_149 = arith.constant 2 : i32 loc(#loc326)
+      %tmp25_150 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc326)
+      %tmp25_151 = arith.muli %tmp25_150, %r0_4_30 : tensor<1x128xi32> loc(#loc326)
+      %tmp25_152 = arith.constant 1 : i32 loc(#loc327)
+      %tmp25_153 = arith.constant 1 : i32 loc(#loc327)
+      %tmp25_154 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc327)
+      %tmp25_155 = arith.addi %tmp25_154, %tmp25_151 : tensor<1x128xi32> loc(#loc327)
+      %tmp25_156 = tt.broadcast %tmp25_155 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc328)
+      %tmp25_157 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>> loc(#loc329)
+      %tmp25_158 = tt.addptr %tmp25_157, %tmp25_156 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc329)
+      %tmp25_159 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x128xi1> loc(#loc330)
+      %tmp25_160 = arith.constant 0.000000e+00 : f32 loc(#loc331)
+      %tmp25_161 = tt.broadcast %tmp25_159 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc331)
+      %tmp25_162 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc331)
+      %tmp25_163 = arith.truncf %tmp25_162 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc331)
+      %tmp25_164 = tt.load %tmp25_158, %tmp25_161, %tmp25_163 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>> loc(#loc331)
+      %tmp25_165 = arith.extf %tmp25_164 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc332)
+      %tmp27 = arith.mulf %tmp24_148, %tmp25_165 : tensor<2x128xf32> loc(#loc333)
+      %tmp29 = arith.constant 0.000000e+00 : f32 loc(#loc334)
+      %tmp29_166 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc334)
+      %tmp29_167 = arith.subf %tmp29_166, %tmp27 : tensor<2x128xf32> loc(#loc334)
+      %tmp30 = arith.constant 0.000000e+00 : f32 loc(#loc335)
+      %tmp30_168 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc335)
+      %tmp31 = tt.broadcast %tmp16_116 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc336)
+      %tmp31_169 = arith.select %tmp31, %tmp29_167, %tmp30_168 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc336)
+      %tmp32 = arith.extsi %r0_3_27 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc337)
+      %tmp32_170 = arith.constant dense<1> : tensor<1x128xi64> loc(#loc337)
+      %tmp32_171 = arith.cmpi sge, %tmp32, %tmp32_170 : tensor<1x128xi64> loc(#loc337)
+      %tmp33 = arith.constant 2 : i64 loc(#loc338)
+      %tmp33_172 = arith.constant dense<2> : tensor<1x1xi64> loc(#loc338)
+      %tmp34 = arith.extsi %r0_3_27 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc339)
+      %tmp34_173 = arith.constant dense<2> : tensor<1x128xi64> loc(#loc339)
+      %tmp34_174 = arith.cmpi slt, %tmp34, %tmp34_173 : tensor<1x128xi64> loc(#loc339)
+      %tmp35 = arith.constant 2 : i32 loc(#loc340)
+      %tmp35_175 = arith.constant 2 : i32 loc(#loc340)
+      %tmp35_176 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc340)
+      %tmp35_177 = arith.muli %tmp35_176, %r0_4_30 : tensor<1x128xi32> loc(#loc340)
+      %tmp35_178 = arith.constant 128 : i32 loc(#loc341)
+      %tmp35_179 = arith.constant 128 : i32 loc(#loc341)
+      %tmp35_180 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc341)
+      %tmp35_181 = arith.muli %tmp35_180, %x0_12 : tensor<2x1xi32> loc(#loc341)
+      %tmp35_182 = tt.broadcast %tmp35_177 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc342)
+      %tmp35_183 = tt.broadcast %tmp35_181 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc342)
+      %tmp35_184 = arith.addi %tmp35_182, %tmp35_183 : tensor<2x128xi32> loc(#loc342)
+      %tmp35_185 = arith.constant 36864 : i32 loc(#loc343)
+      %tmp35_186 = arith.constant 36864 : i32 loc(#loc343)
+      %tmp35_187 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc343)
+      %tmp35_188 = arith.muli %tmp35_187, %x1_15 : tensor<2x1xi32> loc(#loc343)
+      %tmp35_189 = tt.broadcast %tmp35_188 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc344)
+      %tmp35_190 = arith.addi %tmp35_184, %tmp35_189 : tensor<2x128xi32> loc(#loc344)
+      %tmp35_191 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>> loc(#loc345)
+      %tmp35_192 = tt.addptr %tmp35_191, %tmp35_190 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc345)
+      %tmp35_193 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x128xi1> loc(#loc346)
+      %tmp35_194 = arith.constant 0.000000e+00 : f32 loc(#loc347)
+      %tmp35_195 = tt.broadcast %tmp35_193 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc347)
+      %tmp35_196 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc347)
+      %tmp35_197 = arith.truncf %tmp35_196 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc347)
+      %tmp35_198 = tt.load %tmp35_192, %tmp35_195, %tmp35_197 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>> loc(#loc347)
+      %tmp35_199 = arith.extf %tmp35_198 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc348)
+      %tmp37 = arith.constant 1.280000e+02 : f32 loc(#loc349)
+      %tmp38 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc350)
+      %tmp38_200 = arith.divf %tmp10_20, %tmp38 : tensor<2x1xf32> loc(#loc350)
+      %tmp39 = arith.constant 9.99999997E-7 : f32 loc(#loc351)
+      %tmp40 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc352)
+      %tmp40_201 = arith.addf %tmp38_200, %tmp40 : tensor<2x1xf32> loc(#loc352)
+      %tmp41 = tt.extern_elementwise %tmp40_201 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc353)
+      %tmp42 = tt.broadcast %tmp41 : tensor<2x1xf32> -> tensor<2x128xf32> loc(#loc354)
+      %tmp42_202 = arith.mulf %tmp35_199, %tmp42 : tensor<2x128xf32> loc(#loc354)
+      %tmp43 = arith.constant 2 : i32 loc(#loc355)
+      %tmp43_203 = arith.constant 2 : i32 loc(#loc355)
+      %tmp43_204 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc355)
+      %tmp43_205 = arith.muli %tmp43_204, %r0_4_30 : tensor<1x128xi32> loc(#loc355)
+      %tmp43_206 = tt.broadcast %tmp43_205 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc356)
+      %tmp43_207 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>> loc(#loc357)
+      %tmp43_208 = tt.addptr %tmp43_207, %tmp43_206 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc357)
+      %tmp43_209 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x128xi1> loc(#loc358)
+      %tmp43_210 = arith.constant 0.000000e+00 : f32 loc(#loc359)
+      %tmp43_211 = tt.broadcast %tmp43_209 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc359)
+      %tmp43_212 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc359)
+      %tmp43_213 = arith.truncf %tmp43_212 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc359)
+      %tmp43_214 = tt.load %tmp43_208, %tmp43_211, %tmp43_213 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>> loc(#loc359)
+      %tmp43_215 = arith.extf %tmp43_214 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc360)
+      %tmp45 = arith.mulf %tmp42_202, %tmp43_215 : tensor<2x128xf32> loc(#loc361)
+      %tmp47 = arith.constant 0.000000e+00 : f32 loc(#loc362)
+      %tmp47_216 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc362)
+      %tmp48 = tt.broadcast %tmp32_171 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc363)
+      %tmp48_217 = arith.select %tmp48, %tmp45, %tmp47_216 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc363)
+      %tmp49 = tt.broadcast %tmp16_116 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc364)
+      %tmp49_218 = arith.select %tmp49, %tmp31_169, %tmp48_217 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc364)
+      %tmp52 = arith.constant 1.280000e+02 : f32 loc(#loc365)
+      %tmp53 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc366)
+      %tmp53_219 = arith.divf %tmp10_20, %tmp53 : tensor<2x1xf32> loc(#loc366)
+      %tmp54 = arith.constant 9.99999997E-7 : f32 loc(#loc367)
+      %tmp55 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc368)
+      %tmp55_220 = arith.addf %tmp53_219, %tmp55 : tensor<2x1xf32> loc(#loc368)
+      %tmp56 = tt.extern_elementwise %tmp55_220 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc369)
+      %tmp57 = tt.broadcast %tmp56 : tensor<2x1xf32> -> tensor<2x128xf32> loc(#loc370)
+      %tmp57_221 = arith.mulf %tmp50_50, %tmp57 : tensor<2x128xf32> loc(#loc370)
+      %tmp60 = tt.broadcast %tmp58_56 : tensor<1x128xf32> -> tensor<2x128xf32> loc(#loc371)
+      %tmp60_222 = arith.mulf %tmp57_221, %tmp60 : tensor<2x128xf32> loc(#loc371)
+      %tmp64 = arith.mulf %tmp60_222, %tmp63_68 : tensor<2x128xf32> loc(#loc372)
+      %tmp67 = arith.mulf %tmp49_218, %tmp66_80 : tensor<2x128xf32> loc(#loc373)
+      %tmp68 = arith.addf %tmp64, %tmp67 : tensor<2x128xf32> loc(#loc374)
+      %tmp70 = arith.constant 2 : i32 loc(#loc375)
+      %tmp70_223 = arith.constant 2 : i32 loc(#loc375)
+      %tmp70_224 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc375)
+      %tmp70_225 = arith.muli %tmp70_224, %r0_4_30 : tensor<1x128xi32> loc(#loc375)
+      %tmp70_226 = arith.constant 4097 : i32 loc(#loc376)
+      %tmp70_227 = arith.constant 4097 : i32 loc(#loc376)
+      %tmp70_228 = arith.constant dense<4097> : tensor<1x128xi32> loc(#loc376)
+      %tmp70_229 = arith.addi %tmp70_228, %tmp70_225 : tensor<1x128xi32> loc(#loc376)
+      %tmp70_230 = arith.constant 128 : i32 loc(#loc377)
+      %tmp70_231 = arith.constant 128 : i32 loc(#loc377)
+      %tmp70_232 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc377)
+      %tmp70_233 = arith.muli %tmp70_232, %x0_12 : tensor<2x1xi32> loc(#loc377)
+      %tmp70_234 = tt.broadcast %tmp70_229 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc378)
+      %tmp70_235 = tt.broadcast %tmp70_233 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc378)
+      %tmp70_236 = arith.addi %tmp70_234, %tmp70_235 : tensor<2x128xi32> loc(#loc378)
+      %tmp70_237 = arith.constant 36864 : i32 loc(#loc379)
+      %tmp70_238 = arith.constant 36864 : i32 loc(#loc379)
+      %tmp70_239 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc379)
+      %tmp70_240 = arith.muli %tmp70_239, %x1_15 : tensor<2x1xi32> loc(#loc379)
+      %tmp70_241 = tt.broadcast %tmp70_240 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc380)
+      %tmp70_242 = arith.addi %tmp70_236, %tmp70_241 : tensor<2x128xi32> loc(#loc380)
+      %tmp70_243 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>> loc(#loc381)
+      %tmp70_244 = tt.addptr %tmp70_243, %tmp70_242 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc381)
+      %tmp70_245 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x128xi1> loc(#loc382)
+      %tmp70_246 = arith.constant 0.000000e+00 : f32 loc(#loc383)
+      %tmp70_247 = tt.broadcast %tmp70_245 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc383)
+      %tmp70_248 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc383)
+      %tmp70_249 = arith.truncf %tmp70_248 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc383)
+      %tmp70_250 = tt.load %tmp70_244, %tmp70_247, %tmp70_249 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>> loc(#loc383)
+      %tmp70_251 = arith.extf %tmp70_250 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc384)
+      %tmp72 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc385)
+      %tmp72_252 = arith.divf %tmp4_19, %tmp72 : tensor<2x1xf32> loc(#loc385)
+      %tmp73 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc386)
+      %tmp73_253 = arith.addf %tmp72_252, %tmp73 : tensor<2x1xf32> loc(#loc386)
+      %tmp74 = tt.extern_elementwise %tmp73_253 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc387)
+      %tmp75 = tt.broadcast %tmp74 : tensor<2x1xf32> -> tensor<2x128xf32> loc(#loc388)
+      %tmp75_254 = arith.mulf %tmp70_251, %tmp75 : tensor<2x128xf32> loc(#loc388)
+      %tmp76 = arith.constant 2 : i32 loc(#loc389)
+      %tmp76_255 = arith.constant 2 : i32 loc(#loc389)
+      %tmp76_256 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc389)
+      %tmp76_257 = arith.muli %tmp76_256, %r0_4_30 : tensor<1x128xi32> loc(#loc389)
+      %tmp76_258 = arith.constant 1 : i32 loc(#loc390)
+      %tmp76_259 = arith.constant 1 : i32 loc(#loc390)
+      %tmp76_260 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc390)
+      %tmp76_261 = arith.addi %tmp76_260, %tmp76_257 : tensor<1x128xi32> loc(#loc390)
+      %tmp76_262 = tt.broadcast %tmp76_261 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc391)
+      %tmp76_263 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>> loc(#loc392)
+      %tmp76_264 = tt.addptr %tmp76_263, %tmp76_262 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc392)
+      %tmp76_265 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x128xi1> loc(#loc393)
+      %tmp76_266 = arith.constant 0.000000e+00 : f32 loc(#loc394)
+      %tmp76_267 = tt.broadcast %tmp76_265 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc394)
+      %tmp76_268 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc394)
+      %tmp76_269 = arith.truncf %tmp76_268 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc394)
+      %tmp76_270 = tt.load %tmp76_264, %tmp76_267, %tmp76_269 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>> loc(#loc394)
+      %tmp76_271 = arith.extf %tmp76_270 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc395)
+      %tmp78 = arith.mulf %tmp75_254, %tmp76_271 : tensor<2x128xf32> loc(#loc396)
+      %tmp80 = arith.constant 0.000000e+00 : f32 loc(#loc397)
+      %tmp80_272 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc397)
+      %tmp80_273 = arith.subf %tmp80_272, %tmp78 : tensor<2x128xf32> loc(#loc397)
+      %tmp81 = arith.constant 0.000000e+00 : f32 loc(#loc398)
+      %tmp81_274 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc398)
+      %tmp82 = tt.broadcast %tmp16_116 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc399)
+      %tmp82_275 = arith.select %tmp82, %tmp80_273, %tmp81_274 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc399)
+      %tmp83 = arith.constant 2 : i32 loc(#loc400)
+      %tmp83_276 = arith.constant 2 : i32 loc(#loc400)
+      %tmp83_277 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc400)
+      %tmp83_278 = arith.muli %tmp83_277, %r0_4_30 : tensor<1x128xi32> loc(#loc400)
+      %tmp83_279 = arith.constant 4096 : i32 loc(#loc401)
+      %tmp83_280 = arith.constant 4096 : i32 loc(#loc401)
+      %tmp83_281 = arith.constant dense<4096> : tensor<1x128xi32> loc(#loc401)
+      %tmp83_282 = arith.addi %tmp83_281, %tmp83_278 : tensor<1x128xi32> loc(#loc401)
+      %tmp83_283 = arith.constant 128 : i32 loc(#loc402)
+      %tmp83_284 = arith.constant 128 : i32 loc(#loc402)
+      %tmp83_285 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc402)
+      %tmp83_286 = arith.muli %tmp83_285, %x0_12 : tensor<2x1xi32> loc(#loc402)
+      %tmp83_287 = tt.broadcast %tmp83_282 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc403)
+      %tmp83_288 = tt.broadcast %tmp83_286 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc403)
+      %tmp83_289 = arith.addi %tmp83_287, %tmp83_288 : tensor<2x128xi32> loc(#loc403)
+      %tmp83_290 = arith.constant 36864 : i32 loc(#loc404)
+      %tmp83_291 = arith.constant 36864 : i32 loc(#loc404)
+      %tmp83_292 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc404)
+      %tmp83_293 = arith.muli %tmp83_292, %x1_15 : tensor<2x1xi32> loc(#loc404)
+      %tmp83_294 = tt.broadcast %tmp83_293 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc405)
+      %tmp83_295 = arith.addi %tmp83_289, %tmp83_294 : tensor<2x128xi32> loc(#loc405)
+      %tmp83_296 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>> loc(#loc406)
+      %tmp83_297 = tt.addptr %tmp83_296, %tmp83_295 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc406)
+      %tmp83_298 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x128xi1> loc(#loc407)
+      %tmp83_299 = arith.constant 0.000000e+00 : f32 loc(#loc408)
+      %tmp83_300 = tt.broadcast %tmp83_298 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc408)
+      %tmp83_301 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc408)
+      %tmp83_302 = arith.truncf %tmp83_301 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc408)
+      %tmp83_303 = tt.load %tmp83_297, %tmp83_300, %tmp83_302 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>> loc(#loc408)
+      %tmp83_304 = arith.extf %tmp83_303 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc409)
+      %tmp85 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc410)
+      %tmp85_305 = arith.divf %tmp4_19, %tmp85 : tensor<2x1xf32> loc(#loc410)
+      %tmp86 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc411)
+      %tmp86_306 = arith.addf %tmp85_305, %tmp86 : tensor<2x1xf32> loc(#loc411)
+      %tmp87 = tt.extern_elementwise %tmp86_306 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc412)
+      %tmp88 = tt.broadcast %tmp87 : tensor<2x1xf32> -> tensor<2x128xf32> loc(#loc413)
+      %tmp88_307 = arith.mulf %tmp83_304, %tmp88 : tensor<2x128xf32> loc(#loc413)
+      %tmp89 = arith.constant 2 : i32 loc(#loc414)
+      %tmp89_308 = arith.constant 2 : i32 loc(#loc414)
+      %tmp89_309 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc414)
+      %tmp89_310 = arith.muli %tmp89_309, %r0_4_30 : tensor<1x128xi32> loc(#loc414)
+      %tmp89_311 = tt.broadcast %tmp89_310 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc415)
+      %tmp89_312 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>> loc(#loc416)
+      %tmp89_313 = tt.addptr %tmp89_312, %tmp89_311 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc416)
+      %tmp89_314 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x128xi1> loc(#loc417)
+      %tmp89_315 = arith.constant 0.000000e+00 : f32 loc(#loc418)
+      %tmp89_316 = tt.broadcast %tmp89_314 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc418)
+      %tmp89_317 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc418)
+      %tmp89_318 = arith.truncf %tmp89_317 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc418)
+      %tmp89_319 = tt.load %tmp89_313, %tmp89_316, %tmp89_318 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>> loc(#loc418)
+      %tmp89_320 = arith.extf %tmp89_319 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc419)
+      %tmp91 = arith.mulf %tmp88_307, %tmp89_320 : tensor<2x128xf32> loc(#loc420)
+      %tmp93 = arith.constant 0.000000e+00 : f32 loc(#loc421)
+      %tmp93_321 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc421)
+      %tmp94 = tt.broadcast %tmp32_171 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc422)
+      %tmp94_322 = arith.select %tmp94, %tmp91, %tmp93_321 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc422)
+      %tmp95 = tt.broadcast %tmp16_116 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc423)
+      %tmp95_323 = arith.select %tmp95, %tmp82_275, %tmp94_322 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc423)
+      %tmp98 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc424)
+      %tmp98_324 = arith.divf %tmp4_19, %tmp98 : tensor<2x1xf32> loc(#loc424)
+      %tmp99 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc425)
+      %tmp99_325 = arith.addf %tmp98_324, %tmp99 : tensor<2x1xf32> loc(#loc425)
+      %tmp100 = tt.extern_elementwise %tmp99_325 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc426)
+      %tmp101 = tt.broadcast %tmp100 : tensor<2x1xf32> -> tensor<2x128xf32> loc(#loc427)
+      %tmp101_326 = arith.mulf %tmp96_104, %tmp101 : tensor<2x128xf32> loc(#loc427)
+      %tmp104 = tt.broadcast %tmp102_110 : tensor<1x128xf32> -> tensor<2x128xf32> loc(#loc428)
+      %tmp104_327 = arith.mulf %tmp101_326, %tmp104 : tensor<2x128xf32> loc(#loc428)
+      %tmp107 = arith.mulf %tmp104_327, %tmp63_68 : tensor<2x128xf32> loc(#loc429)
+      %tmp109 = arith.mulf %tmp95_323, %tmp66_80 : tensor<2x128xf32> loc(#loc430)
+      %tmp110 = arith.addf %tmp107, %tmp109 : tensor<2x128xf32> loc(#loc431)
+      %c128_i32_328 = arith.constant 128 : i32 loc(#loc204)
+      %c128_i32_329 = arith.constant 128 : i32 loc(#loc204)
+      %cst = arith.constant dense<128> : tensor<2x1xi32> loc(#loc204)
+      %8 = arith.muli %cst, %xindex_7 : tensor<2x1xi32> loc(#loc204)
+      %9 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc205)
+      %10 = tt.broadcast %8 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc205)
+      %11 = arith.addi %9, %10 : tensor<2x128xi32> loc(#loc205)
+      %12 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>> loc(#loc206)
+      %13 = tt.addptr %12, %11 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc206)
+      %14 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc207)
+      %15 = arith.truncf %tmp68 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc207)
+      tt.store %13, %15, %14 : tensor<2x128x!tt.ptr<bf16>> loc(#loc207)
+      %c128_i32_330 = arith.constant 128 : i32 loc(#loc208)
+      %c128_i32_331 = arith.constant 128 : i32 loc(#loc208)
+      %cst_332 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc208)
+      %16 = arith.muli %cst_332, %xindex_7 : tensor<2x1xi32> loc(#loc208)
+      %17 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc209)
+      %18 = tt.broadcast %16 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc209)
+      %19 = arith.addi %17, %18 : tensor<2x128xi32> loc(#loc209)
+      %20 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>> loc(#loc210)
+      %21 = tt.addptr %20, %19 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc210)
+      %22 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc211)
+      %23 = arith.truncf %tmp110 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc211)
+      tt.store %21, %23, %22 : tensor<2x128x!tt.ptr<bf16>> loc(#loc211)
+    } loc(#loc44)
+    tt.return loc(#loc212)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.sum__fp32S2_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<2x128xf32> loc("input"(#loc213))) -> tensor<2xf32> attributes {noinline = false} {
+    %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc214)
+      tt.reduce.return %2 : f32 loc(#loc214)
+    }) : (tensor<2x128xf32>) -> tensor<2xf32> loc(#loc214)
+    tt.return %0 : tensor<2xf32> loc(#loc216)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<2xf32> loc(#loc217)
+    tt.return %1 : tensor<2xf32> loc(#loc217)
+  } loc(#loc213)
+  tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc218)), %b: f32 loc("b"(#loc218))) -> f32 attributes {noinline = false} {
+    %0 = arith.addf %a, %b : f32 loc(#loc219)
+    tt.return %0 : f32 loc(#loc220)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : f32 loc(#loc221)
+    tt.return %1 : f32 loc(#loc221)
+  } loc(#loc218)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":25:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":30:43)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":32:44)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:45)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:56)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:46)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:42)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:53)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:64)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":68:35)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":69:25)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":70:35)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:52)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:63)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":74:16)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":76:16)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:57)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:55)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:63)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:95)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":85:42)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":88:35)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":89:24)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:37)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:48)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:59)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":92:16)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":93:25)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":94:16)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":95:24)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":96:32)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:53)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:59)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:91)
+#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":102:42)
+#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":106:16)
+#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":107:25)
+#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":108:16)
+#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":109:24)
+#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":110:32)
+#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:44)
+#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc149 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:55)
+#loc150 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc151 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:66)
+#loc152 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc153 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc154 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:81)
+#loc155 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc156 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc157 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc158 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc159 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc160 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc161 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:57)
+#loc162 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:55)
+#loc163 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:63)
+#loc164 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc165 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:95)
+#loc166 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc167 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc168 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc169 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc170 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":132:42)
+#loc171 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc172 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:44)
+#loc173 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc174 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:55)
+#loc175 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc176 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:66)
+#loc177 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc178 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc179 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:81)
+#loc180 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc181 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc182 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":136:24)
+#loc183 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":137:24)
+#loc184 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":138:32)
+#loc185 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc186 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:53)
+#loc187 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:59)
+#loc188 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc189 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:91)
+#loc190 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc191 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc192 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc193 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":144:42)
+#loc194 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc195 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc196 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":148:24)
+#loc197 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":149:24)
+#loc198 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":150:33)
+#loc199 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc200 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc201 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc202 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc203 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc204 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc205 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc206 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc207 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc208 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:43)
+#loc209 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:39)
+#loc210 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc211 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc212 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc214 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc216 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11)
+#loc217 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4)
+#loc219 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc220 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11)
+#loc221 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4)
+#loc231 = loc("xnumel"(#loc1))
+#loc232 = loc("r0_numel"(#loc2))
+#loc233 = loc("xoffset"(#loc3))
+#loc234 = loc("xoffset"(#loc4))
+#loc235 = loc("xindex"(#loc5))
+#loc236 = loc("xindex"(#loc6))
+#loc237 = loc("xindex"(#loc7))
+#loc238 = loc("xmask"(#loc8))
+#loc239 = loc("r0_base"(#loc9))
+#loc240 = loc("r0_base"(#loc10))
+#loc241 = loc("x0"(#loc11))
+#loc242 = loc("x1"(#loc12))
+#loc243 = loc("_tmp4"(#loc13))
+#loc244 = loc("_tmp10"(#loc14))
+#loc245 = loc("_tmp4"(#loc15))
+#loc246 = loc("r0_index"(#loc16))
+#loc247 = loc("r0_mask"(#loc17))
+#loc248 = loc("tmp0"(#loc18))
+#loc249 = loc("tmp0"(#loc19))
+#loc250 = loc("tmp0"(#loc20))
+#loc251 = loc("tmp0"(#loc21))
+#loc252 = loc("tmp0"(#loc22))
+#loc253 = loc("tmp0"(#loc23))
+#loc254 = loc("tmp0"(#loc24))
+#loc255 = loc("tmp0"(#loc25))
+#loc256 = loc("tmp6"(#loc26))
+#loc257 = loc("tmp6"(#loc27))
+#loc258 = loc("tmp6"(#loc28))
+#loc259 = loc("tmp6"(#loc29))
+#loc260 = loc("tmp6"(#loc30))
+#loc261 = loc("tmp6"(#loc31))
+#loc262 = loc("tmp6"(#loc32))
+#loc263 = loc("tmp2"(#loc33))
+#loc264 = loc("tmp5"(#loc34))
+#loc265 = loc("_tmp4"(#loc35))
+#loc266 = loc("tmp8"(#loc36))
+#loc267 = loc("tmp11"(#loc37))
+#loc268 = loc("_tmp10"(#loc38))
+#loc269 = loc("tmp4"(#loc40))
+#loc270 = loc("tmp4"(#loc41))
+#loc271 = loc("tmp10"(#loc42))
+#loc272 = loc("tmp10"(#loc43))
+#loc273 = loc("r0_index"(#loc45))
+#loc274 = loc("r0_mask"(#loc46))
+#loc275 = loc("r0_3"(#loc47))
+#loc276 = loc("r0_4"(#loc48))
+#loc277 = loc("tmp50"(#loc49))
+#loc278 = loc("tmp50"(#loc50))
+#loc279 = loc("tmp50"(#loc51))
+#loc280 = loc("tmp50"(#loc52))
+#loc281 = loc("tmp50"(#loc53))
+#loc282 = loc("tmp50"(#loc54))
+#loc283 = loc("tmp50"(#loc55))
+#loc284 = loc("tmp58"(#loc56))
+#loc285 = loc("tmp58"(#loc57))
+#loc286 = loc("tmp58"(#loc58))
+#loc287 = loc("tmp63"(#loc59))
+#loc288 = loc("tmp63"(#loc60))
+#loc289 = loc("tmp63"(#loc61))
+#loc290 = loc("tmp63"(#loc62))
+#loc291 = loc("tmp66"(#loc63))
+#loc292 = loc("tmp66"(#loc64))
+#loc293 = loc("tmp66"(#loc65))
+#loc294 = loc("tmp66"(#loc66))
+#loc295 = loc("tmp96"(#loc67))
+#loc296 = loc("tmp96"(#loc68))
+#loc297 = loc("tmp96"(#loc69))
+#loc298 = loc("tmp96"(#loc70))
+#loc299 = loc("tmp96"(#loc71))
+#loc300 = loc("tmp96"(#loc72))
+#loc301 = loc("tmp96"(#loc73))
+#loc302 = loc("tmp96"(#loc74))
+#loc303 = loc("tmp102"(#loc75))
+#loc304 = loc("tmp102"(#loc76))
+#loc305 = loc("tmp102"(#loc77))
+#loc306 = loc("tmp13"(#loc78))
+#loc307 = loc("tmp14"(#loc79))
+#loc308 = loc("tmp15"(#loc80))
+#loc309 = loc("tmp16"(#loc81))
+#loc310 = loc("tmp17"(#loc82))
+#loc311 = loc("tmp17"(#loc83))
+#loc312 = loc("tmp17"(#loc84))
+#loc313 = loc("tmp17"(#loc85))
+#loc314 = loc("tmp17"(#loc86))
+#loc315 = loc("tmp17"(#loc87))
+#loc316 = loc("tmp17"(#loc88))
+#loc317 = loc("tmp17"(#loc89))
+#loc318 = loc("tmp17"(#loc90))
+#loc319 = loc("tmp17"(#loc91))
+#loc320 = loc("tmp19"(#loc92))
+#loc321 = loc("tmp20"(#loc93))
+#loc322 = loc("tmp21"(#loc94))
+#loc323 = loc("tmp22"(#loc95))
+#loc324 = loc("tmp23"(#loc96))
+#loc325 = loc("tmp24"(#loc97))
+#loc326 = loc("tmp25"(#loc98))
+#loc327 = loc("tmp25"(#loc99))
+#loc328 = loc("tmp25"(#loc100))
+#loc329 = loc("tmp25"(#loc101))
+#loc330 = loc("tmp25"(#loc102))
+#loc331 = loc("tmp25"(#loc103))
+#loc332 = loc("tmp25"(#loc104))
+#loc333 = loc("tmp27"(#loc105))
+#loc334 = loc("tmp29"(#loc106))
+#loc335 = loc("tmp30"(#loc107))
+#loc336 = loc("tmp31"(#loc108))
+#loc337 = loc("tmp32"(#loc109))
+#loc338 = loc("tmp33"(#loc110))
+#loc339 = loc("tmp34"(#loc111))
+#loc340 = loc("tmp35"(#loc112))
+#loc341 = loc("tmp35"(#loc113))
+#loc342 = loc("tmp35"(#loc114))
+#loc343 = loc("tmp35"(#loc115))
+#loc344 = loc("tmp35"(#loc116))
+#loc345 = loc("tmp35"(#loc117))
+#loc346 = loc("tmp35"(#loc118))
+#loc347 = loc("tmp35"(#loc119))
+#loc348 = loc("tmp35"(#loc120))
+#loc349 = loc("tmp37"(#loc121))
+#loc350 = loc("tmp38"(#loc122))
+#loc351 = loc("tmp39"(#loc123))
+#loc352 = loc("tmp40"(#loc124))
+#loc353 = loc("tmp41"(#loc125))
+#loc354 = loc("tmp42"(#loc126))
+#loc355 = loc("tmp43"(#loc127))
+#loc356 = loc("tmp43"(#loc128))
+#loc357 = loc("tmp43"(#loc129))
+#loc358 = loc("tmp43"(#loc130))
+#loc359 = loc("tmp43"(#loc131))
+#loc360 = loc("tmp43"(#loc132))
+#loc361 = loc("tmp45"(#loc133))
+#loc362 = loc("tmp47"(#loc134))
+#loc363 = loc("tmp48"(#loc135))
+#loc364 = loc("tmp49"(#loc136))
+#loc365 = loc("tmp52"(#loc137))
+#loc366 = loc("tmp53"(#loc138))
+#loc367 = loc("tmp54"(#loc139))
+#loc368 = loc("tmp55"(#loc140))
+#loc369 = loc("tmp56"(#loc141))
+#loc370 = loc("tmp57"(#loc142))
+#loc371 = loc("tmp60"(#loc143))
+#loc372 = loc("tmp64"(#loc144))
+#loc373 = loc("tmp67"(#loc145))
+#loc374 = loc("tmp68"(#loc146))
+#loc375 = loc("tmp70"(#loc147))
+#loc376 = loc("tmp70"(#loc148))
+#loc377 = loc("tmp70"(#loc149))
+#loc378 = loc("tmp70"(#loc150))
+#loc379 = loc("tmp70"(#loc151))
+#loc380 = loc("tmp70"(#loc152))
+#loc381 = loc("tmp70"(#loc153))
+#loc382 = loc("tmp70"(#loc154))
+#loc383 = loc("tmp70"(#loc155))
+#loc384 = loc("tmp70"(#loc156))
+#loc385 = loc("tmp72"(#loc157))
+#loc386 = loc("tmp73"(#loc158))
+#loc387 = loc("tmp74"(#loc159))
+#loc388 = loc("tmp75"(#loc160))
+#loc389 = loc("tmp76"(#loc161))
+#loc390 = loc("tmp76"(#loc162))
+#loc391 = loc("tmp76"(#loc163))
+#loc392 = loc("tmp76"(#loc164))
+#loc393 = loc("tmp76"(#loc165))
+#loc394 = loc("tmp76"(#loc166))
+#loc395 = loc("tmp76"(#loc167))
+#loc396 = loc("tmp78"(#loc168))
+#loc397 = loc("tmp80"(#loc169))
+#loc398 = loc("tmp81"(#loc170))
+#loc399 = loc("tmp82"(#loc171))
+#loc400 = loc("tmp83"(#loc172))
+#loc401 = loc("tmp83"(#loc173))
+#loc402 = loc("tmp83"(#loc174))
+#loc403 = loc("tmp83"(#loc175))
+#loc404 = loc("tmp83"(#loc176))
+#loc405 = loc("tmp83"(#loc177))
+#loc406 = loc("tmp83"(#loc178))
+#loc407 = loc("tmp83"(#loc179))
+#loc408 = loc("tmp83"(#loc180))
+#loc409 = loc("tmp83"(#loc181))
+#loc410 = loc("tmp85"(#loc182))
+#loc411 = loc("tmp86"(#loc183))
+#loc412 = loc("tmp87"(#loc184))
+#loc413 = loc("tmp88"(#loc185))
+#loc414 = loc("tmp89"(#loc186))
+#loc415 = loc("tmp89"(#loc187))
+#loc416 = loc("tmp89"(#loc188))
+#loc417 = loc("tmp89"(#loc189))
+#loc418 = loc("tmp89"(#loc190))
+#loc419 = loc("tmp89"(#loc191))
+#loc420 = loc("tmp91"(#loc192))
+#loc421 = loc("tmp93"(#loc193))
+#loc422 = loc("tmp94"(#loc194))
+#loc423 = loc("tmp95"(#loc195))
+#loc424 = loc("tmp98"(#loc196))
+#loc425 = loc("tmp99"(#loc197))
+#loc426 = loc("tmp100"(#loc198))
+#loc427 = loc("tmp101"(#loc199))
+#loc428 = loc("tmp104"(#loc200))
+#loc429 = loc("tmp107"(#loc201))
+#loc430 = loc("tmp109"(#loc202))
+#loc431 = loc("tmp110"(#loc203))
+#loc435 = loc("_tmp10"(#loc245))
diff --git a/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..a047651df08aa291fb23d94c576ff2053ff171ba
--- /dev/null
+++ b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir
@@ -0,0 +1,495 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [1, 2], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0]}>
+#blocked2 = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [1, 0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc1 = loc(unknown)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc130 = loc("in_out_ptr0"(#loc))
+#loc131 = loc("in_out_ptr1"(#loc))
+#loc132 = loc("in_ptr0"(#loc))
+#loc133 = loc("in_ptr1"(#loc))
+#loc134 = loc("in_ptr2"(#loc))
+#loc135 = loc("in_ptr3"(#loc))
+#loc136 = loc("in_ptr4"(#loc))
+#loc137 = loc("xnumel"(#loc))
+#loc138 = loc("r0_numel"(#loc))
+#loc166 = loc("tmp4"(#loc30))
+#loc168 = loc("tmp10"(#loc33))
+#loc259 = loc(callsite(#loc1 at #loc166))
+#loc261 = loc(callsite(#loc1 at #loc168))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4097> : tensor<1x128xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<1> : tensor<1x128xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<1> : tensor<1x128xi64, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<2> : tensor<1x128xi32, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<36864> : tensor<2x1xi32, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<36864> : tensor<2x1xi32, #blocked1> loc(#loc1)
+    %cst_5 = arith.constant dense<128> : tensor<2x1xi32, #blocked> loc(#loc1)
+    %cst_6 = arith.constant dense<128> : tensor<2x1xi32, #blocked1> loc(#loc1)
+    %cst_7 = arith.constant dense<4096> : tensor<1x128xi32, #blocked> loc(#loc1)
+    %cst_8 = arith.constant dense<4096> : tensor<1x128xi32, #blocked1> loc(#loc1)
+    %cst_9 = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1)
+    %cst_10 = arith.constant dense<128> : tensor<1x128xi32, #blocked2> loc(#loc1)
+    %cst_11 = arith.constant dense<128> : tensor<1x128xi32, #blocked1> loc(#loc1)
+    %cst_12 = arith.constant dense<32> : tensor<2x1xi32, #blocked> loc(#loc1)
+    %cst_13 = arith.constant dense<32> : tensor<2x1xi32, #blocked1> loc(#loc1)
+    %c2_i32 = arith.constant 2 : i32 loc(#loc1)
+    %cst_14 = arith.constant dense<0.000000e+00> : tensor<2x128xbf16, #blocked1> loc(#loc1)
+    %cst_15 = arith.constant dense<0.000000e+00> : tensor<1x128xbf16, #blocked2> loc(#loc1)
+    %cst_16 = arith.constant dense<0.000000e+00> : tensor<2x128xbf16, #blocked> loc(#loc1)
+    %cst_17 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32, #blocked1> loc(#loc1)
+    %cst_18 = arith.constant dense<1.280000e+02> : tensor<2x1xf32, #blocked1> loc(#loc1)
+    %cst_19 = arith.constant dense<0.000000e+00> : tensor<2x128xf32, #blocked> loc(#loc1)
+    %cst_20 = arith.constant dense<0.000000e+00> : tensor<2x128xf32, #blocked1> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc139)
+    %xoffset_21 = arith.muli %xoffset, %c2_i32 : i32 loc(#loc140)
+    %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc141)
+    %xindex_22 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc141)
+    %xindex_23 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<2x1xi32, #blocked1> loc(#loc141)
+    %xindex_24 = tt.expand_dims %xindex_22 {axis = 1 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<2x1xi32, #blocked> loc(#loc141)
+    %xindex_25 = tt.splat %xoffset_21 : i32 -> tensor<2x1xi32, #blocked1> loc(#loc142)
+    %xindex_26 = tt.splat %xoffset_21 : i32 -> tensor<2x1xi32, #blocked> loc(#loc142)
+    %xindex_27 = arith.addi %xindex_25, %xindex_23 : tensor<2x1xi32, #blocked1> loc(#loc142)
+    %xindex_28 = arith.addi %xindex_26, %xindex_24 : tensor<2x1xi32, #blocked> loc(#loc142)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc143)
+    %r0_base_29 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> loc(#loc143)
+    %r0_base_30 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc143)
+    %r0_base_31 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x128xi32, #blocked1> loc(#loc143)
+    %r0_base_32 = tt.expand_dims %r0_base_29 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x128xi32, #blocked2> loc(#loc143)
+    %r0_base_33 = tt.expand_dims %r0_base_30 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc143)
+    %x0 = arith.remsi %xindex_27, %cst_13 : tensor<2x1xi32, #blocked1> loc(#loc144)
+    %x0_34 = arith.remsi %xindex_28, %cst_12 : tensor<2x1xi32, #blocked> loc(#loc144)
+    %x1 = arith.divsi %xindex_27, %cst_13 : tensor<2x1xi32, #blocked1> loc(#loc145)
+    %x1_35 = arith.divsi %xindex_28, %cst_12 : tensor<2x1xi32, #blocked> loc(#loc145)
+    %r0_mask = arith.cmpi slt, %r0_base_31, %cst_11 : tensor<1x128xi32, #blocked1> loc(#loc146)
+    %r0_mask_36 = arith.cmpi slt, %r0_base_32, %cst_10 : tensor<1x128xi32, #blocked2> loc(#loc146)
+    %r0_mask_37 = arith.cmpi slt, %r0_base_33, %cst_9 : tensor<1x128xi32, #blocked> loc(#loc146)
+    %tmp0 = arith.addi %r0_base_31, %cst_8 : tensor<1x128xi32, #blocked1> loc(#loc147)
+    %tmp0_38 = arith.muli %x0, %cst_6 : tensor<2x1xi32, #blocked1> loc(#loc148)
+    %tmp0_39 = arith.muli %x0_34, %cst_5 : tensor<2x1xi32, #blocked> loc(#loc148)
+    %tmp0_40 = tt.broadcast %tmp0 : tensor<1x128xi32, #blocked1> -> tensor<2x128xi32, #blocked1> loc(#loc149)
+    %tmp0_41 = tt.broadcast %tmp0_38 : tensor<2x1xi32, #blocked1> -> tensor<2x128xi32, #blocked1> loc(#loc149)
+    %tmp0_42 = tt.broadcast %tmp0_39 : tensor<2x1xi32, #blocked> -> tensor<2x128xi32, #blocked> loc(#loc149)
+    %tmp0_43 = arith.addi %tmp0_40, %tmp0_41 : tensor<2x128xi32, #blocked1> loc(#loc149)
+    %tmp0_44 = arith.muli %x1, %cst_4 : tensor<2x1xi32, #blocked1> loc(#loc150)
+    %tmp0_45 = arith.muli %x1_35, %cst_3 : tensor<2x1xi32, #blocked> loc(#loc150)
+    %tmp0_46 = tt.broadcast %tmp0_44 : tensor<2x1xi32, #blocked1> -> tensor<2x128xi32, #blocked1> loc(#loc151)
+    %tmp0_47 = tt.broadcast %tmp0_45 : tensor<2x1xi32, #blocked> -> tensor<2x128xi32, #blocked> loc(#loc151)
+    %tmp0_48 = arith.addi %tmp0_43, %tmp0_46 : tensor<2x128xi32, #blocked1> loc(#loc151)
+    %tmp0_49 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>, #blocked1> loc(#loc152)
+    %tmp0_50 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>, #blocked> loc(#loc152)
+    %tmp0_51 = tt.addptr %tmp0_49, %tmp0_48 : tensor<2x128x!tt.ptr<bf16>, #blocked1>, tensor<2x128xi32, #blocked1> loc(#loc152)
+    %tmp0_52 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked1> -> tensor<2x128xi1, #blocked1> loc(#loc153)
+    %tmp0_53 = tt.load %tmp0_51, %tmp0_52, %cst_14 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>, #blocked1> loc(#loc153)
+    %tmp0_54 = arith.extf %tmp0_53 : tensor<2x128xbf16, #blocked1> to tensor<2x128xf32, #blocked1> loc(#loc154)
+    %tmp6 = tt.broadcast %r0_base_31 : tensor<1x128xi32, #blocked1> -> tensor<2x128xi32, #blocked1> loc(#loc155)
+    %tmp6_55 = arith.addi %tmp6, %tmp0_41 : tensor<2x128xi32, #blocked1> loc(#loc155)
+    %tmp6_56 = arith.addi %tmp6_55, %tmp0_46 : tensor<2x128xi32, #blocked1> loc(#loc156)
+    %tmp6_57 = tt.addptr %tmp0_49, %tmp6_56 : tensor<2x128x!tt.ptr<bf16>, #blocked1>, tensor<2x128xi32, #blocked1> loc(#loc157)
+    %tmp6_58 = tt.load %tmp6_57, %tmp0_52, %cst_14 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>, #blocked1> loc(#loc158)
+    %tmp6_59 = arith.extf %tmp6_58 : tensor<2x128xbf16, #blocked1> to tensor<2x128xf32, #blocked1> loc(#loc159)
+    %tmp2 = arith.mulf %tmp0_54, %tmp0_54 : tensor<2x128xf32, #blocked1> loc(#loc160)
+    %tmp5 = arith.addf %tmp2, %cst_20 : tensor<2x128xf32, #blocked1> loc(#loc161)
+    %_tmp4 = arith.select %tmp0_52, %tmp5, %cst_20 : tensor<2x128xi1, #blocked1>, tensor<2x128xf32, #blocked1> loc(#loc162)
+    %tmp8 = arith.mulf %tmp6_59, %tmp6_59 : tensor<2x128xf32, #blocked1> loc(#loc163)
+    %tmp11 = arith.addf %tmp8, %cst_20 : tensor<2x128xf32, #blocked1> loc(#loc164)
+    %_tmp10 = arith.select %tmp0_52, %tmp11, %cst_20 : tensor<2x128xi1, #blocked1>, tensor<2x128xf32, #blocked1> loc(#loc165)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_134: f32 loc(callsite(#loc1 at #loc166)), %tmp4_135: f32 loc(callsite(#loc1 at #loc166))):
+      %tmp4_136 = arith.addf %tmp4_134, %tmp4_135 : f32 loc(#loc264)
+      tt.reduce.return %tmp4_136 : f32 loc(#loc258)
+    }) : (tensor<2x128xf32, #blocked1>) -> tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc258)
+    %tmp4_60 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<2x1xf32, #blocked1> loc(#loc167)
+    %tmp10 = "tt.reduce"(%_tmp10) <{axis = 1 : i32}> ({
+    ^bb0(%tmp10_134: f32 loc(callsite(#loc1 at #loc168)), %tmp10_135: f32 loc(callsite(#loc1 at #loc168))):
+      %tmp10_136 = arith.addf %tmp10_134, %tmp10_135 : f32 loc(#loc265)
+      tt.reduce.return %tmp10_136 : f32 loc(#loc260)
+    }) : (tensor<2x128xf32, #blocked1>) -> tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc260)
+    %tmp10_61 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<2x1xf32, #blocked1> loc(#loc169)
+    %r0_3 = arith.remsi %r0_base_33, %cst_2 : tensor<1x128xi32, #blocked> loc(#loc170)
+    %r0_4 = arith.divsi %r0_base_33, %cst_2 : tensor<1x128xi32, #blocked> loc(#loc171)
+    %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>, #blocked2> loc(#loc172)
+    %tmp58_62 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>, #blocked> loc(#loc172)
+    %tmp58_63 = tt.addptr %tmp58, %r0_base_32 : tensor<1x128x!tt.ptr<bf16>, #blocked2>, tensor<1x128xi32, #blocked2> loc(#loc172)
+    %tmp58_64 = tt.load %tmp58_63, %r0_mask_36, %cst_15 evictionPolicy = evict_last : tensor<1x128x!tt.ptr<bf16>, #blocked2> loc(#loc173)
+    %tmp58_65 = arith.extf %tmp58_64 : tensor<1x128xbf16, #blocked2> to tensor<1x128xf32, #blocked2> loc(#loc174)
+    %tmp63 = arith.muli %x1, %cst_6 : tensor<2x1xi32, #blocked1> loc(#loc175)
+    %tmp63_66 = tt.broadcast %tmp63 : tensor<2x1xi32, #blocked1> -> tensor<2x128xi32, #blocked1> loc(#loc176)
+    %tmp63_67 = arith.addi %tmp6, %tmp63_66 : tensor<2x128xi32, #blocked1> loc(#loc176)
+    %tmp63_68 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<2x128x!tt.ptr<f32>, #blocked1> loc(#loc177)
+    %tmp63_69 = tt.addptr %tmp63_68, %tmp63_67 : tensor<2x128x!tt.ptr<f32>, #blocked1>, tensor<2x128xi32, #blocked1> loc(#loc177)
+    %tmp63_70 = tt.load %tmp63_69, %tmp0_52, %cst_20 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<f32>, #blocked1> loc(#loc178)
+    %tmp63_71 = ttg.convert_layout %tmp63_70 : tensor<2x128xf32, #blocked1> -> tensor<2x128xf32, #blocked> loc(#loc178)
+    %tmp66 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<2x128x!tt.ptr<f32>, #blocked1> loc(#loc179)
+    %tmp66_72 = tt.addptr %tmp66, %tmp63_67 : tensor<2x128x!tt.ptr<f32>, #blocked1>, tensor<2x128xi32, #blocked1> loc(#loc179)
+    %tmp66_73 = tt.load %tmp66_72, %tmp0_52, %cst_20 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<f32>, #blocked1> loc(#loc180)
+    %tmp66_74 = ttg.convert_layout %tmp66_73 : tensor<2x128xf32, #blocked1> -> tensor<2x128xf32, #blocked> loc(#loc180)
+    %tmp96 = tt.load %tmp0_51, %tmp0_52, %cst_14 evictionPolicy = evict_first : tensor<2x128x!tt.ptr<bf16>, #blocked1> loc(#loc181)
+    %tmp96_75 = arith.extf %tmp96 : tensor<2x128xbf16, #blocked1> to tensor<2x128xf32, #blocked1> loc(#loc182)
+    %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>, #blocked2> loc(#loc183)
+    %tmp102_76 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>, #blocked> loc(#loc183)
+    %tmp102_77 = tt.addptr %tmp102, %r0_base_32 : tensor<1x128x!tt.ptr<bf16>, #blocked2>, tensor<1x128xi32, #blocked2> loc(#loc183)
+    %tmp102_78 = tt.load %tmp102_77, %r0_mask_36, %cst_15 evictionPolicy = evict_last : tensor<1x128x!tt.ptr<bf16>, #blocked2> loc(#loc184)
+    %tmp102_79 = arith.extf %tmp102_78 : tensor<1x128xbf16, #blocked2> to tensor<1x128xf32, #blocked2> loc(#loc185)
+    %tmp16 = arith.extsi %r0_3 : tensor<1x128xi32, #blocked> to tensor<1x128xi64, #blocked> loc(#loc186)
+    %tmp16_80 = arith.cmpi slt, %tmp16, %cst_1 : tensor<1x128xi64, #blocked> loc(#loc186)
+    %tmp17 = arith.muli %r0_4, %cst_2 : tensor<1x128xi32, #blocked> loc(#loc187)
+    %tmp17_81 = arith.addi %tmp17, %cst_0 : tensor<1x128xi32, #blocked> loc(#loc188)
+    %tmp17_82 = tt.broadcast %tmp17_81 : tensor<1x128xi32, #blocked> -> tensor<2x128xi32, #blocked> loc(#loc189)
+    %tmp17_83 = arith.addi %tmp17_82, %tmp0_42 : tensor<2x128xi32, #blocked> loc(#loc189)
+    %tmp17_84 = arith.addi %tmp17_83, %tmp0_47 : tensor<2x128xi32, #blocked> loc(#loc190)
+    %tmp17_85 = tt.addptr %tmp0_50, %tmp17_84 : tensor<2x128x!tt.ptr<bf16>, #blocked>, tensor<2x128xi32, #blocked> loc(#loc191)
+    %tmp17_86 = arith.andi %r0_mask_37, %tmp16_80 : tensor<1x128xi1, #blocked> loc(#loc192)
+    %tmp17_87 = tt.broadcast %tmp17_86 : tensor<1x128xi1, #blocked> -> tensor<2x128xi1, #blocked> loc(#loc193)
+    %tmp17_88 = tt.load %tmp17_85, %tmp17_87, %cst_16 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>, #blocked> loc(#loc193)
+    %tmp17_89 = arith.extf %tmp17_88 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked> loc(#loc194)
+    %tmp20 = arith.divf %tmp10_61, %cst_18 : tensor<2x1xf32, #blocked1> loc(#loc195)
+    %tmp22 = arith.addf %tmp20, %cst_17 : tensor<2x1xf32, #blocked1> loc(#loc196)
+    %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32, #blocked1>) -> tensor<2x1xf32, #blocked1> loc(#loc197)
+    %tmp24 = ttg.convert_layout %tmp23 : tensor<2x1xf32, #blocked1> -> tensor<2x1xf32, #blocked> loc(#loc198)
+    %tmp24_90 = tt.broadcast %tmp24 : tensor<2x1xf32, #blocked> -> tensor<2x128xf32, #blocked> loc(#loc198)
+    %tmp24_91 = tt.broadcast %tmp23 : tensor<2x1xf32, #blocked1> -> tensor<2x128xf32, #blocked1> loc(#loc198)
+    %tmp24_92 = arith.mulf %tmp17_89, %tmp24_90 : tensor<2x128xf32, #blocked> loc(#loc198)
+    %tmp25 = tt.addptr %tmp58_62, %tmp17_81 : tensor<1x128x!tt.ptr<bf16>, #blocked>, tensor<1x128xi32, #blocked> loc(#loc199)
+    %tmp25_93 = tt.broadcast %tmp25 : tensor<1x128x!tt.ptr<bf16>, #blocked> -> tensor<2x128x!tt.ptr<bf16>, #blocked> loc(#loc199)
+    %tmp25_94 = tt.load %tmp25_93, %tmp17_87, %cst_16 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>, #blocked> loc(#loc200)
+    %tmp25_95 = arith.extf %tmp25_94 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked> loc(#loc201)
+    %tmp27 = arith.mulf %tmp24_92, %tmp25_95 : tensor<2x128xf32, #blocked> loc(#loc202)
+    %tmp29 = arith.subf %cst_19, %tmp27 : tensor<2x128xf32, #blocked> loc(#loc203)
+    %tmp31 = tt.broadcast %tmp16_80 : tensor<1x128xi1, #blocked> -> tensor<2x128xi1, #blocked> loc(#loc204)
+    %tmp32 = arith.cmpi sge, %tmp16, %cst_1 : tensor<1x128xi64, #blocked> loc(#loc205)
+    %tmp35 = tt.broadcast %tmp17 : tensor<1x128xi32, #blocked> -> tensor<2x128xi32, #blocked> loc(#loc206)
+    %tmp35_96 = arith.addi %tmp35, %tmp0_42 : tensor<2x128xi32, #blocked> loc(#loc206)
+    %tmp35_97 = arith.addi %tmp35_96, %tmp0_47 : tensor<2x128xi32, #blocked> loc(#loc207)
+    %tmp35_98 = tt.addptr %tmp0_50, %tmp35_97 : tensor<2x128x!tt.ptr<bf16>, #blocked>, tensor<2x128xi32, #blocked> loc(#loc208)
+    %tmp35_99 = arith.andi %r0_mask_37, %tmp32 : tensor<1x128xi1, #blocked> loc(#loc209)
+    %tmp35_100 = tt.broadcast %tmp35_99 : tensor<1x128xi1, #blocked> -> tensor<2x128xi1, #blocked> loc(#loc210)
+    %tmp35_101 = tt.load %tmp35_98, %tmp35_100, %cst_16 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>, #blocked> loc(#loc210)
+    %tmp35_102 = arith.extf %tmp35_101 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked> loc(#loc211)
+    %tmp42 = arith.mulf %tmp35_102, %tmp24_90 : tensor<2x128xf32, #blocked> loc(#loc212)
+    %tmp43 = tt.addptr %tmp58_62, %tmp17 : tensor<1x128x!tt.ptr<bf16>, #blocked>, tensor<1x128xi32, #blocked> loc(#loc213)
+    %tmp43_103 = tt.broadcast %tmp43 : tensor<1x128x!tt.ptr<bf16>, #blocked> -> tensor<2x128x!tt.ptr<bf16>, #blocked> loc(#loc213)
+    %tmp43_104 = tt.load %tmp43_103, %tmp35_100, %cst_16 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>, #blocked> loc(#loc214)
+    %tmp43_105 = arith.extf %tmp43_104 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked> loc(#loc215)
+    %tmp45 = arith.mulf %tmp42, %tmp43_105 : tensor<2x128xf32, #blocked> loc(#loc216)
+    %tmp48 = tt.broadcast %tmp32 : tensor<1x128xi1, #blocked> -> tensor<2x128xi1, #blocked> loc(#loc217)
+    %tmp48_106 = arith.select %tmp48, %tmp45, %cst_19 : tensor<2x128xi1, #blocked>, tensor<2x128xf32, #blocked> loc(#loc217)
+    %tmp49 = arith.select %tmp31, %tmp29, %tmp48_106 : tensor<2x128xi1, #blocked>, tensor<2x128xf32, #blocked> loc(#loc262)
+    %tmp57 = arith.mulf %tmp6_59, %tmp24_91 : tensor<2x128xf32, #blocked1> loc(#loc219)
+    %tmp60 = ttg.convert_layout %tmp58_65 : tensor<1x128xf32, #blocked2> -> tensor<1x128xf32, #blocked1> loc(#loc220)
+    %tmp60_107 = tt.broadcast %tmp60 : tensor<1x128xf32, #blocked1> -> tensor<2x128xf32, #blocked1> loc(#loc220)
+    %tmp60_108 = arith.mulf %tmp57, %tmp60_107 : tensor<2x128xf32, #blocked1> loc(#loc220)
+    %tmp64 = arith.mulf %tmp60_108, %tmp63_70 : tensor<2x128xf32, #blocked1> loc(#loc221)
+    %tmp64_109 = ttg.convert_layout %tmp64 : tensor<2x128xf32, #blocked1> -> tensor<2x128xf32, #blocked> loc(#loc221)
+    %tmp67 = arith.mulf %tmp49, %tmp66_74 : tensor<2x128xf32, #blocked> loc(#loc222)
+    %tmp68 = arith.addf %tmp64_109, %tmp67 : tensor<2x128xf32, #blocked> loc(#loc223)
+    %tmp70 = arith.addi %tmp17, %cst : tensor<1x128xi32, #blocked> loc(#loc224)
+    %tmp70_110 = tt.broadcast %tmp70 : tensor<1x128xi32, #blocked> -> tensor<2x128xi32, #blocked> loc(#loc225)
+    %tmp70_111 = arith.addi %tmp70_110, %tmp0_42 : tensor<2x128xi32, #blocked> loc(#loc225)
+    %tmp70_112 = arith.addi %tmp70_111, %tmp0_47 : tensor<2x128xi32, #blocked> loc(#loc226)
+    %tmp70_113 = tt.addptr %tmp0_50, %tmp70_112 : tensor<2x128x!tt.ptr<bf16>, #blocked>, tensor<2x128xi32, #blocked> loc(#loc227)
+    %tmp70_114 = tt.load %tmp70_113, %tmp17_87, %cst_16 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>, #blocked> loc(#loc228)
+    %tmp70_115 = arith.extf %tmp70_114 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked> loc(#loc229)
+    %tmp72 = arith.divf %tmp4_60, %cst_18 : tensor<2x1xf32, #blocked1> loc(#loc230)
+    %tmp73 = arith.addf %tmp72, %cst_17 : tensor<2x1xf32, #blocked1> loc(#loc231)
+    %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32, #blocked1>) -> tensor<2x1xf32, #blocked1> loc(#loc232)
+    %tmp75 = ttg.convert_layout %tmp74 : tensor<2x1xf32, #blocked1> -> tensor<2x1xf32, #blocked> loc(#loc233)
+    %tmp75_116 = tt.broadcast %tmp75 : tensor<2x1xf32, #blocked> -> tensor<2x128xf32, #blocked> loc(#loc233)
+    %tmp75_117 = tt.broadcast %tmp74 : tensor<2x1xf32, #blocked1> -> tensor<2x128xf32, #blocked1> loc(#loc233)
+    %tmp75_118 = arith.mulf %tmp70_115, %tmp75_116 : tensor<2x128xf32, #blocked> loc(#loc233)
+    %tmp76 = tt.addptr %tmp102_76, %tmp17_81 : tensor<1x128x!tt.ptr<bf16>, #blocked>, tensor<1x128xi32, #blocked> loc(#loc234)
+    %tmp76_119 = tt.broadcast %tmp76 : tensor<1x128x!tt.ptr<bf16>, #blocked> -> tensor<2x128x!tt.ptr<bf16>, #blocked> loc(#loc234)
+    %tmp76_120 = tt.load %tmp76_119, %tmp17_87, %cst_16 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>, #blocked> loc(#loc235)
+    %tmp76_121 = arith.extf %tmp76_120 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked> loc(#loc236)
+    %tmp78 = arith.mulf %tmp75_118, %tmp76_121 : tensor<2x128xf32, #blocked> loc(#loc237)
+    %tmp80 = arith.subf %cst_19, %tmp78 : tensor<2x128xf32, #blocked> loc(#loc238)
+    %tmp83 = arith.addi %tmp17, %cst_7 : tensor<1x128xi32, #blocked> loc(#loc239)
+    %tmp83_122 = tt.broadcast %tmp83 : tensor<1x128xi32, #blocked> -> tensor<2x128xi32, #blocked> loc(#loc240)
+    %tmp83_123 = arith.addi %tmp83_122, %tmp0_42 : tensor<2x128xi32, #blocked> loc(#loc240)
+    %tmp83_124 = arith.addi %tmp83_123, %tmp0_47 : tensor<2x128xi32, #blocked> loc(#loc241)
+    %tmp83_125 = tt.addptr %tmp0_50, %tmp83_124 : tensor<2x128x!tt.ptr<bf16>, #blocked>, tensor<2x128xi32, #blocked> loc(#loc242)
+    %tmp83_126 = tt.load %tmp83_125, %tmp35_100, %cst_16 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>, #blocked> loc(#loc243)
+    %tmp83_127 = arith.extf %tmp83_126 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked> loc(#loc244)
+    %tmp88 = arith.mulf %tmp83_127, %tmp75_116 : tensor<2x128xf32, #blocked> loc(#loc245)
+    %tmp89 = tt.addptr %tmp102_76, %tmp17 : tensor<1x128x!tt.ptr<bf16>, #blocked>, tensor<1x128xi32, #blocked> loc(#loc246)
+    %tmp89_128 = tt.broadcast %tmp89 : tensor<1x128x!tt.ptr<bf16>, #blocked> -> tensor<2x128x!tt.ptr<bf16>, #blocked> loc(#loc246)
+    %tmp89_129 = tt.load %tmp89_128, %tmp35_100, %cst_16 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>, #blocked> loc(#loc247)
+    %tmp89_130 = arith.extf %tmp89_129 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked> loc(#loc248)
+    %tmp91 = arith.mulf %tmp88, %tmp89_130 : tensor<2x128xf32, #blocked> loc(#loc249)
+    %tmp94 = arith.select %tmp48, %tmp91, %cst_19 : tensor<2x128xi1, #blocked>, tensor<2x128xf32, #blocked> loc(#loc250)
+    %tmp95 = arith.select %tmp31, %tmp80, %tmp94 : tensor<2x128xi1, #blocked>, tensor<2x128xf32, #blocked> loc(#loc263)
+    %tmp101 = arith.mulf %tmp96_75, %tmp75_117 : tensor<2x128xf32, #blocked1> loc(#loc253)
+    %tmp101_131 = ttg.convert_layout %tmp101 : tensor<2x128xf32, #blocked1> -> tensor<2x128xf32, #blocked> loc(#loc253)
+    %tmp107 = ttg.convert_layout %tmp102_79 : tensor<1x128xf32, #blocked2> -> tensor<1x128xf32, #blocked> loc(#loc254)
+    %tmp104 = tt.broadcast %tmp107 : tensor<1x128xf32, #blocked> -> tensor<2x128xf32, #blocked> loc(#loc255)
+    %tmp104_132 = arith.mulf %tmp101_131, %tmp104 : tensor<2x128xf32, #blocked> loc(#loc255)
+    %tmp107_133 = arith.mulf %tmp104_132, %tmp63_71 : tensor<2x128xf32, #blocked> loc(#loc254)
+    %tmp109 = arith.mulf %tmp95, %tmp66_74 : tensor<2x128xf32, #blocked> loc(#loc256)
+    %tmp110 = arith.addf %tmp107_133, %tmp109 : tensor<2x128xf32, #blocked> loc(#loc257)
+    %0 = arith.muli %xindex_27, %cst_6 : tensor<2x1xi32, #blocked1> loc(#loc123)
+    %1 = tt.broadcast %0 : tensor<2x1xi32, #blocked1> -> tensor<2x128xi32, #blocked1> loc(#loc124)
+    %2 = arith.addi %tmp6, %1 : tensor<2x128xi32, #blocked1> loc(#loc124)
+    %3 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>, #blocked1> loc(#loc125)
+    %4 = tt.addptr %3, %2 : tensor<2x128x!tt.ptr<bf16>, #blocked1>, tensor<2x128xi32, #blocked1> loc(#loc125)
+    %5 = arith.truncf %tmp68 : tensor<2x128xf32, #blocked> to tensor<2x128xbf16, #blocked> loc(#loc126)
+    %6 = ttg.convert_layout %5 : tensor<2x128xbf16, #blocked> -> tensor<2x128xbf16, #blocked1> loc(#loc126)
+    tt.store %4, %6, %tmp0_52 : tensor<2x128x!tt.ptr<bf16>, #blocked1> loc(#loc126)
+    %7 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>, #blocked1> loc(#loc127)
+    %8 = tt.addptr %7, %2 : tensor<2x128x!tt.ptr<bf16>, #blocked1>, tensor<2x128xi32, #blocked1> loc(#loc127)
+    %9 = arith.truncf %tmp110 : tensor<2x128xf32, #blocked> to tensor<2x128xbf16, #blocked> loc(#loc128)
+    %10 = ttg.convert_layout %9 : tensor<2x128xbf16, #blocked> -> tensor<2x128xbf16, #blocked1> loc(#loc128)
+    tt.store %8, %10, %tmp0_52 : tensor<2x128x!tt.ptr<bf16>, #blocked1> loc(#loc128)
+    tt.return loc(#loc129)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc29 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc31 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc139 = loc("xoffset"(#loc2))
+#loc140 = loc("xoffset"(#loc3))
+#loc141 = loc("xindex"(#loc4))
+#loc142 = loc("xindex"(#loc5))
+#loc143 = loc("r0_base"(#loc6))
+#loc144 = loc("x0"(#loc7))
+#loc145 = loc("x1"(#loc8))
+#loc146 = loc("r0_mask"(#loc9))
+#loc147 = loc("tmp0"(#loc10))
+#loc148 = loc("tmp0"(#loc11))
+#loc149 = loc("tmp0"(#loc12))
+#loc150 = loc("tmp0"(#loc13))
+#loc151 = loc("tmp0"(#loc14))
+#loc152 = loc("tmp0"(#loc15))
+#loc153 = loc("tmp0"(#loc16))
+#loc154 = loc("tmp0"(#loc17))
+#loc155 = loc("tmp6"(#loc18))
+#loc156 = loc("tmp6"(#loc19))
+#loc157 = loc("tmp6"(#loc20))
+#loc158 = loc("tmp6"(#loc21))
+#loc159 = loc("tmp6"(#loc22))
+#loc160 = loc("tmp2"(#loc23))
+#loc161 = loc("tmp5"(#loc24))
+#loc162 = loc("_tmp4"(#loc25))
+#loc163 = loc("tmp8"(#loc26))
+#loc164 = loc("tmp11"(#loc27))
+#loc165 = loc("_tmp10"(#loc28))
+#loc167 = loc("tmp4"(#loc32))
+#loc169 = loc("tmp10"(#loc34))
+#loc170 = loc("r0_3"(#loc35))
+#loc171 = loc("r0_4"(#loc36))
+#loc172 = loc("tmp58"(#loc37))
+#loc173 = loc("tmp58"(#loc38))
+#loc174 = loc("tmp58"(#loc39))
+#loc175 = loc("tmp63"(#loc40))
+#loc176 = loc("tmp63"(#loc41))
+#loc177 = loc("tmp63"(#loc42))
+#loc178 = loc("tmp63"(#loc43))
+#loc179 = loc("tmp66"(#loc44))
+#loc180 = loc("tmp66"(#loc45))
+#loc181 = loc("tmp96"(#loc46))
+#loc182 = loc("tmp96"(#loc47))
+#loc183 = loc("tmp102"(#loc48))
+#loc184 = loc("tmp102"(#loc49))
+#loc185 = loc("tmp102"(#loc50))
+#loc186 = loc("tmp16"(#loc51))
+#loc187 = loc("tmp17"(#loc52))
+#loc188 = loc("tmp17"(#loc53))
+#loc189 = loc("tmp17"(#loc54))
+#loc190 = loc("tmp17"(#loc55))
+#loc191 = loc("tmp17"(#loc56))
+#loc192 = loc("tmp17"(#loc57))
+#loc193 = loc("tmp17"(#loc58))
+#loc194 = loc("tmp17"(#loc59))
+#loc195 = loc("tmp20"(#loc60))
+#loc196 = loc("tmp22"(#loc61))
+#loc197 = loc("tmp23"(#loc62))
+#loc198 = loc("tmp24"(#loc63))
+#loc199 = loc("tmp25"(#loc64))
+#loc200 = loc("tmp25"(#loc65))
+#loc201 = loc("tmp25"(#loc66))
+#loc202 = loc("tmp27"(#loc67))
+#loc203 = loc("tmp29"(#loc68))
+#loc204 = loc("tmp31"(#loc69))
+#loc205 = loc("tmp32"(#loc70))
+#loc206 = loc("tmp35"(#loc71))
+#loc207 = loc("tmp35"(#loc72))
+#loc208 = loc("tmp35"(#loc73))
+#loc209 = loc("tmp35"(#loc74))
+#loc210 = loc("tmp35"(#loc75))
+#loc211 = loc("tmp35"(#loc76))
+#loc212 = loc("tmp42"(#loc77))
+#loc213 = loc("tmp43"(#loc78))
+#loc214 = loc("tmp43"(#loc79))
+#loc215 = loc("tmp43"(#loc80))
+#loc216 = loc("tmp45"(#loc81))
+#loc217 = loc("tmp48"(#loc82))
+#loc218 = loc("tmp49"(#loc83))
+#loc219 = loc("tmp57"(#loc84))
+#loc220 = loc("tmp60"(#loc85))
+#loc221 = loc("tmp64"(#loc86))
+#loc222 = loc("tmp67"(#loc87))
+#loc223 = loc("tmp68"(#loc88))
+#loc224 = loc("tmp70"(#loc89))
+#loc225 = loc("tmp70"(#loc90))
+#loc226 = loc("tmp70"(#loc91))
+#loc227 = loc("tmp70"(#loc92))
+#loc228 = loc("tmp70"(#loc93))
+#loc229 = loc("tmp70"(#loc94))
+#loc230 = loc("tmp72"(#loc95))
+#loc231 = loc("tmp73"(#loc96))
+#loc232 = loc("tmp74"(#loc97))
+#loc233 = loc("tmp75"(#loc98))
+#loc234 = loc("tmp76"(#loc99))
+#loc235 = loc("tmp76"(#loc100))
+#loc236 = loc("tmp76"(#loc101))
+#loc237 = loc("tmp78"(#loc102))
+#loc238 = loc("tmp80"(#loc103))
+#loc239 = loc("tmp83"(#loc104))
+#loc240 = loc("tmp83"(#loc105))
+#loc241 = loc("tmp83"(#loc106))
+#loc242 = loc("tmp83"(#loc107))
+#loc243 = loc("tmp83"(#loc108))
+#loc244 = loc("tmp83"(#loc109))
+#loc245 = loc("tmp88"(#loc110))
+#loc246 = loc("tmp89"(#loc111))
+#loc247 = loc("tmp89"(#loc112))
+#loc248 = loc("tmp89"(#loc113))
+#loc249 = loc("tmp91"(#loc114))
+#loc250 = loc("tmp94"(#loc115))
+#loc251 = loc("tmp95"(#loc116))
+#loc252 = loc("tmp82"(#loc117))
+#loc253 = loc("tmp101"(#loc118))
+#loc254 = loc("tmp107"(#loc119))
+#loc255 = loc("tmp104"(#loc120))
+#loc256 = loc("tmp109"(#loc121))
+#loc257 = loc("tmp110"(#loc122))
+#loc258 = loc(callsite(#loc29 at #loc166))
+#loc260 = loc(callsite(#loc29 at #loc168))
+#loc262 = loc(fused[#loc218, #loc204])
+#loc263 = loc(fused[#loc251, #loc252])
+#loc264 = loc(callsite(#loc31 at #loc258))
+#loc265 = loc(callsite(#loc31 at #loc260))
diff --git a/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..393fb5118cb07cd0c2dc1a641c6a9ee3dbedd7e6
--- /dev/null
+++ b/triton/AQ3FCZKOYK5LBOX7RLBQGX5T77RKI4M7SEZTYJU34QROQSJNLP5A/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir
@@ -0,0 +1,457 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc1 = loc(unknown)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc132 = loc("in_out_ptr0"(#loc))
+#loc133 = loc("in_out_ptr1"(#loc))
+#loc134 = loc("in_ptr0"(#loc))
+#loc135 = loc("in_ptr1"(#loc))
+#loc136 = loc("in_ptr2"(#loc))
+#loc137 = loc("in_ptr3"(#loc))
+#loc138 = loc("in_ptr4"(#loc))
+#loc139 = loc("xnumel"(#loc))
+#loc140 = loc("r0_numel"(#loc))
+#loc170 = loc("tmp4"(#loc32))
+#loc172 = loc("tmp10"(#loc35))
+#loc263 = loc(callsite(#loc1 at #loc170))
+#loc265 = loc(callsite(#loc1 at #loc172))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<1x128xbf16> loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<2x128xbf16> loc(#loc1)
+    %cst_1 = arith.constant dense<4097> : tensor<1x128xi32> loc(#loc1)
+    %cst_2 = arith.constant dense<9.99999997E-7> : tensor<2x1xf32> loc(#loc1)
+    %cst_3 = arith.constant dense<1.280000e+02> : tensor<2x1xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc1)
+    %cst_5 = arith.constant dense<1> : tensor<1x128xi64> loc(#loc1)
+    %cst_6 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc1)
+    %cst_7 = arith.constant dense<36864> : tensor<2x1xi32> loc(#loc1)
+    %cst_8 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc1)
+    %cst_9 = arith.constant dense<4096> : tensor<1x128xi32> loc(#loc1)
+    %cst_10 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc1)
+    %cst_11 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc1)
+    %cst_12 = arith.constant dense<32> : tensor<2x1xi32> loc(#loc1)
+    %c2_i32 = arith.constant 2 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc141)
+    %xoffset_13 = arith.muli %xoffset, %c2_i32 : i32 loc(#loc142)
+    %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc143)
+    %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32> -> tensor<2x1xi32> loc(#loc144)
+    %xindex_15 = tt.splat %xoffset_13 : i32 -> tensor<2x1xi32> loc(#loc145)
+    %xindex_16 = arith.addi %xindex_15, %xindex_14 : tensor<2x1xi32> loc(#loc145)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc146)
+    %r0_base_17 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc147)
+    %x0 = arith.remsi %xindex_16, %cst_12 : tensor<2x1xi32> loc(#loc148)
+    %x1 = arith.divsi %xindex_16, %cst_12 : tensor<2x1xi32> loc(#loc149)
+    %r0_mask = arith.cmpi slt, %r0_base_17, %cst_10 : tensor<1x128xi32> loc(#loc150)
+    %tmp0 = arith.addi %r0_base_17, %cst_9 : tensor<1x128xi32> loc(#loc151)
+    %tmp0_18 = arith.muli %x0, %cst_8 : tensor<2x1xi32> loc(#loc152)
+    %tmp0_19 = tt.broadcast %tmp0 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc153)
+    %tmp0_20 = tt.broadcast %tmp0_18 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc153)
+    %tmp0_21 = arith.addi %tmp0_19, %tmp0_20 : tensor<2x128xi32> loc(#loc153)
+    %tmp0_22 = arith.muli %x1, %cst_7 : tensor<2x1xi32> loc(#loc154)
+    %tmp0_23 = tt.broadcast %tmp0_22 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc155)
+    %tmp0_24 = arith.addi %tmp0_21, %tmp0_23 : tensor<2x128xi32> loc(#loc155)
+    %tmp0_25 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>> loc(#loc156)
+    %tmp0_26 = tt.addptr %tmp0_25, %tmp0_24 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc156)
+    %tmp0_27 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc157)
+    %tmp0_28 = tt.load %tmp0_26, %tmp0_27, %cst_0 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>> loc(#loc157)
+    %tmp0_29 = arith.extf %tmp0_28 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc158)
+    %tmp6 = tt.broadcast %r0_base_17 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc159)
+    %tmp6_30 = arith.addi %tmp6, %tmp0_20 : tensor<2x128xi32> loc(#loc159)
+    %tmp6_31 = arith.addi %tmp6_30, %tmp0_23 : tensor<2x128xi32> loc(#loc160)
+    %tmp6_32 = tt.addptr %tmp0_25, %tmp6_31 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc161)
+    %tmp6_33 = tt.load %tmp6_32, %tmp0_27, %cst_0 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>> loc(#loc162)
+    %tmp6_34 = arith.extf %tmp6_33 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc163)
+    %tmp2 = arith.mulf %tmp0_29, %tmp0_29 : tensor<2x128xf32> loc(#loc164)
+    %tmp5 = arith.addf %tmp2, %cst_11 : tensor<2x128xf32> loc(#loc165)
+    %_tmp4 = arith.select %tmp0_27, %tmp5, %cst_11 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc166)
+    %tmp8 = arith.mulf %tmp6_34, %tmp6_34 : tensor<2x128xf32> loc(#loc167)
+    %tmp11 = arith.addf %tmp8, %cst_11 : tensor<2x128xf32> loc(#loc168)
+    %_tmp10 = arith.select %tmp0_27, %tmp11, %cst_11 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc169)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_98: f32 loc(callsite(#loc1 at #loc170)), %tmp4_99: f32 loc(callsite(#loc1 at #loc170))):
+      %tmp4_100 = arith.addf %tmp4_98, %tmp4_99 : f32 loc(#loc266)
+      tt.reduce.return %tmp4_100 : f32 loc(#loc262)
+    }) : (tensor<2x128xf32>) -> tensor<2xf32> loc(#loc262)
+    %tmp4_35 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<2xf32> -> tensor<2x1xf32> loc(#loc171)
+    %tmp10 = "tt.reduce"(%_tmp10) <{axis = 1 : i32}> ({
+    ^bb0(%tmp10_98: f32 loc(callsite(#loc1 at #loc172)), %tmp10_99: f32 loc(callsite(#loc1 at #loc172))):
+      %tmp10_100 = arith.addf %tmp10_98, %tmp10_99 : f32 loc(#loc267)
+      tt.reduce.return %tmp10_100 : f32 loc(#loc264)
+    }) : (tensor<2x128xf32>) -> tensor<2xf32> loc(#loc264)
+    %tmp10_36 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<2xf32> -> tensor<2x1xf32> loc(#loc173)
+    %r0_3 = arith.remsi %r0_base_17, %cst_6 : tensor<1x128xi32> loc(#loc174)
+    %r0_4 = arith.divsi %r0_base_17, %cst_6 : tensor<1x128xi32> loc(#loc175)
+    %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>> loc(#loc176)
+    %tmp58_37 = tt.addptr %tmp58, %r0_base_17 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc176)
+    %tmp58_38 = tt.load %tmp58_37, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x128x!tt.ptr<bf16>> loc(#loc177)
+    %tmp58_39 = arith.extf %tmp58_38 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc178)
+    %tmp63 = arith.muli %x1, %cst_8 : tensor<2x1xi32> loc(#loc179)
+    %tmp63_40 = tt.broadcast %tmp63 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc180)
+    %tmp63_41 = arith.addi %tmp6, %tmp63_40 : tensor<2x128xi32> loc(#loc180)
+    %tmp63_42 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<2x128x!tt.ptr<f32>> loc(#loc181)
+    %tmp63_43 = tt.addptr %tmp63_42, %tmp63_41 : tensor<2x128x!tt.ptr<f32>>, tensor<2x128xi32> loc(#loc181)
+    %tmp63_44 = tt.load %tmp63_43, %tmp0_27, %cst_11 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<f32>> loc(#loc182)
+    %tmp66 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<2x128x!tt.ptr<f32>> loc(#loc183)
+    %tmp66_45 = tt.addptr %tmp66, %tmp63_41 : tensor<2x128x!tt.ptr<f32>>, tensor<2x128xi32> loc(#loc183)
+    %tmp66_46 = tt.load %tmp66_45, %tmp0_27, %cst_11 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<f32>> loc(#loc184)
+    %tmp96 = tt.load %tmp0_26, %tmp0_27, %cst_0 evictionPolicy = evict_first : tensor<2x128x!tt.ptr<bf16>> loc(#loc185)
+    %tmp96_47 = arith.extf %tmp96 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc186)
+    %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>> loc(#loc187)
+    %tmp102_48 = tt.addptr %tmp102, %r0_base_17 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc187)
+    %tmp102_49 = tt.load %tmp102_48, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x128x!tt.ptr<bf16>> loc(#loc188)
+    %tmp102_50 = arith.extf %tmp102_49 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc189)
+    %tmp16 = arith.extsi %r0_3 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc190)
+    %tmp16_51 = arith.cmpi slt, %tmp16, %cst_5 : tensor<1x128xi64> loc(#loc190)
+    %tmp17 = arith.muli %r0_4, %cst_6 : tensor<1x128xi32> loc(#loc191)
+    %tmp17_52 = arith.addi %tmp17, %cst_4 : tensor<1x128xi32> loc(#loc192)
+    %tmp17_53 = tt.broadcast %tmp17_52 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc193)
+    %tmp17_54 = arith.addi %tmp17_53, %tmp0_20 : tensor<2x128xi32> loc(#loc193)
+    %tmp17_55 = arith.addi %tmp17_54, %tmp0_23 : tensor<2x128xi32> loc(#loc194)
+    %tmp17_56 = tt.addptr %tmp0_25, %tmp17_55 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc195)
+    %tmp17_57 = arith.andi %r0_mask, %tmp16_51 : tensor<1x128xi1> loc(#loc196)
+    %tmp17_58 = tt.broadcast %tmp17_57 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc197)
+    %tmp17_59 = tt.load %tmp17_56, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>> loc(#loc197)
+    %tmp17_60 = arith.extf %tmp17_59 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc198)
+    %tmp20 = arith.divf %tmp10_36, %cst_3 : tensor<2x1xf32> loc(#loc199)
+    %tmp22 = arith.addf %tmp20, %cst_2 : tensor<2x1xf32> loc(#loc200)
+    %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc201)
+    %tmp24 = tt.broadcast %tmp23 : tensor<2x1xf32> -> tensor<2x128xf32> loc(#loc202)
+    %tmp24_61 = arith.mulf %tmp17_60, %tmp24 : tensor<2x128xf32> loc(#loc202)
+    %tmp25 = tt.addptr %tmp58, %tmp17_52 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc203)
+    %tmp25_62 = tt.broadcast %tmp25 : tensor<1x128x!tt.ptr<bf16>> -> tensor<2x128x!tt.ptr<bf16>> loc(#loc203)
+    %tmp25_63 = tt.load %tmp25_62, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>> loc(#loc204)
+    %tmp25_64 = arith.extf %tmp25_63 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc205)
+    %tmp27 = arith.mulf %tmp24_61, %tmp25_64 : tensor<2x128xf32> loc(#loc206)
+    %tmp29 = arith.subf %cst_11, %tmp27 : tensor<2x128xf32> loc(#loc207)
+    %tmp31 = tt.broadcast %tmp16_51 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc208)
+    %tmp31_65 = arith.select %tmp31, %tmp29, %cst_11 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc208)
+    %tmp32 = arith.cmpi sge, %tmp16, %cst_5 : tensor<1x128xi64> loc(#loc209)
+    %tmp35 = tt.broadcast %tmp17 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc210)
+    %tmp35_66 = arith.addi %tmp35, %tmp0_20 : tensor<2x128xi32> loc(#loc210)
+    %tmp35_67 = arith.addi %tmp35_66, %tmp0_23 : tensor<2x128xi32> loc(#loc211)
+    %tmp35_68 = tt.addptr %tmp0_25, %tmp35_67 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc212)
+    %tmp35_69 = arith.andi %r0_mask, %tmp32 : tensor<1x128xi1> loc(#loc213)
+    %tmp35_70 = tt.broadcast %tmp35_69 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc214)
+    %tmp35_71 = tt.load %tmp35_68, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>> loc(#loc214)
+    %tmp35_72 = arith.extf %tmp35_71 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc215)
+    %tmp42 = arith.mulf %tmp35_72, %tmp24 : tensor<2x128xf32> loc(#loc216)
+    %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc217)
+    %tmp43_73 = tt.broadcast %tmp43 : tensor<1x128x!tt.ptr<bf16>> -> tensor<2x128x!tt.ptr<bf16>> loc(#loc217)
+    %tmp43_74 = tt.load %tmp43_73, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>> loc(#loc218)
+    %tmp43_75 = arith.extf %tmp43_74 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc219)
+    %tmp45 = arith.mulf %tmp42, %tmp43_75 : tensor<2x128xf32> loc(#loc220)
+    %tmp48 = tt.broadcast %tmp32 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc221)
+    %tmp48_76 = arith.select %tmp48, %tmp45, %cst_11 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc221)
+    %tmp49 = arith.select %tmp31, %tmp31_65, %tmp48_76 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc222)
+    %tmp57 = arith.mulf %tmp6_34, %tmp24 : tensor<2x128xf32> loc(#loc223)
+    %tmp60 = tt.broadcast %tmp58_39 : tensor<1x128xf32> -> tensor<2x128xf32> loc(#loc224)
+    %tmp60_77 = arith.mulf %tmp57, %tmp60 : tensor<2x128xf32> loc(#loc224)
+    %tmp64 = arith.mulf %tmp60_77, %tmp63_44 : tensor<2x128xf32> loc(#loc225)
+    %tmp67 = arith.mulf %tmp49, %tmp66_46 : tensor<2x128xf32> loc(#loc226)
+    %tmp68 = arith.addf %tmp64, %tmp67 : tensor<2x128xf32> loc(#loc227)
+    %tmp70 = arith.addi %tmp17, %cst_1 : tensor<1x128xi32> loc(#loc228)
+    %tmp70_78 = tt.broadcast %tmp70 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc229)
+    %tmp70_79 = arith.addi %tmp70_78, %tmp0_20 : tensor<2x128xi32> loc(#loc229)
+    %tmp70_80 = arith.addi %tmp70_79, %tmp0_23 : tensor<2x128xi32> loc(#loc230)
+    %tmp70_81 = tt.addptr %tmp0_25, %tmp70_80 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc231)
+    %tmp70_82 = tt.load %tmp70_81, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>> loc(#loc232)
+    %tmp70_83 = arith.extf %tmp70_82 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc233)
+    %tmp72 = arith.divf %tmp4_35, %cst_3 : tensor<2x1xf32> loc(#loc234)
+    %tmp73 = arith.addf %tmp72, %cst_2 : tensor<2x1xf32> loc(#loc235)
+    %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32>) -> tensor<2x1xf32> loc(#loc236)
+    %tmp75 = tt.broadcast %tmp74 : tensor<2x1xf32> -> tensor<2x128xf32> loc(#loc237)
+    %tmp75_84 = arith.mulf %tmp70_83, %tmp75 : tensor<2x128xf32> loc(#loc237)
+    %tmp76 = tt.addptr %tmp102, %tmp17_52 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc238)
+    %tmp76_85 = tt.broadcast %tmp76 : tensor<1x128x!tt.ptr<bf16>> -> tensor<2x128x!tt.ptr<bf16>> loc(#loc238)
+    %tmp76_86 = tt.load %tmp76_85, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>> loc(#loc239)
+    %tmp76_87 = arith.extf %tmp76_86 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc240)
+    %tmp78 = arith.mulf %tmp75_84, %tmp76_87 : tensor<2x128xf32> loc(#loc241)
+    %tmp80 = arith.subf %cst_11, %tmp78 : tensor<2x128xf32> loc(#loc242)
+    %tmp82 = arith.select %tmp31, %tmp80, %cst_11 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc243)
+    %tmp83 = arith.addi %tmp17, %cst_9 : tensor<1x128xi32> loc(#loc244)
+    %tmp83_88 = tt.broadcast %tmp83 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc245)
+    %tmp83_89 = arith.addi %tmp83_88, %tmp0_20 : tensor<2x128xi32> loc(#loc245)
+    %tmp83_90 = arith.addi %tmp83_89, %tmp0_23 : tensor<2x128xi32> loc(#loc246)
+    %tmp83_91 = tt.addptr %tmp0_25, %tmp83_90 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc247)
+    %tmp83_92 = tt.load %tmp83_91, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>> loc(#loc248)
+    %tmp83_93 = arith.extf %tmp83_92 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc249)
+    %tmp88 = arith.mulf %tmp83_93, %tmp75 : tensor<2x128xf32> loc(#loc250)
+    %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc251)
+    %tmp89_94 = tt.broadcast %tmp89 : tensor<1x128x!tt.ptr<bf16>> -> tensor<2x128x!tt.ptr<bf16>> loc(#loc251)
+    %tmp89_95 = tt.load %tmp89_94, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<2x128x!tt.ptr<bf16>> loc(#loc252)
+    %tmp89_96 = arith.extf %tmp89_95 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc253)
+    %tmp91 = arith.mulf %tmp88, %tmp89_96 : tensor<2x128xf32> loc(#loc254)
+    %tmp94 = arith.select %tmp48, %tmp91, %cst_11 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc255)
+    %tmp95 = arith.select %tmp31, %tmp82, %tmp94 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc256)
+    %tmp101 = arith.mulf %tmp96_47, %tmp75 : tensor<2x128xf32> loc(#loc257)
+    %tmp104 = tt.broadcast %tmp102_50 : tensor<1x128xf32> -> tensor<2x128xf32> loc(#loc258)
+    %tmp104_97 = arith.mulf %tmp101, %tmp104 : tensor<2x128xf32> loc(#loc258)
+    %tmp107 = arith.mulf %tmp104_97, %tmp63_44 : tensor<2x128xf32> loc(#loc259)
+    %tmp109 = arith.mulf %tmp95, %tmp66_46 : tensor<2x128xf32> loc(#loc260)
+    %tmp110 = arith.addf %tmp107, %tmp109 : tensor<2x128xf32> loc(#loc261)
+    %0 = arith.muli %xindex_16, %cst_8 : tensor<2x1xi32> loc(#loc125)
+    %1 = tt.broadcast %0 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc126)
+    %2 = arith.addi %tmp6, %1 : tensor<2x128xi32> loc(#loc126)
+    %3 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>> loc(#loc127)
+    %4 = tt.addptr %3, %2 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc127)
+    %5 = arith.truncf %tmp68 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc128)
+    tt.store %4, %5, %tmp0_27 : tensor<2x128x!tt.ptr<bf16>> loc(#loc128)
+    %6 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>> loc(#loc129)
+    %7 = tt.addptr %6, %2 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc129)
+    %8 = arith.truncf %tmp110 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc130)
+    tt.store %7, %8, %tmp0_27 : tensor<2x128x!tt.ptr<bf16>> loc(#loc130)
+    tt.return loc(#loc131)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc31 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc141 = loc("xoffset"(#loc2))
+#loc142 = loc("xoffset"(#loc3))
+#loc143 = loc("xindex"(#loc4))
+#loc144 = loc("xindex"(#loc5))
+#loc145 = loc("xindex"(#loc6))
+#loc146 = loc("r0_base"(#loc7))
+#loc147 = loc("r0_base"(#loc8))
+#loc148 = loc("x0"(#loc9))
+#loc149 = loc("x1"(#loc10))
+#loc150 = loc("r0_mask"(#loc11))
+#loc151 = loc("tmp0"(#loc12))
+#loc152 = loc("tmp0"(#loc13))
+#loc153 = loc("tmp0"(#loc14))
+#loc154 = loc("tmp0"(#loc15))
+#loc155 = loc("tmp0"(#loc16))
+#loc156 = loc("tmp0"(#loc17))
+#loc157 = loc("tmp0"(#loc18))
+#loc158 = loc("tmp0"(#loc19))
+#loc159 = loc("tmp6"(#loc20))
+#loc160 = loc("tmp6"(#loc21))
+#loc161 = loc("tmp6"(#loc22))
+#loc162 = loc("tmp6"(#loc23))
+#loc163 = loc("tmp6"(#loc24))
+#loc164 = loc("tmp2"(#loc25))
+#loc165 = loc("tmp5"(#loc26))
+#loc166 = loc("_tmp4"(#loc27))
+#loc167 = loc("tmp8"(#loc28))
+#loc168 = loc("tmp11"(#loc29))
+#loc169 = loc("_tmp10"(#loc30))
+#loc171 = loc("tmp4"(#loc34))
+#loc173 = loc("tmp10"(#loc36))
+#loc174 = loc("r0_3"(#loc37))
+#loc175 = loc("r0_4"(#loc38))
+#loc176 = loc("tmp58"(#loc39))
+#loc177 = loc("tmp58"(#loc40))
+#loc178 = loc("tmp58"(#loc41))
+#loc179 = loc("tmp63"(#loc42))
+#loc180 = loc("tmp63"(#loc43))
+#loc181 = loc("tmp63"(#loc44))
+#loc182 = loc("tmp63"(#loc45))
+#loc183 = loc("tmp66"(#loc46))
+#loc184 = loc("tmp66"(#loc47))
+#loc185 = loc("tmp96"(#loc48))
+#loc186 = loc("tmp96"(#loc49))
+#loc187 = loc("tmp102"(#loc50))
+#loc188 = loc("tmp102"(#loc51))
+#loc189 = loc("tmp102"(#loc52))
+#loc190 = loc("tmp16"(#loc53))
+#loc191 = loc("tmp17"(#loc54))
+#loc192 = loc("tmp17"(#loc55))
+#loc193 = loc("tmp17"(#loc56))
+#loc194 = loc("tmp17"(#loc57))
+#loc195 = loc("tmp17"(#loc58))
+#loc196 = loc("tmp17"(#loc59))
+#loc197 = loc("tmp17"(#loc60))
+#loc198 = loc("tmp17"(#loc61))
+#loc199 = loc("tmp20"(#loc62))
+#loc200 = loc("tmp22"(#loc63))
+#loc201 = loc("tmp23"(#loc64))
+#loc202 = loc("tmp24"(#loc65))
+#loc203 = loc("tmp25"(#loc66))
+#loc204 = loc("tmp25"(#loc67))
+#loc205 = loc("tmp25"(#loc68))
+#loc206 = loc("tmp27"(#loc69))
+#loc207 = loc("tmp29"(#loc70))
+#loc208 = loc("tmp31"(#loc71))
+#loc209 = loc("tmp32"(#loc72))
+#loc210 = loc("tmp35"(#loc73))
+#loc211 = loc("tmp35"(#loc74))
+#loc212 = loc("tmp35"(#loc75))
+#loc213 = loc("tmp35"(#loc76))
+#loc214 = loc("tmp35"(#loc77))
+#loc215 = loc("tmp35"(#loc78))
+#loc216 = loc("tmp42"(#loc79))
+#loc217 = loc("tmp43"(#loc80))
+#loc218 = loc("tmp43"(#loc81))
+#loc219 = loc("tmp43"(#loc82))
+#loc220 = loc("tmp45"(#loc83))
+#loc221 = loc("tmp48"(#loc84))
+#loc222 = loc("tmp49"(#loc85))
+#loc223 = loc("tmp57"(#loc86))
+#loc224 = loc("tmp60"(#loc87))
+#loc225 = loc("tmp64"(#loc88))
+#loc226 = loc("tmp67"(#loc89))
+#loc227 = loc("tmp68"(#loc90))
+#loc228 = loc("tmp70"(#loc91))
+#loc229 = loc("tmp70"(#loc92))
+#loc230 = loc("tmp70"(#loc93))
+#loc231 = loc("tmp70"(#loc94))
+#loc232 = loc("tmp70"(#loc95))
+#loc233 = loc("tmp70"(#loc96))
+#loc234 = loc("tmp72"(#loc97))
+#loc235 = loc("tmp73"(#loc98))
+#loc236 = loc("tmp74"(#loc99))
+#loc237 = loc("tmp75"(#loc100))
+#loc238 = loc("tmp76"(#loc101))
+#loc239 = loc("tmp76"(#loc102))
+#loc240 = loc("tmp76"(#loc103))
+#loc241 = loc("tmp78"(#loc104))
+#loc242 = loc("tmp80"(#loc105))
+#loc243 = loc("tmp82"(#loc106))
+#loc244 = loc("tmp83"(#loc107))
+#loc245 = loc("tmp83"(#loc108))
+#loc246 = loc("tmp83"(#loc109))
+#loc247 = loc("tmp83"(#loc110))
+#loc248 = loc("tmp83"(#loc111))
+#loc249 = loc("tmp83"(#loc112))
+#loc250 = loc("tmp88"(#loc113))
+#loc251 = loc("tmp89"(#loc114))
+#loc252 = loc("tmp89"(#loc115))
+#loc253 = loc("tmp89"(#loc116))
+#loc254 = loc("tmp91"(#loc117))
+#loc255 = loc("tmp94"(#loc118))
+#loc256 = loc("tmp95"(#loc119))
+#loc257 = loc("tmp101"(#loc120))
+#loc258 = loc("tmp104"(#loc121))
+#loc259 = loc("tmp107"(#loc122))
+#loc260 = loc("tmp109"(#loc123))
+#loc261 = loc("tmp110"(#loc124))
+#loc262 = loc(callsite(#loc31 at #loc170))
+#loc264 = loc(callsite(#loc31 at #loc172))
+#loc266 = loc(callsite(#loc33 at #loc262))
+#loc267 = loc(callsite(#loc33 at #loc264))
diff --git a/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/__grp__triton_poi_fused_cat_mul_silu_split_view_0.json b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/__grp__triton_poi_fused_cat_mul_silu_split_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..6384c258fc10549d2cc1b20c3800727ad07d4c8c
--- /dev/null
+++ b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/__grp__triton_poi_fused_cat_mul_silu_split_view_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused_cat_mul_silu_split_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.source", "triton_poi_fused_cat_mul_silu_split_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.ttir", "triton_poi_fused_cat_mul_silu_split_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.ttgir", "triton_poi_fused_cat_mul_silu_split_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.llir", "triton_poi_fused_cat_mul_silu_split_view_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.ptx", "triton_poi_fused_cat_mul_silu_split_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.cubin", "triton_poi_fused_cat_mul_silu_split_view_0.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.json"}}
\ No newline at end of file
diff --git a/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.cubin b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..1891ade5ad01b38b9446b11bd4f77b5bd72a7432
Binary files /dev/null and b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.cubin differ
diff --git a/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.json b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..fc26b795ff4db5e20eb6bac332a4e6a2b9182a5c
--- /dev/null
+++ b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.json
@@ -0,0 +1 @@
+{"hash": "05bb912de2e2021470855bb0e701ec375c8ca31df7283c3624079c6a67c9578d", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_cat_mul_silu_split_view_0"}
\ No newline at end of file
diff --git a/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.llir b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..278b73fe5427df4c4bf2ea510e9fa5f944c90135
--- /dev/null
+++ b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.llir
@@ -0,0 +1,214 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused_cat_mul_silu_split_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 {
+  %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %8 = shl i32 %7, 10, !dbg !8
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %10 = shl nuw nsw i32 %9, 3, !dbg !9
+  %11 = and i32 %10, 1016, !dbg !9
+  %12 = or disjoint i32 %11, %8, !dbg !10
+  %13 = sdiv i32 %12, 16384, !dbg !11
+  %14 = mul i32 %13, 16384, !dbg !12
+  %.decomposed = sub i32 %12, %14, !dbg !12
+  %15 = icmp slt i32 %.decomposed, 4096, !dbg !13
+  %16 = shl nsw i32 %13, 12, !dbg !14
+  %17 = add nsw i32 %16, %.decomposed, !dbg !15
+  %18 = sext i32 %17 to i64, !dbg !16
+  %19 = getelementptr bfloat, ptr addrspace(1) %0, i64 %18, !dbg !16
+  %20 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !17
+  %21 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %19, i64 %20, i1 %15) #3, !dbg !17
+  %22 = extractvalue { i32, i32, i32, i32 } %21, 0, !dbg !17
+  %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !17
+  %24 = extractvalue { i32, i32, i32, i32 } %21, 1, !dbg !17
+  %25 = bitcast i32 %24 to <2 x bfloat>, !dbg !17
+  %26 = extractvalue { i32, i32, i32, i32 } %21, 2, !dbg !17
+  %27 = bitcast i32 %26 to <2 x bfloat>, !dbg !17
+  %28 = extractvalue { i32, i32, i32, i32 } %21, 3, !dbg !17
+  %29 = bitcast i32 %28 to <2 x bfloat>, !dbg !17
+  %30 = icmp sgt i32 %.decomposed, 4095, !dbg !18
+  %31 = mul i32 %13, 36864, !dbg !19
+  %32 = add nsw i32 %.decomposed, -4096, !dbg !20
+  %33 = add i32 %31, %32, !dbg !21
+  %34 = sext i32 %33 to i64, !dbg !22
+  %35 = getelementptr bfloat, ptr addrspace(1) %1, i64 %34, !dbg !22
+  %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !23
+  %37 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %35, i64 %36, i1 %30) #3, !dbg !23
+  %38 = extractvalue { i32, i32, i32, i32 } %37, 0, !dbg !23
+  %39 = bitcast i32 %38 to <2 x bfloat>, !dbg !23
+  %40 = extractvalue { i32, i32, i32, i32 } %37, 1, !dbg !23
+  %41 = bitcast i32 %40 to <2 x bfloat>, !dbg !23
+  %42 = extractvalue { i32, i32, i32, i32 } %37, 2, !dbg !23
+  %43 = bitcast i32 %42 to <2 x bfloat>, !dbg !23
+  %44 = extractvalue { i32, i32, i32, i32 } %37, 3, !dbg !23
+  %45 = bitcast i32 %44 to <2 x bfloat>, !dbg !23
+  %46 = add i32 %33, 12288, !dbg !24
+  %47 = sext i32 %46 to i64, !dbg !25
+  %48 = getelementptr bfloat, ptr addrspace(1) %1, i64 %47, !dbg !25
+  %49 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !26
+  %50 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %48, i64 %49, i1 %30) #3, !dbg !26
+  %51 = extractvalue { i32, i32, i32, i32 } %50, 0, !dbg !26
+  %52 = bitcast i32 %51 to <2 x bfloat>, !dbg !26
+  %53 = extractvalue { i32, i32, i32, i32 } %50, 1, !dbg !26
+  %54 = bitcast i32 %53 to <2 x bfloat>, !dbg !26
+  %55 = extractvalue { i32, i32, i32, i32 } %50, 2, !dbg !26
+  %56 = bitcast i32 %55 to <2 x bfloat>, !dbg !26
+  %57 = extractvalue { i32, i32, i32, i32 } %50, 3, !dbg !26
+  %58 = bitcast i32 %57 to <2 x bfloat>, !dbg !26
+  %59 = sext i32 %12 to i64, !dbg !27
+  %60 = getelementptr bfloat, ptr addrspace(1) %2, i64 %59, !dbg !27
+  %61 = fpext <2 x bfloat> %39 to <2 x float>, !dbg !28
+  %62 = extractelement <2 x float> %61, i64 0, !dbg !29
+  %63 = fsub float 0.000000e+00, %62, !dbg !29
+  %64 = extractelement <2 x float> %61, i64 1, !dbg !29
+  %65 = fsub float 0.000000e+00, %64, !dbg !29
+  %66 = fmul float %63, 0x3FF7154760000000, !dbg !34
+  %67 = tail call float @llvm.nvvm.ex2.approx.f(float %66), !dbg !34
+  %68 = fmul float %65, 0x3FF7154760000000, !dbg !34
+  %69 = tail call float @llvm.nvvm.ex2.approx.f(float %68), !dbg !34
+  %70 = fadd float %67, 1.000000e+00, !dbg !35
+  %71 = fadd float %69, 1.000000e+00, !dbg !35
+  %72 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %70), !dbg !36
+  %73 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %71), !dbg !36
+  %74 = insertelement <2 x float> poison, float %72, i64 0, !dbg !37
+  %75 = insertelement <2 x float> %74, float %73, i64 1, !dbg !37
+  %76 = fmul <2 x float> %75, %61, !dbg !37
+  %77 = fpext <2 x bfloat> %52 to <2 x float>, !dbg !38
+  %78 = fmul <2 x float> %76, %77, !dbg !39
+  %79 = fptrunc <2 x float> %78 to <2 x bfloat>, !dbg !40
+  %80 = insertelement <2 x i1> poison, i1 %15, i64 0, !dbg !41
+  %81 = shufflevector <2 x i1> %80, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !41
+  %82 = select <2 x i1> %81, <2 x bfloat> %23, <2 x bfloat> %79, !dbg !41
+  %83 = fpext <2 x bfloat> %41 to <2 x float>, !dbg !28
+  %84 = extractelement <2 x float> %83, i64 0, !dbg !29
+  %85 = fsub float 0.000000e+00, %84, !dbg !29
+  %86 = extractelement <2 x float> %83, i64 1, !dbg !29
+  %87 = fsub float 0.000000e+00, %86, !dbg !29
+  %88 = fmul float %85, 0x3FF7154760000000, !dbg !34
+  %89 = tail call float @llvm.nvvm.ex2.approx.f(float %88), !dbg !34
+  %90 = fmul float %87, 0x3FF7154760000000, !dbg !34
+  %91 = tail call float @llvm.nvvm.ex2.approx.f(float %90), !dbg !34
+  %92 = fadd float %89, 1.000000e+00, !dbg !35
+  %93 = fadd float %91, 1.000000e+00, !dbg !35
+  %94 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %92), !dbg !36
+  %95 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %93), !dbg !36
+  %96 = insertelement <2 x float> poison, float %94, i64 0, !dbg !37
+  %97 = insertelement <2 x float> %96, float %95, i64 1, !dbg !37
+  %98 = fmul <2 x float> %97, %83, !dbg !37
+  %99 = fpext <2 x bfloat> %54 to <2 x float>, !dbg !38
+  %100 = fmul <2 x float> %98, %99, !dbg !39
+  %101 = fptrunc <2 x float> %100 to <2 x bfloat>, !dbg !40
+  %102 = select <2 x i1> %81, <2 x bfloat> %25, <2 x bfloat> %101, !dbg !41
+  %103 = fpext <2 x bfloat> %43 to <2 x float>, !dbg !28
+  %104 = extractelement <2 x float> %103, i64 0, !dbg !29
+  %105 = fsub float 0.000000e+00, %104, !dbg !29
+  %106 = extractelement <2 x float> %103, i64 1, !dbg !29
+  %107 = fsub float 0.000000e+00, %106, !dbg !29
+  %108 = fmul float %105, 0x3FF7154760000000, !dbg !34
+  %109 = tail call float @llvm.nvvm.ex2.approx.f(float %108), !dbg !34
+  %110 = fmul float %107, 0x3FF7154760000000, !dbg !34
+  %111 = tail call float @llvm.nvvm.ex2.approx.f(float %110), !dbg !34
+  %112 = fadd float %109, 1.000000e+00, !dbg !35
+  %113 = fadd float %111, 1.000000e+00, !dbg !35
+  %114 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %112), !dbg !36
+  %115 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %113), !dbg !36
+  %116 = insertelement <2 x float> poison, float %114, i64 0, !dbg !37
+  %117 = insertelement <2 x float> %116, float %115, i64 1, !dbg !37
+  %118 = fmul <2 x float> %117, %103, !dbg !37
+  %119 = fpext <2 x bfloat> %56 to <2 x float>, !dbg !38
+  %120 = fmul <2 x float> %118, %119, !dbg !39
+  %121 = fptrunc <2 x float> %120 to <2 x bfloat>, !dbg !40
+  %122 = select <2 x i1> %81, <2 x bfloat> %27, <2 x bfloat> %121, !dbg !41
+  %123 = fpext <2 x bfloat> %45 to <2 x float>, !dbg !28
+  %124 = extractelement <2 x float> %123, i64 0, !dbg !29
+  %125 = fsub float 0.000000e+00, %124, !dbg !29
+  %126 = extractelement <2 x float> %123, i64 1, !dbg !29
+  %127 = fsub float 0.000000e+00, %126, !dbg !29
+  %128 = fmul float %125, 0x3FF7154760000000, !dbg !34
+  %129 = tail call float @llvm.nvvm.ex2.approx.f(float %128), !dbg !34
+  %130 = fmul float %127, 0x3FF7154760000000, !dbg !34
+  %131 = tail call float @llvm.nvvm.ex2.approx.f(float %130), !dbg !34
+  %132 = fadd float %129, 1.000000e+00, !dbg !35
+  %133 = fadd float %131, 1.000000e+00, !dbg !35
+  %134 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %132), !dbg !36
+  %135 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %133), !dbg !36
+  %136 = insertelement <2 x float> poison, float %134, i64 0, !dbg !37
+  %137 = insertelement <2 x float> %136, float %135, i64 1, !dbg !37
+  %138 = fmul <2 x float> %137, %123, !dbg !37
+  %139 = fpext <2 x bfloat> %58 to <2 x float>, !dbg !38
+  %140 = fmul <2 x float> %138, %139, !dbg !39
+  %141 = fptrunc <2 x float> %140 to <2 x bfloat>, !dbg !40
+  %142 = select <2 x i1> %81, <2 x bfloat> %29, <2 x bfloat> %141, !dbg !41
+  %143 = bitcast <2 x bfloat> %82 to i32, !dbg !40
+  %144 = bitcast <2 x bfloat> %102 to i32, !dbg !40
+  %145 = bitcast <2 x bfloat> %122 to i32, !dbg !40
+  %146 = bitcast <2 x bfloat> %142 to i32, !dbg !40
+  tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %143, i32 %144, i32 %145, i32 %146, ptr addrspace(1) %60) #3, !dbg !40
+  ret void, !dbg !42
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.ex2.approx.f(float) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #2
+
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #3 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused_cat_mul_silu_split_view_0", linkageName: "triton_poi_fused_cat_mul_silu_split_view_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 20, column: 28, scope: !4)
+!8 = !DILocation(line: 20, column: 33, scope: !4)
+!9 = !DILocation(line: 21, column: 36, scope: !4)
+!10 = !DILocation(line: 21, column: 23, scope: !4)
+!11 = !DILocation(line: 24, column: 19, scope: !4)
+!12 = !DILocation(line: 23, column: 19, scope: !4)
+!13 = !DILocation(line: 30, column: 18, scope: !4)
+!14 = !DILocation(line: 31, column: 35, scope: !4)
+!15 = !DILocation(line: 31, column: 41, scope: !4)
+!16 = !DILocation(line: 31, column: 30, scope: !4)
+!17 = !DILocation(line: 31, column: 47, scope: !4)
+!18 = !DILocation(line: 32, column: 19, scope: !4)
+!19 = !DILocation(line: 35, column: 36, scope: !4)
+!20 = !DILocation(line: 35, column: 52, scope: !4)
+!21 = !DILocation(line: 35, column: 42, scope: !4)
+!22 = !DILocation(line: 35, column: 30, scope: !4)
+!23 = !DILocation(line: 35, column: 58, scope: !4)
+!24 = !DILocation(line: 40, column: 51, scope: !4)
+!25 = !DILocation(line: 40, column: 31, scope: !4)
+!26 = !DILocation(line: 40, column: 67, scope: !4)
+!27 = !DILocation(line: 45, column: 25, scope: !4)
+!28 = !DILocation(line: 35, column: 108, scope: !4)
+!29 = !DILocation(line: 50, column: 30, scope: !30, inlinedAt: !32)
+!30 = distinct !DILexicalBlockFile(scope: !4, file: !31, discriminator: 0)
+!31 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!32 = !DILocation(line: 37, column: 23, scope: !33)
+!33 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0)
+!34 = !DILocation(line: 50, column: 29, scope: !30, inlinedAt: !32)
+!35 = !DILocation(line: 50, column: 20, scope: !30, inlinedAt: !32)
+!36 = !DILocation(line: 50, column: 16, scope: !30, inlinedAt: !32)
+!37 = !DILocation(line: 38, column: 20, scope: !4)
+!38 = !DILocation(line: 40, column: 117, scope: !4)
+!39 = !DILocation(line: 41, column: 20, scope: !4)
+!40 = !DILocation(line: 45, column: 37, scope: !4)
+!41 = !DILocation(line: 44, column: 33, scope: !4)
+!42 = !DILocation(line: 45, column: 4, scope: !4)
diff --git a/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.ptx b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..cfa287e4d55b3317c15267f3e39fa70f6ad45d11
--- /dev/null
+++ b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.ptx
@@ -0,0 +1,613 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused_cat_mul_silu_split_view_0 // -- Begin function triton_poi_fused_cat_mul_silu_split_view_0
+                                        // @triton_poi_fused_cat_mul_silu_split_view_0
+.visible .entry triton_poi_fused_cat_mul_silu_split_view_0(
+	.param .u64 .ptr .global .align 1 triton_poi_fused_cat_mul_silu_split_view_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_cat_mul_silu_split_view_0_param_1,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_cat_mul_silu_split_view_0_param_2,
+	.param .u32 triton_poi_fused_cat_mul_silu_split_view_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_cat_mul_silu_split_view_0_param_4,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_cat_mul_silu_split_view_0_param_5
+)
+.reqntid 128
+{
+	.reg .pred 	%p<3>;
+	.reg .b16 	%rs<41>;
+	.reg .b32 	%r<109>;
+	.reg .b64 	%rd<11>;
+	.loc	1 18 0                          // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd8, [triton_poi_fused_cat_mul_silu_split_view_0_param_0];
+	ld.param.b64 	%rd9, [triton_poi_fused_cat_mul_silu_split_view_0_param_1];
+$L__tmp0:
+	.loc	1 20 28                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:20:28
+	mov.u32 	%r18, %ctaid.x;
+	.loc	1 20 33                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:20:33
+	shl.b32 	%r19, %r18, 10;
+	ld.param.b64 	%rd10, [triton_poi_fused_cat_mul_silu_split_view_0_param_2];
+	.loc	1 21 36                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:21:36
+	mov.u32 	%r20, %tid.x;
+	shl.b32 	%r21, %r20, 3;
+	and.b32 	%r22, %r21, 1016;
+	.loc	1 21 23                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:21:23
+	or.b32 	%r23, %r22, %r19;
+	.loc	1 24 19                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:24:19
+	bfe.s32 	%r24, %r18, 21, 1;
+	shr.u32 	%r25, %r24, 18;
+	add.s32 	%r26, %r23, %r25;
+	shr.s32 	%r27, %r26, 14;
+	.loc	1 23 19                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:23:19
+	and.b32 	%r28, %r26, -16384;
+	sub.s32 	%r29, %r23, %r28;
+	.loc	1 30 18                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:30:18
+	setp.lt.s32 	%p1, %r29, 4096;
+	.loc	1 31 35                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:31:35
+	shl.b32 	%r30, %r27, 12;
+	.loc	1 31 41                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:31:41
+	add.s32 	%r31, %r30, %r29;
+	.loc	1 31 30                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:31:30
+	mad.wide.s32 	%rd1, %r31, 2, %rd8;
+	.loc	1 31 47                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:31:47
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r5, 0;
+	// begin inline asm
+	mov.u32 %r1, %r5;
+	mov.u32 %r2, %r5;
+	mov.u32 %r3, %r5;
+	mov.u32 %r4, %r5;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	.loc	1 32 19                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:32:19
+	setp.gt.s32 	%p2, %r29, 4095;
+	.loc	1 35 52                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:52
+	mad.lo.s32 	%r32, %r27, 36864, %r29;
+	.loc	1 35 42                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:42
+	add.s32 	%r33, %r32, -4096;
+	.loc	1 35 30                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:30
+	mad.wide.s32 	%rd3, %r33, 2, %rd9;
+	.loc	1 35 58                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:58
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r6, %r5;
+	mov.u32 %r7, %r5;
+	mov.u32 %r8, %r5;
+	mov.u32 %r9, %r5;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd3 + 0 ], %rd4;
+	// end inline asm
+	.loc	1 40 51                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:40:51
+	add.s32 	%r34, %r32, 8192;
+	.loc	1 40 31                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:40:31
+	mad.wide.s32 	%rd5, %r34, 2, %rd9;
+	.loc	1 40 67                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:40:67
+	// begin inline asm
+	mov.u64 %rd6, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r10, %r5;
+	mov.u32 %r11, %r5;
+	mov.u32 %r12, %r5;
+	mov.u32 %r13, %r5;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd5 + 0 ], %rd6;
+	// end inline asm
+	.loc	1 45 25                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:45:25
+	mad.wide.s32 	%rd7, %r23, 2, %rd10;
+	.loc	1 35 108                        // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:108
+	mov.b32 	{%rs1, %rs2}, %r6;
+	cvt.f32.bf16 	%r35, %rs2;
+	cvt.f32.bf16 	%r36, %rs1;
+	mov.b32 	%r37, 0f00000000;
+$L__tmp1:
+	.loc	2 50 30                         // standard.py:50:30 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ]
+	sub.f32 	%r38, %r37, %r36;
+	sub.f32 	%r39, %r37, %r35;
+	.loc	2 50 29                         // standard.py:50:29 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ]
+	mul.f32 	%r40, %r38, 0f3FB8AA3B;
+	ex2.approx.f32 	%r41, %r40;
+	mul.f32 	%r42, %r39, 0f3FB8AA3B;
+	ex2.approx.f32 	%r43, %r42;
+	.loc	2 50 20                         // standard.py:50:20 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ]
+	add.f32 	%r44, %r41, 0f3F800000;
+	add.f32 	%r45, %r43, 0f3F800000;
+	mov.b32 	%r46, 0f3F800000;
+	.loc	2 50 16                         // standard.py:50:16 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ]
+	div.full.f32 	%r47, %r46, %r44;
+	div.full.f32 	%r48, %r46, %r45;
+$L__tmp2:
+	.loc	1 38 20                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:38:20
+	mul.f32 	%r49, %r47, %r36;
+	mul.f32 	%r50, %r48, %r35;
+	.loc	1 40 117                        // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:40:117
+	mov.b32 	{%rs3, %rs4}, %r10;
+	cvt.f32.bf16 	%r51, %rs3;
+	cvt.f32.bf16 	%r52, %rs4;
+	.loc	1 41 20                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:41:20
+	mul.f32 	%r53, %r50, %r52;
+	mul.f32 	%r54, %r49, %r51;
+	.loc	1 45 37                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:45:37
+	cvt.rn.bf16.f32 	%rs5, %r54;
+	cvt.rn.bf16.f32 	%rs6, %r53;
+	.loc	1 44 33                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:44:33
+	mov.b32 	{%rs7, %rs8}, %r1;
+	selp.b16 	%rs9, %rs8, %rs6, %p1;
+	selp.b16 	%rs10, %rs7, %rs5, %p1;
+	mov.b32 	%r14, {%rs10, %rs9};
+	.loc	1 35 108                        // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:108
+	mov.b32 	{%rs11, %rs12}, %r7;
+	cvt.f32.bf16 	%r55, %rs12;
+	cvt.f32.bf16 	%r56, %rs11;
+$L__tmp3:
+	.loc	2 50 30                         // standard.py:50:30 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ]
+	sub.f32 	%r57, %r37, %r56;
+	sub.f32 	%r58, %r37, %r55;
+	.loc	2 50 29                         // standard.py:50:29 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ]
+	mul.f32 	%r59, %r57, 0f3FB8AA3B;
+	ex2.approx.f32 	%r60, %r59;
+	mul.f32 	%r61, %r58, 0f3FB8AA3B;
+	ex2.approx.f32 	%r62, %r61;
+	.loc	2 50 20                         // standard.py:50:20 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ]
+	add.f32 	%r63, %r60, 0f3F800000;
+	add.f32 	%r64, %r62, 0f3F800000;
+	.loc	2 50 16                         // standard.py:50:16 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ]
+	div.full.f32 	%r65, %r46, %r63;
+	div.full.f32 	%r66, %r46, %r64;
+$L__tmp4:
+	.loc	1 38 20                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:38:20
+	mul.f32 	%r67, %r65, %r56;
+	mul.f32 	%r68, %r66, %r55;
+	.loc	1 40 117                        // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:40:117
+	mov.b32 	{%rs13, %rs14}, %r11;
+	cvt.f32.bf16 	%r69, %rs13;
+	cvt.f32.bf16 	%r70, %rs14;
+	.loc	1 41 20                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:41:20
+	mul.f32 	%r71, %r68, %r70;
+	mul.f32 	%r72, %r67, %r69;
+	.loc	1 45 37                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:45:37
+	cvt.rn.bf16.f32 	%rs15, %r72;
+	cvt.rn.bf16.f32 	%rs16, %r71;
+	.loc	1 44 33                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:44:33
+	mov.b32 	{%rs17, %rs18}, %r2;
+	selp.b16 	%rs19, %rs18, %rs16, %p1;
+	selp.b16 	%rs20, %rs17, %rs15, %p1;
+	mov.b32 	%r15, {%rs20, %rs19};
+	.loc	1 35 108                        // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:108
+	mov.b32 	{%rs21, %rs22}, %r8;
+	cvt.f32.bf16 	%r73, %rs22;
+	cvt.f32.bf16 	%r74, %rs21;
+$L__tmp5:
+	.loc	2 50 30                         // standard.py:50:30 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ]
+	sub.f32 	%r75, %r37, %r74;
+	sub.f32 	%r76, %r37, %r73;
+	.loc	2 50 29                         // standard.py:50:29 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ]
+	mul.f32 	%r77, %r75, 0f3FB8AA3B;
+	ex2.approx.f32 	%r78, %r77;
+	mul.f32 	%r79, %r76, 0f3FB8AA3B;
+	ex2.approx.f32 	%r80, %r79;
+	.loc	2 50 20                         // standard.py:50:20 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ]
+	add.f32 	%r81, %r78, 0f3F800000;
+	add.f32 	%r82, %r80, 0f3F800000;
+	.loc	2 50 16                         // standard.py:50:16 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ]
+	div.full.f32 	%r83, %r46, %r81;
+	div.full.f32 	%r84, %r46, %r82;
+$L__tmp6:
+	.loc	1 38 20                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:38:20
+	mul.f32 	%r85, %r83, %r74;
+	mul.f32 	%r86, %r84, %r73;
+	.loc	1 40 117                        // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:40:117
+	mov.b32 	{%rs23, %rs24}, %r12;
+	cvt.f32.bf16 	%r87, %rs23;
+	cvt.f32.bf16 	%r88, %rs24;
+	.loc	1 41 20                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:41:20
+	mul.f32 	%r89, %r86, %r88;
+	mul.f32 	%r90, %r85, %r87;
+	.loc	1 45 37                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:45:37
+	cvt.rn.bf16.f32 	%rs25, %r90;
+	cvt.rn.bf16.f32 	%rs26, %r89;
+	.loc	1 44 33                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:44:33
+	mov.b32 	{%rs27, %rs28}, %r3;
+	selp.b16 	%rs29, %rs28, %rs26, %p1;
+	selp.b16 	%rs30, %rs27, %rs25, %p1;
+	mov.b32 	%r16, {%rs30, %rs29};
+	.loc	1 35 108                        // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:108
+	mov.b32 	{%rs31, %rs32}, %r9;
+	cvt.f32.bf16 	%r91, %rs32;
+	cvt.f32.bf16 	%r92, %rs31;
+$L__tmp7:
+	.loc	2 50 30                         // standard.py:50:30 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ]
+	sub.f32 	%r93, %r37, %r92;
+	sub.f32 	%r94, %r37, %r91;
+	.loc	2 50 29                         // standard.py:50:29 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ]
+	mul.f32 	%r95, %r93, 0f3FB8AA3B;
+	ex2.approx.f32 	%r96, %r95;
+	mul.f32 	%r97, %r94, 0f3FB8AA3B;
+	ex2.approx.f32 	%r98, %r97;
+	.loc	2 50 20                         // standard.py:50:20 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ]
+	add.f32 	%r99, %r96, 0f3F800000;
+	add.f32 	%r100, %r98, 0f3F800000;
+	.loc	2 50 16                         // standard.py:50:16 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ]
+	div.full.f32 	%r101, %r46, %r99;
+	div.full.f32 	%r102, %r46, %r100;
+$L__tmp8:
+	.loc	1 38 20                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:38:20
+	mul.f32 	%r103, %r101, %r92;
+	mul.f32 	%r104, %r102, %r91;
+	.loc	1 40 117                        // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:40:117
+	mov.b32 	{%rs33, %rs34}, %r13;
+	cvt.f32.bf16 	%r105, %rs33;
+	cvt.f32.bf16 	%r106, %rs34;
+	.loc	1 41 20                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:41:20
+	mul.f32 	%r107, %r104, %r106;
+	mul.f32 	%r108, %r103, %r105;
+	.loc	1 45 37                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:45:37
+	cvt.rn.bf16.f32 	%rs35, %r108;
+	cvt.rn.bf16.f32 	%rs36, %r107;
+	.loc	1 44 33                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:44:33
+	mov.b32 	{%rs37, %rs38}, %r4;
+	selp.b16 	%rs39, %rs38, %rs36, %p1;
+	selp.b16 	%rs40, %rs37, %rs35, %p1;
+	mov.b32 	%r17, {%rs40, %rs39};
+	.loc	1 45 37                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:45:37
+	// begin inline asm
+	st.global.v4.b32 [ %rd7 + 0 ], { %r14, %r15, %r16, %r17 };
+	// end inline asm
+	.loc	1 45 4                          // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:45:4
+	ret;
+$L__tmp9:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 316                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x135 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 51
+.b8 105
+.b8 109
+.b8 121
+.b8 102
+.b8 105
+.b8 98
+.b8 99
+.b8 113
+.b8 51
+.b8 122
+.b8 119
+.b8 114
+.b8 99
+.b8 53
+.b8 103
+.b8 118
+.b8 102
+.b8 115
+.b8 99
+.b8 118
+.b8 112
+.b8 115
+.b8 97
+.b8 120
+.b8 100
+.b8 122
+.b8 106
+.b8 105
+.b8 106
+.b8 121
+.b8 109
+.b8 114
+.b8 110
+.b8 116
+.b8 50
+.b8 108
+.b8 102
+.b8 97
+.b8 104
+.b8 116
+.b8 114
+.b8 106
+.b8 109
+.b8 114
+.b8 98
+.b8 116
+.b8 108
+.b8 109
+.b8 104
+.b8 101
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 51
+.b8 105
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x2d DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 112
+.b8 111
+.b8 105
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 116
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 115
+.b8 105
+.b8 108
+.b8 117
+.b8 95
+.b8 115
+.b8 112
+.b8 108
+.b8 105
+.b8 116
+.b8 95
+.b8 118
+.b8 105
+.b8 101
+.b8 119
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x111:0x2e DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x126:0x18 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp8                           // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 23                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.source b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..5e32453a051384c371add5991fb7aba58655fac7
--- /dev/null
+++ b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.source
@@ -0,0 +1,212 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":18:0)
+#loc43 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":49:0)
+#loc50 = loc("in_ptr0"(#loc))
+#loc51 = loc("in_ptr1"(#loc))
+#loc52 = loc("out_ptr0"(#loc))
+#loc53 = loc("xnumel"(#loc))
+#loc93 = loc("x"(#loc43))
+module {
+  tt.func public @triton_poi_fused_cat_mul_silu_split_view_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 37748736 : i32 loc(#loc54)
+    %xoffset = tt.get_program_id x : i32 loc(#loc55)
+    %xoffset_1 = arith.constant 1024 : i32 loc(#loc56)
+    %xoffset_2 = arith.constant 1024 : i32 loc(#loc56)
+    %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc56)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc57)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc58)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc58)
+    %xmask = arith.constant true loc(#loc59)
+    %xmask_6 = arith.constant dense<true> : tensor<1024xi1> loc(#loc59)
+    %x0 = arith.constant 16384 : i32 loc(#loc60)
+    %x0_7 = arith.constant 16384 : i32 loc(#loc60)
+    %x0_8 = arith.constant dense<16384> : tensor<1024xi32> loc(#loc60)
+    %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<1024xi32> loc(#loc60)
+    %x1 = arith.constant 16384 : i32 loc(#loc61)
+    %x1_10 = arith.constant 16384 : i32 loc(#loc61)
+    %x1_11 = arith.constant dense<16384> : tensor<1024xi32> loc(#loc61)
+    %x1_12 = arith.divsi %xindex_5, %x1_11 : tensor<1024xi32> loc(#loc61)
+    %tmp1 = arith.constant 0 : i64 loc(#loc62)
+    %tmp1_13 = arith.constant dense<0> : tensor<1xi64> loc(#loc62)
+    %tmp2 = arith.extsi %x0_9 : tensor<1024xi32> to tensor<1024xi64> loc(#loc63)
+    %tmp2_14 = arith.constant dense<0> : tensor<1024xi64> loc(#loc63)
+    %tmp2_15 = arith.cmpi sge, %tmp2, %tmp2_14 : tensor<1024xi64> loc(#loc63)
+    %tmp3 = arith.constant 4096 : i64 loc(#loc64)
+    %tmp3_16 = arith.constant dense<4096> : tensor<1xi64> loc(#loc64)
+    %tmp4 = arith.extsi %x0_9 : tensor<1024xi32> to tensor<1024xi64> loc(#loc65)
+    %tmp4_17 = arith.constant dense<4096> : tensor<1024xi64> loc(#loc65)
+    %tmp4_18 = arith.cmpi slt, %tmp4, %tmp4_17 : tensor<1024xi64> loc(#loc65)
+    %tmp5 = arith.constant 4096 : i32 loc(#loc66)
+    %tmp5_19 = arith.constant 4096 : i32 loc(#loc66)
+    %tmp5_20 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc66)
+    %tmp5_21 = arith.muli %tmp5_20, %x1_12 : tensor<1024xi32> loc(#loc66)
+    %tmp5_22 = arith.addi %tmp5_21, %x0_9 : tensor<1024xi32> loc(#loc67)
+    %tmp5_23 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc68)
+    %tmp5_24 = tt.addptr %tmp5_23, %tmp5_22 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc68)
+    %tmp5_25 = arith.constant 0.000000e+00 : f32 loc(#loc69)
+    %tmp5_26 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc69)
+    %tmp5_27 = arith.truncf %tmp5_26 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc69)
+    %tmp5_28 = tt.load %tmp5_24, %tmp4_18, %tmp5_27 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc69)
+    %tmp5_29 = arith.extf %tmp5_28 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc70)
+    %tmp6 = arith.extsi %x0_9 : tensor<1024xi32> to tensor<1024xi64> loc(#loc71)
+    %tmp6_30 = arith.constant dense<4096> : tensor<1024xi64> loc(#loc71)
+    %tmp6_31 = arith.cmpi sge, %tmp6, %tmp6_30 : tensor<1024xi64> loc(#loc71)
+    %tmp7 = arith.constant 16384 : i64 loc(#loc72)
+    %tmp7_32 = arith.constant dense<16384> : tensor<1xi64> loc(#loc72)
+    %tmp8 = arith.extsi %x0_9 : tensor<1024xi32> to tensor<1024xi64> loc(#loc73)
+    %tmp8_33 = arith.constant dense<16384> : tensor<1024xi64> loc(#loc73)
+    %tmp8_34 = arith.cmpi slt, %tmp8, %tmp8_33 : tensor<1024xi64> loc(#loc73)
+    %tmp9 = arith.constant 36864 : i32 loc(#loc74)
+    %tmp9_35 = arith.constant 36864 : i32 loc(#loc74)
+    %tmp9_36 = arith.constant dense<36864> : tensor<1024xi32> loc(#loc74)
+    %tmp9_37 = arith.muli %tmp9_36, %x1_12 : tensor<1024xi32> loc(#loc74)
+    %tmp9_38 = arith.constant -4096 : i32 loc(#loc75)
+    %tmp9_39 = arith.constant -4096 : i32 loc(#loc75)
+    %tmp9_40 = arith.constant dense<-4096> : tensor<1024xi32> loc(#loc75)
+    %tmp9_41 = arith.addi %tmp9_40, %x0_9 : tensor<1024xi32> loc(#loc75)
+    %tmp9_42 = arith.addi %tmp9_37, %tmp9_41 : tensor<1024xi32> loc(#loc76)
+    %tmp9_43 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc77)
+    %tmp9_44 = tt.addptr %tmp9_43, %tmp9_42 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc77)
+    %tmp9_45 = arith.constant 0.000000e+00 : f32 loc(#loc78)
+    %tmp9_46 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc78)
+    %tmp9_47 = arith.truncf %tmp9_46 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc78)
+    %tmp9_48 = tt.load %tmp9_44, %tmp6_31, %tmp9_47 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc78)
+    %tmp9_49 = arith.extf %tmp9_48 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc79)
+    %tmp11 = tt.call @triton.language.standard.sigmoid__fp32S1024S__(%tmp9_49) : (tensor<1024xf32>) -> tensor<1024xf32> loc(#loc80)
+    %tmp12 = arith.mulf %tmp9_49, %tmp11 : tensor<1024xf32> loc(#loc81)
+    %tmp14 = arith.constant 36864 : i32 loc(#loc82)
+    %tmp14_50 = arith.constant 36864 : i32 loc(#loc82)
+    %tmp14_51 = arith.constant dense<36864> : tensor<1024xi32> loc(#loc82)
+    %tmp14_52 = arith.muli %tmp14_51, %x1_12 : tensor<1024xi32> loc(#loc82)
+    %tmp14_53 = arith.constant 12288 : i32 loc(#loc83)
+    %tmp14_54 = arith.constant 12288 : i32 loc(#loc83)
+    %tmp14_55 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc83)
+    %tmp14_56 = arith.addi %tmp14_55, %tmp14_52 : tensor<1024xi32> loc(#loc83)
+    %tmp14_57 = arith.constant -4096 : i32 loc(#loc84)
+    %tmp14_58 = arith.constant -4096 : i32 loc(#loc84)
+    %tmp14_59 = arith.constant dense<-4096> : tensor<1024xi32> loc(#loc84)
+    %tmp14_60 = arith.addi %tmp14_59, %x0_9 : tensor<1024xi32> loc(#loc84)
+    %tmp14_61 = arith.addi %tmp14_56, %tmp14_60 : tensor<1024xi32> loc(#loc85)
+    %tmp14_62 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc86)
+    %tmp14_63 = tt.addptr %tmp14_62, %tmp14_61 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc86)
+    %tmp14_64 = arith.constant 0.000000e+00 : f32 loc(#loc87)
+    %tmp14_65 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc87)
+    %tmp14_66 = arith.truncf %tmp14_65 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc87)
+    %tmp14_67 = tt.load %tmp14_63, %tmp6_31, %tmp14_66 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc87)
+    %tmp14_68 = arith.extf %tmp14_67 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc88)
+    %tmp15 = arith.mulf %tmp12, %tmp14_68 : tensor<1024xf32> loc(#loc89)
+    %tmp16 = arith.constant 0.000000e+00 : f32 loc(#loc90)
+    %tmp16_69 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc90)
+    %tmp17 = arith.select %tmp6_31, %tmp15, %tmp16_69 : tensor<1024xi1>, tensor<1024xf32> loc(#loc91)
+    %tmp18 = arith.select %tmp4_18, %tmp5_29, %tmp17 : tensor<1024xi1>, tensor<1024xf32> loc(#loc92)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc40)
+    %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc40)
+    %2 = arith.truncf %tmp18 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc41)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>> loc(#loc41)
+    tt.return loc(#loc42)
+  } loc(#loc)
+  tt.func private @triton.language.standard.sigmoid__fp32S1024S__(%x: tensor<1024xf32> loc("x"(#loc43))) -> tensor<1024xf32> attributes {noinline = false} {
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc44)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc44)
+    %0 = arith.subf %cst_0, %x : tensor<1024xf32> loc(#loc44)
+    %1 = math.exp %0 : tensor<1024xf32> loc(#loc45)
+    %c1_i32 = arith.constant 1 : i32 loc(#loc46)
+    %cst_1 = arith.constant 1.000000e+00 : f32 loc(#loc46)
+    %cst_2 = arith.constant dense<1.000000e+00> : tensor<1024xf32> loc(#loc46)
+    %2 = arith.addf %cst_2, %1 : tensor<1024xf32> loc(#loc46)
+    %c1_i32_3 = arith.constant 1 : i32 loc(#loc47)
+    %cst_4 = arith.constant 1.000000e+00 : f32 loc(#loc47)
+    %cst_5 = arith.constant dense<1.000000e+00> : tensor<1024xf32> loc(#loc47)
+    %3 = arith.divf %cst_5, %2 : tensor<1024xf32> loc(#loc47)
+    tt.return %3 : tensor<1024xf32> loc(#loc48)
+  ^bb1:  // no predecessors
+    %4 = ub.poison : tensor<1024xf32> loc(#loc49)
+    tt.return %4 : tensor<1024xf32> loc(#loc49)
+  } loc(#loc43)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":22:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":23:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":24:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":27:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":28:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":29:30)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":30:18)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:35)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:41)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:30)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:47)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:97)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":32:19)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":33:31)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":34:18)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:36)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:52)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:42)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:30)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:58)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:108)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":37:23)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":38:20)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:45)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:39)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:61)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:51)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:31)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:67)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:117)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":41:20)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":42:38)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":43:34)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":44:33)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:25)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:37)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:4)
+#loc44 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30)
+#loc45 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29)
+#loc46 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20)
+#loc47 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16)
+#loc48 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:11)
+#loc49 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:4)
+#loc54 = loc("xnumel"(#loc1))
+#loc55 = loc("xoffset"(#loc2))
+#loc56 = loc("xoffset"(#loc3))
+#loc57 = loc("xindex"(#loc4))
+#loc58 = loc("xindex"(#loc5))
+#loc59 = loc("xmask"(#loc6))
+#loc60 = loc("x0"(#loc7))
+#loc61 = loc("x1"(#loc8))
+#loc62 = loc("tmp1"(#loc9))
+#loc63 = loc("tmp2"(#loc10))
+#loc64 = loc("tmp3"(#loc11))
+#loc65 = loc("tmp4"(#loc12))
+#loc66 = loc("tmp5"(#loc13))
+#loc67 = loc("tmp5"(#loc14))
+#loc68 = loc("tmp5"(#loc15))
+#loc69 = loc("tmp5"(#loc16))
+#loc70 = loc("tmp5"(#loc17))
+#loc71 = loc("tmp6"(#loc18))
+#loc72 = loc("tmp7"(#loc19))
+#loc73 = loc("tmp8"(#loc20))
+#loc74 = loc("tmp9"(#loc21))
+#loc75 = loc("tmp9"(#loc22))
+#loc76 = loc("tmp9"(#loc23))
+#loc77 = loc("tmp9"(#loc24))
+#loc78 = loc("tmp9"(#loc25))
+#loc79 = loc("tmp9"(#loc26))
+#loc80 = loc("tmp11"(#loc27))
+#loc81 = loc("tmp12"(#loc28))
+#loc82 = loc("tmp14"(#loc29))
+#loc83 = loc("tmp14"(#loc30))
+#loc84 = loc("tmp14"(#loc31))
+#loc85 = loc("tmp14"(#loc32))
+#loc86 = loc("tmp14"(#loc33))
+#loc87 = loc("tmp14"(#loc34))
+#loc88 = loc("tmp14"(#loc35))
+#loc89 = loc("tmp15"(#loc36))
+#loc90 = loc("tmp16"(#loc37))
+#loc91 = loc("tmp17"(#loc38))
+#loc92 = loc("tmp18"(#loc39))
diff --git a/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.ttgir b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..6adc45493dd4f0873add3cce8bf2b6c4fa884f88
--- /dev/null
+++ b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.ttgir
@@ -0,0 +1,131 @@
+#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":18:0)
+#loc38 = loc("in_ptr0"(#loc))
+#loc39 = loc("in_ptr1"(#loc))
+#loc40 = loc("out_ptr0"(#loc))
+#loc41 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused_cat_mul_silu_split_view_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<12288> : tensor<1024xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<-4096> : tensor<1024xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<36864> : tensor<1024xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<4096> : tensor<1024xi32, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<4096> : tensor<1024xi64, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<16384> : tensor<1024xi32, #blocked> loc(#loc1)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
+    %cst_5 = arith.constant dense<0.000000e+00> : tensor<1024xbf16, #blocked> loc(#loc1)
+    %cst_6 = arith.constant dense<1.000000e+00> : tensor<1024xf32, #blocked> loc(#loc1)
+    %cst_7 = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc42)
+    %xoffset_8 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc43)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc44)
+    %xindex_9 = tt.splat %xoffset_8 : i32 -> tensor<1024xi32, #blocked> loc(#loc45)
+    %xindex_10 = arith.addi %xindex_9, %xindex : tensor<1024xi32, #blocked> loc(#loc45)
+    %x0 = arith.remsi %xindex_10, %cst_4 : tensor<1024xi32, #blocked> loc(#loc46)
+    %x1 = arith.divsi %xindex_10, %cst_4 : tensor<1024xi32, #blocked> loc(#loc47)
+    %tmp4 = arith.extsi %x0 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked> loc(#loc48)
+    %tmp4_11 = arith.cmpi slt, %tmp4, %cst_3 : tensor<1024xi64, #blocked> loc(#loc48)
+    %tmp5 = arith.muli %x1, %cst_2 : tensor<1024xi32, #blocked> loc(#loc49)
+    %tmp5_12 = arith.addi %tmp5, %x0 : tensor<1024xi32, #blocked> loc(#loc50)
+    %tmp5_13 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc51)
+    %tmp5_14 = tt.addptr %tmp5_13, %tmp5_12 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc51)
+    %tmp5_15 = tt.load %tmp5_14, %tmp4_11, %cst_5 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc52)
+    %tmp5_16 = arith.extf %tmp5_15 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc53)
+    %tmp6 = arith.cmpi sge, %tmp4, %cst_3 : tensor<1024xi64, #blocked> loc(#loc54)
+    %tmp9 = arith.muli %x1, %cst_1 : tensor<1024xi32, #blocked> loc(#loc55)
+    %tmp9_17 = arith.addi %x0, %cst_0 : tensor<1024xi32, #blocked> loc(#loc56)
+    %tmp9_18 = arith.addi %tmp9, %tmp9_17 : tensor<1024xi32, #blocked> loc(#loc57)
+    %tmp9_19 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc58)
+    %tmp9_20 = tt.addptr %tmp9_19, %tmp9_18 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc58)
+    %tmp9_21 = tt.load %tmp9_20, %tmp6, %cst_5 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc59)
+    %tmp9_22 = arith.extf %tmp9_21 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc60)
+    %tmp11 = arith.subf %cst_7, %tmp9_22 : tensor<1024xf32, #blocked> loc(#loc71)
+    %tmp11_23 = math.exp %tmp11 : tensor<1024xf32, #blocked> loc(#loc72)
+    %tmp11_24 = arith.addf %tmp11_23, %cst_6 : tensor<1024xf32, #blocked> loc(#loc73)
+    %tmp11_25 = arith.divf %cst_6, %tmp11_24 : tensor<1024xf32, #blocked> loc(#loc74)
+    %tmp12 = arith.mulf %tmp9_22, %tmp11_25 : tensor<1024xf32, #blocked> loc(#loc62)
+    %tmp14 = arith.addi %tmp9, %cst : tensor<1024xi32, #blocked> loc(#loc63)
+    %tmp14_26 = arith.addi %tmp14, %tmp9_17 : tensor<1024xi32, #blocked> loc(#loc64)
+    %tmp14_27 = tt.addptr %tmp9_19, %tmp14_26 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc65)
+    %tmp14_28 = tt.load %tmp14_27, %tmp6, %cst_5 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc66)
+    %tmp14_29 = arith.extf %tmp14_28 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc67)
+    %tmp15 = arith.mulf %tmp12, %tmp14_29 : tensor<1024xf32, #blocked> loc(#loc68)
+    %tmp17 = arith.select %tmp6, %tmp15, %cst_7 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked> loc(#loc69)
+    %tmp18 = arith.select %tmp4_11, %tmp5_16, %tmp17 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked> loc(#loc70)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc35)
+    %1 = tt.addptr %0, %xindex_10 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc35)
+    %2 = arith.truncf %tmp18 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> loc(#loc36)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc36)
+    tt.return loc(#loc37)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":23:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":24:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":30:18)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:35)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:41)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:30)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:47)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:97)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":32:19)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:36)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:52)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:42)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:30)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:58)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:108)
+#loc21 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":37:23)
+#loc23 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":38:20)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:39)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:51)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:31)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:67)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:117)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":41:20)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":43:34)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":44:33)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:25)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:37)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:4)
+#loc42 = loc("xoffset"(#loc2))
+#loc43 = loc("xoffset"(#loc3))
+#loc44 = loc("xindex"(#loc4))
+#loc45 = loc("xindex"(#loc5))
+#loc46 = loc("x0"(#loc6))
+#loc47 = loc("x1"(#loc7))
+#loc48 = loc("tmp4"(#loc8))
+#loc49 = loc("tmp5"(#loc9))
+#loc50 = loc("tmp5"(#loc10))
+#loc51 = loc("tmp5"(#loc11))
+#loc52 = loc("tmp5"(#loc12))
+#loc53 = loc("tmp5"(#loc13))
+#loc54 = loc("tmp6"(#loc14))
+#loc55 = loc("tmp9"(#loc15))
+#loc56 = loc("tmp9"(#loc16))
+#loc57 = loc("tmp9"(#loc17))
+#loc58 = loc("tmp9"(#loc18))
+#loc59 = loc("tmp9"(#loc19))
+#loc60 = loc("tmp9"(#loc20))
+#loc61 = loc("tmp11"(#loc22))
+#loc62 = loc("tmp12"(#loc26))
+#loc63 = loc("tmp14"(#loc27))
+#loc64 = loc("tmp14"(#loc28))
+#loc65 = loc("tmp14"(#loc29))
+#loc66 = loc("tmp14"(#loc30))
+#loc67 = loc("tmp14"(#loc31))
+#loc68 = loc("tmp15"(#loc32))
+#loc69 = loc("tmp17"(#loc33))
+#loc70 = loc("tmp18"(#loc34))
+#loc71 = loc(callsite(#loc21 at #loc61))
+#loc72 = loc(callsite(#loc23 at #loc61))
+#loc73 = loc(callsite(#loc24 at #loc61))
+#loc74 = loc(callsite(#loc25 at #loc61))
diff --git a/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.ttir b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..c67c3914290bff4ff21152033f06dee0c0668cda
--- /dev/null
+++ b/triton/AW5ZCLPC4IBBI4EFLOYOOAPMG5OIZIY564UDYNREA6OGUZ6JK6GQ/triton_poi_fused_cat_mul_silu_split_view_0.ttir
@@ -0,0 +1,131 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":18:0)
+#loc38 = loc("in_ptr0"(#loc))
+#loc39 = loc("in_ptr1"(#loc))
+#loc40 = loc("out_ptr0"(#loc))
+#loc41 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_cat_mul_silu_split_view_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %tmp11 = arith.constant dense<1.000000e+00> : tensor<1024xf32> loc(#loc71)
+    %cst = arith.constant dense<0.000000e+00> : tensor<1024xbf16> loc(#loc1)
+    %tmp14 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc43)
+    %cst_0 = arith.constant dense<-4096> : tensor<1024xi32> loc(#loc1)
+    %cst_1 = arith.constant dense<36864> : tensor<1024xi32> loc(#loc1)
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc1)
+    %tmp5 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc44)
+    %cst_3 = arith.constant dense<4096> : tensor<1024xi64> loc(#loc1)
+    %cst_4 = arith.constant dense<16384> : tensor<1024xi32> loc(#loc1)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc45)
+    %xoffset_5 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc46)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc47)
+    %xindex_6 = tt.splat %xoffset_5 : i32 -> tensor<1024xi32> loc(#loc48)
+    %xindex_7 = arith.addi %xindex_6, %xindex : tensor<1024xi32> loc(#loc48)
+    %x0 = arith.remsi %xindex_7, %cst_4 : tensor<1024xi32> loc(#loc49)
+    %x1 = arith.divsi %xindex_7, %cst_4 : tensor<1024xi32> loc(#loc50)
+    %tmp4 = arith.extsi %x0 : tensor<1024xi32> to tensor<1024xi64> loc(#loc51)
+    %tmp4_8 = arith.cmpi slt, %tmp4, %cst_3 : tensor<1024xi64> loc(#loc51)
+    %tmp5_9 = arith.muli %x1, %tmp5 : tensor<1024xi32> loc(#loc44)
+    %tmp5_10 = arith.addi %tmp5_9, %x0 : tensor<1024xi32> loc(#loc52)
+    %tmp5_11 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc53)
+    %tmp5_12 = tt.addptr %tmp5_11, %tmp5_10 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc53)
+    %tmp5_13 = tt.load %tmp5_12, %tmp4_8, %cst evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc54)
+    %tmp5_14 = arith.extf %tmp5_13 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc55)
+    %tmp6 = arith.cmpi sge, %tmp4, %cst_3 : tensor<1024xi64> loc(#loc56)
+    %tmp9 = arith.muli %x1, %cst_1 : tensor<1024xi32> loc(#loc57)
+    %tmp9_15 = arith.addi %x0, %cst_0 : tensor<1024xi32> loc(#loc58)
+    %tmp9_16 = arith.addi %tmp9, %tmp9_15 : tensor<1024xi32> loc(#loc59)
+    %tmp9_17 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc60)
+    %tmp9_18 = tt.addptr %tmp9_17, %tmp9_16 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc60)
+    %tmp9_19 = tt.load %tmp9_18, %tmp6, %cst evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc61)
+    %tmp9_20 = arith.extf %tmp9_19 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc62)
+    %tmp11_21 = arith.subf %cst_2, %tmp9_20 : tensor<1024xf32> loc(#loc72)
+    %tmp11_22 = math.exp %tmp11_21 : tensor<1024xf32> loc(#loc73)
+    %tmp11_23 = arith.addf %tmp11_22, %tmp11 : tensor<1024xf32> loc(#loc74)
+    %tmp11_24 = arith.divf %tmp11, %tmp11_23 : tensor<1024xf32> loc(#loc75)
+    %tmp12 = arith.mulf %tmp9_20, %tmp11_24 : tensor<1024xf32> loc(#loc63)
+    %tmp14_25 = arith.addi %tmp9, %tmp14 : tensor<1024xi32> loc(#loc43)
+    %tmp14_26 = arith.addi %tmp14_25, %tmp9_15 : tensor<1024xi32> loc(#loc64)
+    %tmp14_27 = tt.addptr %tmp9_17, %tmp14_26 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc65)
+    %tmp14_28 = tt.load %tmp14_27, %tmp6, %cst evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc66)
+    %tmp14_29 = arith.extf %tmp14_28 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc67)
+    %tmp15 = arith.mulf %tmp12, %tmp14_29 : tensor<1024xf32> loc(#loc68)
+    %tmp17 = arith.select %tmp6, %tmp15, %cst_2 : tensor<1024xi1>, tensor<1024xf32> loc(#loc69)
+    %tmp18 = arith.select %tmp4_8, %tmp5_14, %tmp17 : tensor<1024xi1>, tensor<1024xf32> loc(#loc70)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc35)
+    %1 = tt.addptr %0, %xindex_7 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc35)
+    %2 = arith.truncf %tmp18 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc36)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>> loc(#loc36)
+    tt.return loc(#loc37)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":37:23)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:39)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:35)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":20:28)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":20:33)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":21:36)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":21:23)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":23:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":24:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":30:18)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:41)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:30)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:47)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:97)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":32:19)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:36)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:52)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:42)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:30)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:58)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:108)
+#loc23 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20)
+#loc26 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":38:20)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:51)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:31)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:67)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:117)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":41:20)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":43:34)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":44:33)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:25)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:37)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:4)
+#loc42 = loc("tmp11"(#loc2))
+#loc43 = loc("tmp14"(#loc3))
+#loc44 = loc("tmp5"(#loc4))
+#loc45 = loc("xoffset"(#loc5))
+#loc46 = loc("xoffset"(#loc6))
+#loc47 = loc("xindex"(#loc7))
+#loc48 = loc("xindex"(#loc8))
+#loc49 = loc("x0"(#loc9))
+#loc50 = loc("x1"(#loc10))
+#loc51 = loc("tmp4"(#loc11))
+#loc52 = loc("tmp5"(#loc12))
+#loc53 = loc("tmp5"(#loc13))
+#loc54 = loc("tmp5"(#loc14))
+#loc55 = loc("tmp5"(#loc15))
+#loc56 = loc("tmp6"(#loc16))
+#loc57 = loc("tmp9"(#loc17))
+#loc58 = loc("tmp9"(#loc18))
+#loc59 = loc("tmp9"(#loc19))
+#loc60 = loc("tmp9"(#loc20))
+#loc61 = loc("tmp9"(#loc21))
+#loc62 = loc("tmp9"(#loc22))
+#loc63 = loc("tmp12"(#loc27))
+#loc64 = loc("tmp14"(#loc28))
+#loc65 = loc("tmp14"(#loc29))
+#loc66 = loc("tmp14"(#loc30))
+#loc67 = loc("tmp14"(#loc31))
+#loc68 = loc("tmp15"(#loc32))
+#loc69 = loc("tmp17"(#loc33))
+#loc70 = loc("tmp18"(#loc34))
+#loc71 = loc(callsite(#loc1 at #loc42))
+#loc72 = loc(callsite(#loc23 at #loc42))
+#loc73 = loc(callsite(#loc24 at #loc42))
+#loc74 = loc(callsite(#loc25 at #loc42))
+#loc75 = loc(callsite(#loc26 at #loc42))
diff --git a/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..4f196ce56abae6a12888f0787e904f9939d4e900
--- /dev/null
+++ b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json"}}
\ No newline at end of file
diff --git a/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..d112e25531c1d164834cc36957ee3a3323e6e5f9
Binary files /dev/null and b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin differ
diff --git a/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..0adb75b03d2d4461e712081b76e7a8566d2ff5c4
--- /dev/null
+++ b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
@@ -0,0 +1 @@
+{"hash": "062591a4393987cbf2bec755aca3eaaed3b76d84f31e49d4c05bc4c1e6fb1950", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 1024, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0"}
\ No newline at end of file
diff --git a/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..fcea99d2c79476dd2b0b0964e54b8fb42dff0453
--- /dev/null
+++ b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir
@@ -0,0 +1,664 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 {
+  %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8
+  %13 = shl i32 %12, 2, !dbg !9
+  %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %15 = and i32 %14, 96, !dbg !10
+  %16 = lshr exact i32 %15, 5, !dbg !10
+  %17 = or disjoint i32 %16, %13, !dbg !11
+  %18 = shl nuw nsw i32 %14, 1, !dbg !12
+  %19 = and i32 %18, 62, !dbg !12
+  %20 = sdiv i32 %17, 32, !dbg !13
+  %21 = shl i32 %17, 7
+  %22 = shl i32 %20, 15
+  %23 = add i32 %22, %21
+  %24 = add i32 %23, 4096
+  %25 = zext nneg i32 %19 to i64, !dbg !14
+  %26 = or disjoint i32 %24, %19, !dbg !15
+  %27 = sext i32 %26 to i64, !dbg !16
+  %28 = getelementptr bfloat, ptr addrspace(1) %2, i64 %27, !dbg !16
+  %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17
+  %30 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %28, i64 %29, i1 true) #6, !dbg !17
+  %31 = bitcast i32 %30 to <2 x bfloat>, !dbg !17
+  %32 = extractelement <2 x bfloat> %31, i64 0, !dbg !17
+  %33 = extractelement <2 x bfloat> %31, i64 1, !dbg !17
+  %34 = fpext bfloat %32 to float, !dbg !18
+  %35 = fpext bfloat %33 to float, !dbg !18
+  %36 = or disjoint i32 %23, %19, !dbg !19
+  %37 = sext i32 %36 to i64, !dbg !20
+  %38 = getelementptr bfloat, ptr addrspace(1) %2, i64 %37, !dbg !20
+  %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !21
+  %40 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %38, i64 %39, i1 true) #6, !dbg !21
+  %41 = bitcast i32 %40 to <2 x bfloat>, !dbg !21
+  %42 = extractelement <2 x bfloat> %41, i64 0, !dbg !21
+  %43 = extractelement <2 x bfloat> %41, i64 1, !dbg !21
+  %44 = fpext bfloat %42 to float, !dbg !22
+  %45 = fpext bfloat %43 to float, !dbg !22
+  %46 = fmul float %34, %34, !dbg !23
+  %47 = fmul float %35, %35, !dbg !23
+  %48 = fmul float %44, %44, !dbg !24
+  %49 = fmul float %45, %45, !dbg !24
+  %50 = or disjoint i32 %19, 64, !dbg !25
+  %51 = or disjoint i32 %24, %50, !dbg !15
+  %52 = sext i32 %51 to i64, !dbg !16
+  %53 = getelementptr bfloat, ptr addrspace(1) %2, i64 %52, !dbg !16
+  %54 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17
+  %55 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %53, i64 %54, i1 true) #6, !dbg !17
+  %56 = bitcast i32 %55 to <2 x bfloat>, !dbg !17
+  %57 = extractelement <2 x bfloat> %56, i64 0, !dbg !17
+  %58 = extractelement <2 x bfloat> %56, i64 1, !dbg !17
+  %59 = fpext bfloat %57 to float, !dbg !18
+  %60 = fpext bfloat %58 to float, !dbg !18
+  %61 = or disjoint i32 %23, %50, !dbg !19
+  %62 = sext i32 %61 to i64, !dbg !20
+  %63 = getelementptr bfloat, ptr addrspace(1) %2, i64 %62, !dbg !20
+  %64 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !21
+  %65 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %63, i64 %64, i1 true) #6, !dbg !21
+  %66 = bitcast i32 %65 to <2 x bfloat>, !dbg !21
+  %67 = extractelement <2 x bfloat> %66, i64 0, !dbg !21
+  %68 = extractelement <2 x bfloat> %66, i64 1, !dbg !21
+  %69 = fpext bfloat %67 to float, !dbg !22
+  %70 = fpext bfloat %68 to float, !dbg !22
+  %71 = fmul float %59, %59, !dbg !23
+  %72 = fmul float %60, %60, !dbg !23
+  %73 = fadd float %46, %71, !dbg !26
+  %74 = fadd float %47, %72, !dbg !26
+  %75 = fmul float %69, %69, !dbg !24
+  %76 = fmul float %70, %70, !dbg !24
+  %77 = fadd float %48, %75, !dbg !27
+  %78 = fadd float %49, %76, !dbg !27
+  %79 = and i32 %14, 3, !dbg !10
+  %80 = or disjoint i32 %13, %79, !dbg !11
+  %81 = and i32 %14, 124, !dbg !12
+  %82 = lshr exact i32 %81, 2, !dbg !12
+  %83 = sdiv i32 %80, 32, !dbg !13
+  %84 = fadd float %73, %74, !dbg !28
+  %85 = bitcast float %84 to i32, !dbg !31
+  %86 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %85, i32 16, i32 31), !dbg !31
+  %87 = bitcast i32 %86 to float, !dbg !31
+  %88 = fadd float %84, %87, !dbg !28
+  %89 = bitcast float %88 to i32, !dbg !31
+  %90 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %89, i32 8, i32 31), !dbg !31
+  %91 = bitcast i32 %90 to float, !dbg !31
+  %92 = fadd float %88, %91, !dbg !28
+  %93 = bitcast float %92 to i32, !dbg !31
+  %94 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %93, i32 4, i32 31), !dbg !31
+  %95 = bitcast i32 %94 to float, !dbg !31
+  %96 = fadd float %92, %95, !dbg !28
+  %97 = bitcast float %96 to i32, !dbg !31
+  %98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %97, i32 2, i32 31), !dbg !31
+  %99 = bitcast i32 %98 to float, !dbg !31
+  %100 = fadd float %96, %99, !dbg !28
+  %101 = bitcast float %100 to i32, !dbg !31
+  %102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 1, i32 31), !dbg !31
+  %103 = bitcast i32 %102 to float, !dbg !31
+  %104 = fadd float %100, %103, !dbg !28
+  %105 = fadd float %77, %78, !dbg !34
+  %106 = bitcast float %105 to i32, !dbg !35
+  %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 16, i32 31), !dbg !35
+  %108 = bitcast i32 %107 to float, !dbg !35
+  %109 = fadd float %105, %108, !dbg !34
+  %110 = bitcast float %109 to i32, !dbg !35
+  %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 8, i32 31), !dbg !35
+  %112 = bitcast i32 %111 to float, !dbg !35
+  %113 = fadd float %109, %112, !dbg !34
+  %114 = bitcast float %113 to i32, !dbg !35
+  %115 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %114, i32 4, i32 31), !dbg !35
+  %116 = bitcast i32 %115 to float, !dbg !35
+  %117 = fadd float %113, %116, !dbg !34
+  %118 = bitcast float %117 to i32, !dbg !35
+  %119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %118, i32 2, i32 31), !dbg !35
+  %120 = bitcast i32 %119 to float, !dbg !35
+  %121 = fadd float %117, %120, !dbg !34
+  %122 = bitcast float %121 to i32, !dbg !35
+  %123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 1, i32 31), !dbg !35
+  %124 = bitcast i32 %123 to float, !dbg !35
+  %125 = fadd float %121, %124, !dbg !34
+  %126 = shl i32 %20, 7, !dbg !37
+  %127 = tail call float @llvm.nvvm.div.full(float %125, float 1.280000e+02), !dbg !38
+  %128 = fadd float %127, 0x3EB0C6F7A0000000, !dbg !39
+  %129 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40
+  %.not.i = icmp eq i32 %129, 0, !dbg !40
+  br i1 %.not.i, label %132, label %130, !dbg !40
+
+130:                                              ; preds = %11
+  %131 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %128), !dbg !40
+  br label %__nv_rsqrtf.exit, !dbg !40
+
+132:                                              ; preds = %11
+  %133 = tail call float @llvm.nvvm.rsqrt.approx.f(float %128), !dbg !40
+  br label %__nv_rsqrtf.exit, !dbg !40
+
+__nv_rsqrtf.exit:                                 ; preds = %130, %132
+  %.0.i = phi float [ %131, %130 ], [ %133, %132 ], !dbg !40
+  %134 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40
+  %.not.i3 = icmp eq i32 %134, 0, !dbg !40
+  br i1 %.not.i3, label %137, label %135, !dbg !40
+
+135:                                              ; preds = %__nv_rsqrtf.exit
+  %136 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %128), !dbg !40
+  br label %__nv_rsqrtf.exit5, !dbg !40
+
+137:                                              ; preds = %__nv_rsqrtf.exit
+  %138 = tail call float @llvm.nvvm.rsqrt.approx.f(float %128), !dbg !40
+  br label %__nv_rsqrtf.exit5, !dbg !40
+
+__nv_rsqrtf.exit5:                                ; preds = %135, %137
+  %.0.i4 = phi float [ %136, %135 ], [ %138, %137 ], !dbg !40
+  %139 = lshr exact i32 %15, 3, !dbg !41
+  %140 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %139, !dbg !41
+  store float %.0.i, ptr addrspace(3) %140, align 4, !dbg !41
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !41
+  %141 = shl nuw nsw i32 %79, 2, !dbg !41
+  %142 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %141, !dbg !41
+  %143 = load float, ptr addrspace(3) %142, align 4, !dbg !41
+  %144 = tail call float @llvm.nvvm.div.full(float %104, float 1.280000e+02), !dbg !42
+  %145 = fadd float %144, 0x3EB0C6F7A0000000, !dbg !43
+  %146 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
+  %.not.i6 = icmp eq i32 %146, 0, !dbg !44
+  br i1 %.not.i6, label %149, label %147, !dbg !44
+
+147:                                              ; preds = %__nv_rsqrtf.exit5
+  %148 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %145), !dbg !44
+  br label %__nv_rsqrtf.exit8, !dbg !44
+
+149:                                              ; preds = %__nv_rsqrtf.exit5
+  %150 = tail call float @llvm.nvvm.rsqrt.approx.f(float %145), !dbg !44
+  br label %__nv_rsqrtf.exit8, !dbg !44
+
+__nv_rsqrtf.exit8:                                ; preds = %147, %149
+  %.0.i7 = phi float [ %148, %147 ], [ %150, %149 ], !dbg !44
+  %151 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
+  %.not.i9 = icmp eq i32 %151, 0, !dbg !44
+  br i1 %.not.i9, label %154, label %152, !dbg !44
+
+152:                                              ; preds = %__nv_rsqrtf.exit8
+  %153 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %145), !dbg !44
+  br label %__nv_rsqrtf.exit11, !dbg !44
+
+154:                                              ; preds = %__nv_rsqrtf.exit8
+  %155 = tail call float @llvm.nvvm.rsqrt.approx.f(float %145), !dbg !44
+  br label %__nv_rsqrtf.exit11, !dbg !44
+
+__nv_rsqrtf.exit11:                               ; preds = %152, %154
+  %.0.i10 = phi float [ %153, %152 ], [ %155, %154 ], !dbg !44
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45
+  store float %.0.i7, ptr addrspace(3) %140, align 4, !dbg !45
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45
+  %156 = load float, ptr addrspace(3) %142, align 4, !dbg !45
+  %157 = shl i32 %17, 7, !dbg !46
+  %158 = and i32 %82, 1
+  %.masked = and i32 %82, 30
+  %159 = shl nuw nsw i32 %14, 3
+  %160 = and i32 %159, 120
+  %161 = shl nuw nsw i32 %15, 2
+  %162 = lshr i32 %14, 2
+  %163 = and i32 %162, 4
+  %164 = or disjoint i32 %160, %161
+  %165 = xor i32 %164, %15
+  %166 = or disjoint i32 %165, %163
+  %167 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %166
+  %168 = xor i32 %166, 516
+  %169 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %168
+  %170 = shl nuw nsw i32 %14, 7
+  %171 = and i32 %170, 896
+  %172 = shl nuw nsw i32 %79, 5
+  %173 = xor i32 %172, %81
+  %174 = or disjoint i32 %173, %171
+  %175 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %174
+  %176 = xor i32 %174, 4
+  %177 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %176
+  %178 = icmp eq i32 %158, 0
+  %179 = shl i32 %80, 7
+  %180 = shl i32 %83, 15
+  %181 = add i32 %180, %179
+  %182 = icmp ne i32 %158, 0
+  %183 = add i32 %181, 4097
+  %184 = add i32 %181, 4096
+  %185 = shl nuw nsw i32 %79, 7
+  %186 = and i32 %14, 28
+  %187 = lshr i32 %14, 4
+  %188 = and i32 %187, 2
+  %189 = lshr i32 %14, 1
+  %190 = and i32 %189, 32
+  %191 = or disjoint i32 %185, %188
+  %192 = or disjoint i32 %172, %186
+  %193 = xor i32 %192, %190
+  %194 = or disjoint i32 %193, %191
+  %195 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %194
+  %196 = xor i32 %194, 64
+  %197 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %196
+  %198 = shl nuw nsw i32 %79, 3
+  %199 = shl nuw nsw i32 %14, 2
+  %200 = and i32 %199, 480
+  %201 = and i32 %189, 2
+  %202 = or disjoint i32 %198, %200
+  %203 = xor i32 %202, %15
+  %204 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %201
+  %205 = getelementptr inbounds nuw i8, ptr addrspace(3) %204, i32 %203
+  %206 = getelementptr inbounds nuw i8, ptr addrspace(3) %205, i32 4
+  %207 = zext nneg i32 %.masked to i64, !dbg !47
+  %208 = sext i32 %126 to i64, !dbg !47
+  %209 = sext i32 %157 to i64, !dbg !47
+  br label %210, !dbg !47
+
+210:                                              ; preds = %__nv_rsqrtf.exit11, %210
+  %211 = phi i1 [ true, %__nv_rsqrtf.exit11 ], [ false, %210 ]
+  %indvars.iv = phi i64 [ 0, %__nv_rsqrtf.exit11 ], [ 64, %210 ]
+  %212 = or disjoint i64 %indvars.iv, %25, !dbg !48
+  %213 = or disjoint i64 %indvars.iv, %207, !dbg !49
+  %214 = or disjoint i64 %213, 32, !dbg !49
+  %215 = trunc nuw nsw i64 %212 to i32, !dbg !50
+  %216 = or disjoint i32 %23, %215, !dbg !50
+  %217 = sext i32 %216 to i64, !dbg !51
+  %218 = getelementptr bfloat, ptr addrspace(1) %2, i64 %217, !dbg !51
+  %219 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !52
+  %220 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %218, i64 %219, i1 true) #6, !dbg !52
+  %221 = bitcast i32 %220 to <2 x bfloat>, !dbg !52
+  %222 = extractelement <2 x bfloat> %221, i64 0, !dbg !52
+  %223 = extractelement <2 x bfloat> %221, i64 1, !dbg !52
+  %224 = fpext bfloat %222 to float, !dbg !53
+  %225 = fpext bfloat %223 to float, !dbg !53
+  %226 = getelementptr bfloat, ptr addrspace(1) %3, i64 %212, !dbg !54
+  %227 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !55
+  %228 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %226, i64 %227, i1 true) #6, !dbg !55
+  %229 = bitcast i32 %228 to <2 x bfloat>, !dbg !55
+  %230 = extractelement <2 x bfloat> %229, i64 0, !dbg !55
+  %231 = extractelement <2 x bfloat> %229, i64 1, !dbg !55
+  %232 = fpext bfloat %230 to float, !dbg !56
+  %233 = fpext bfloat %231 to float, !dbg !56
+  %234 = or disjoint i64 %212, %208, !dbg !57
+  %235 = getelementptr float, ptr addrspace(1) %4, i64 %234, !dbg !58
+  %236 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !59
+  %237 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %235, i64 %236, i1 true) #6, !dbg !59
+  %238 = extractvalue { i32, i32 } %237, 0, !dbg !59
+  %239 = extractvalue { i32, i32 } %237, 1, !dbg !59
+  %240 = bitcast i32 %238 to float, !dbg !59
+  %241 = bitcast i32 %239 to float, !dbg !59
+  %242 = getelementptr float, ptr addrspace(1) %5, i64 %234, !dbg !60
+  %243 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !61
+  %244 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %242, i64 %243, i1 true) #6, !dbg !61
+  %245 = extractvalue { i32, i32 } %244, 0, !dbg !61
+  %246 = extractvalue { i32, i32 } %244, 1, !dbg !61
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61
+  %247 = insertelement <1 x i32> poison, i32 %245, i64 0, !dbg !61
+  store <1 x i32> %247, ptr addrspace(3) %167, align 4, !dbg !61
+  %248 = insertelement <1 x i32> poison, i32 %246, i64 0, !dbg !61
+  store <1 x i32> %248, ptr addrspace(3) %169, align 4, !dbg !61
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61
+  %249 = load float, ptr addrspace(3) %175, align 4, !dbg !61
+  %250 = load float, ptr addrspace(3) %177, align 4, !dbg !61
+  %251 = or disjoint i32 %24, %215, !dbg !62
+  %252 = sext i32 %251 to i64, !dbg !63
+  %253 = getelementptr bfloat, ptr addrspace(1) %2, i64 %252, !dbg !63
+  %254 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !64
+  %255 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %253, i64 %254, i1 true) #6, !dbg !64
+  %256 = bitcast i32 %255 to <2 x bfloat>, !dbg !64
+  %257 = extractelement <2 x bfloat> %256, i64 0, !dbg !64
+  %258 = extractelement <2 x bfloat> %256, i64 1, !dbg !64
+  %259 = fpext bfloat %257 to float, !dbg !65
+  %260 = fpext bfloat %258 to float, !dbg !65
+  %261 = getelementptr bfloat, ptr addrspace(1) %6, i64 %212, !dbg !66
+  %262 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !67
+  %263 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %261, i64 %262, i1 true) #6, !dbg !67
+  %264 = bitcast i32 %263 to <2 x bfloat>, !dbg !67
+  %265 = extractelement <2 x bfloat> %264, i64 0, !dbg !67
+  %266 = extractelement <2 x bfloat> %264, i64 1, !dbg !67
+  %267 = fpext bfloat %265 to float, !dbg !68
+  %268 = fpext bfloat %266 to float, !dbg !68
+  %269 = or disjoint i64 %213, 1, !dbg !69
+  %270 = or disjoint i64 %213, 33, !dbg !69
+  %271 = trunc nuw nsw i64 %269 to i32, !dbg !70
+  %272 = or disjoint i32 %181, %271, !dbg !70
+  %273 = trunc nuw nsw i64 %270 to i32, !dbg !70
+  %274 = or disjoint i32 %181, %273, !dbg !70
+  %275 = sext i32 %272 to i64, !dbg !71
+  %276 = getelementptr bfloat, ptr addrspace(1) %2, i64 %275, !dbg !71
+  %277 = sext i32 %274 to i64, !dbg !71
+  %278 = getelementptr bfloat, ptr addrspace(1) %2, i64 %277, !dbg !71
+  %279 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72
+  %280 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %276, i64 %279, i1 %178) #6, !dbg !72
+  %281 = bitcast i16 %280 to bfloat, !dbg !72
+  %282 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72
+  %283 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %278, i64 %282, i1 %178) #6, !dbg !72
+  %284 = bitcast i16 %283 to bfloat, !dbg !72
+  %285 = fpext bfloat %281 to float, !dbg !73
+  %286 = fpext bfloat %284 to float, !dbg !73
+  %287 = fmul float %143, %285, !dbg !41
+  %288 = fmul float %143, %286, !dbg !41
+  %289 = getelementptr bfloat, ptr addrspace(1) %3, i64 %269, !dbg !74
+  %290 = getelementptr bfloat, ptr addrspace(1) %3, i64 %270, !dbg !74
+  %291 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75
+  %292 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %289, i64 %291, i1 %178) #6, !dbg !75
+  %293 = bitcast i16 %292 to bfloat, !dbg !75
+  %294 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75
+  %295 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %290, i64 %294, i1 %178) #6, !dbg !75
+  %296 = bitcast i16 %295 to bfloat, !dbg !75
+  %297 = fpext bfloat %293 to float, !dbg !76
+  %298 = fpext bfloat %296 to float, !dbg !76
+  %299 = fmul float %287, %297, !dbg !77
+  %300 = fmul float %288, %298, !dbg !77
+  %301 = fsub float 0.000000e+00, %299, !dbg !78
+  %302 = fsub float 0.000000e+00, %300, !dbg !78
+  %303 = trunc nuw nsw i64 %213 to i32, !dbg !79
+  %304 = or disjoint i32 %181, %303, !dbg !79
+  %305 = trunc nuw nsw i64 %214 to i32, !dbg !79
+  %306 = or disjoint i32 %181, %305, !dbg !79
+  %307 = sext i32 %304 to i64, !dbg !80
+  %308 = getelementptr bfloat, ptr addrspace(1) %2, i64 %307, !dbg !80
+  %309 = sext i32 %306 to i64, !dbg !80
+  %310 = getelementptr bfloat, ptr addrspace(1) %2, i64 %309, !dbg !80
+  %311 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81
+  %312 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %308, i64 %311, i1 %182) #6, !dbg !81
+  %313 = bitcast i16 %312 to bfloat, !dbg !81
+  %314 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81
+  %315 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %310, i64 %314, i1 %182) #6, !dbg !81
+  %316 = bitcast i16 %315 to bfloat, !dbg !81
+  %317 = fpext bfloat %313 to float, !dbg !82
+  %318 = fpext bfloat %316 to float, !dbg !82
+  %319 = fmul float %143, %317, !dbg !83
+  %320 = fmul float %143, %318, !dbg !83
+  %321 = getelementptr bfloat, ptr addrspace(1) %3, i64 %213, !dbg !84
+  %322 = getelementptr bfloat, ptr addrspace(1) %3, i64 %214, !dbg !84
+  %323 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85
+  %324 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %321, i64 %323, i1 %182) #6, !dbg !85
+  %325 = bitcast i16 %324 to bfloat, !dbg !85
+  %326 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85
+  %327 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %322, i64 %326, i1 %182) #6, !dbg !85
+  %328 = bitcast i16 %327 to bfloat, !dbg !85
+  %329 = fpext bfloat %325 to float, !dbg !86
+  %330 = fpext bfloat %328 to float, !dbg !86
+  %331 = fmul float %319, %329, !dbg !87
+  %332 = fmul float %320, %330, !dbg !87
+  %333 = select i1 %178, float %301, float %331, !dbg !88
+  %334 = select i1 %178, float %302, float %332, !dbg !88
+  %335 = fmul float %.0.i4, %224, !dbg !89
+  %336 = fmul float %.0.i4, %225, !dbg !89
+  %337 = fmul float %335, %232, !dbg !90
+  %338 = fmul float %336, %233, !dbg !90
+  %339 = fmul float %337, %240, !dbg !91
+  %340 = fmul float %338, %241, !dbg !91
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !91
+  store float %339, ptr addrspace(3) %167, align 4, !dbg !91
+  store float %340, ptr addrspace(3) %169, align 4, !dbg !91
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !91
+  %341 = load float, ptr addrspace(3) %175, align 4, !dbg !91
+  %342 = load float, ptr addrspace(3) %177, align 4, !dbg !91
+  %343 = fmul float %249, %333, !dbg !92
+  %344 = fmul float %250, %334, !dbg !92
+  %345 = fadd float %343, %341, !dbg !93
+  %346 = fadd float %344, %342, !dbg !93
+  %347 = or disjoint i32 %183, %303, !dbg !94
+  %348 = or disjoint i32 %183, %305, !dbg !94
+  %349 = sext i32 %347 to i64, !dbg !95
+  %350 = getelementptr bfloat, ptr addrspace(1) %2, i64 %349, !dbg !95
+  %351 = sext i32 %348 to i64, !dbg !95
+  %352 = getelementptr bfloat, ptr addrspace(1) %2, i64 %351, !dbg !95
+  %353 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96
+  %354 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %350, i64 %353, i1 %178) #6, !dbg !96
+  %355 = bitcast i16 %354 to bfloat, !dbg !96
+  %356 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96
+  %357 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %352, i64 %356, i1 %178) #6, !dbg !96
+  %358 = bitcast i16 %357 to bfloat, !dbg !96
+  %359 = fpext bfloat %355 to float, !dbg !97
+  %360 = fpext bfloat %358 to float, !dbg !97
+  %361 = fmul float %156, %359, !dbg !45
+  %362 = fmul float %156, %360, !dbg !45
+  %363 = getelementptr bfloat, ptr addrspace(1) %6, i64 %269, !dbg !98
+  %364 = getelementptr bfloat, ptr addrspace(1) %6, i64 %270, !dbg !98
+  %365 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99
+  %366 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %363, i64 %365, i1 %178) #6, !dbg !99
+  %367 = bitcast i16 %366 to bfloat, !dbg !99
+  %368 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99
+  %369 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %364, i64 %368, i1 %178) #6, !dbg !99
+  %370 = bitcast i16 %369 to bfloat, !dbg !99
+  %371 = fpext bfloat %367 to float, !dbg !100
+  %372 = fpext bfloat %370 to float, !dbg !100
+  %373 = fmul float %361, %371, !dbg !101
+  %374 = fmul float %362, %372, !dbg !101
+  %375 = fsub float 0.000000e+00, %373, !dbg !102
+  %376 = fsub float 0.000000e+00, %374, !dbg !102
+  %377 = or disjoint i32 %184, %303, !dbg !103
+  %378 = or disjoint i32 %184, %305, !dbg !103
+  %379 = sext i32 %377 to i64, !dbg !104
+  %380 = getelementptr bfloat, ptr addrspace(1) %2, i64 %379, !dbg !104
+  %381 = sext i32 %378 to i64, !dbg !104
+  %382 = getelementptr bfloat, ptr addrspace(1) %2, i64 %381, !dbg !104
+  %383 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105
+  %384 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %380, i64 %383, i1 %182) #6, !dbg !105
+  %385 = bitcast i16 %384 to bfloat, !dbg !105
+  %386 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105
+  %387 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %382, i64 %386, i1 %182) #6, !dbg !105
+  %388 = bitcast i16 %387 to bfloat, !dbg !105
+  %389 = fpext bfloat %385 to float, !dbg !106
+  %390 = fpext bfloat %388 to float, !dbg !106
+  %391 = fmul float %156, %389, !dbg !107
+  %392 = fmul float %156, %390, !dbg !107
+  %393 = getelementptr bfloat, ptr addrspace(1) %6, i64 %213, !dbg !108
+  %394 = getelementptr bfloat, ptr addrspace(1) %6, i64 %214, !dbg !108
+  %395 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109
+  %396 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %393, i64 %395, i1 %182) #6, !dbg !109
+  %397 = bitcast i16 %396 to bfloat, !dbg !109
+  %398 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109
+  %399 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %394, i64 %398, i1 %182) #6, !dbg !109
+  %400 = bitcast i16 %399 to bfloat, !dbg !109
+  %401 = fpext bfloat %397 to float, !dbg !110
+  %402 = fpext bfloat %400 to float, !dbg !110
+  %403 = fmul float %391, %401, !dbg !111
+  %404 = fmul float %392, %402, !dbg !111
+  %405 = select i1 %178, float %375, float %403, !dbg !88
+  %406 = select i1 %178, float %376, float %404, !dbg !88
+  %407 = fmul float %.0.i10, %259, !dbg !112
+  %408 = fmul float %.0.i10, %260, !dbg !112
+  %409 = fmul float %407, %267, !dbg !113
+  %410 = fmul float %408, %268, !dbg !113
+  %411 = fmul float %409, %240, !dbg !114
+  %412 = fmul float %410, %241, !dbg !114
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !114
+  store float %411, ptr addrspace(3) %167, align 4, !dbg !114
+  store float %412, ptr addrspace(3) %169, align 4, !dbg !114
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !114
+  %413 = load float, ptr addrspace(3) %175, align 4, !dbg !114
+  %414 = load float, ptr addrspace(3) %177, align 4, !dbg !114
+  %415 = fmul float %249, %405, !dbg !115
+  %416 = fmul float %250, %406, !dbg !115
+  %417 = fadd float %415, %413, !dbg !116
+  %418 = fadd float %416, %414, !dbg !116
+  %419 = or disjoint i64 %212, %209, !dbg !117
+  %420 = getelementptr bfloat, ptr addrspace(1) %0, i64 %419, !dbg !118
+  %421 = fptrunc float %345 to bfloat, !dbg !119
+  %422 = fptrunc float %346 to bfloat, !dbg !119
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119
+  store bfloat %421, ptr addrspace(3) %195, align 2, !dbg !119
+  store bfloat %422, ptr addrspace(3) %197, align 2, !dbg !119
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119
+  %423 = load bfloat, ptr addrspace(3) %205, align 2, !dbg !119
+  %424 = load bfloat, ptr addrspace(3) %206, align 2, !dbg !119
+  %425 = insertelement <2 x bfloat> poison, bfloat %423, i64 0, !dbg !119
+  %426 = insertelement <2 x bfloat> %425, bfloat %424, i64 1, !dbg !119
+  %427 = bitcast <2 x bfloat> %426 to i32, !dbg !119
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %427, ptr addrspace(1) %420, i1 true) #6, !dbg !119
+  %428 = getelementptr bfloat, ptr addrspace(1) %1, i64 %419, !dbg !120
+  %429 = fptrunc float %417 to bfloat, !dbg !121
+  %430 = fptrunc float %418 to bfloat, !dbg !121
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !121
+  store bfloat %429, ptr addrspace(3) %195, align 2, !dbg !121
+  store bfloat %430, ptr addrspace(3) %197, align 2, !dbg !121
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !121
+  %431 = load bfloat, ptr addrspace(3) %205, align 2, !dbg !121
+  %432 = load bfloat, ptr addrspace(3) %206, align 2, !dbg !121
+  %433 = insertelement <2 x bfloat> poison, bfloat %431, i64 0, !dbg !121
+  %434 = insertelement <2 x bfloat> %433, bfloat %432, i64 1, !dbg !121
+  %435 = bitcast <2 x bfloat> %434 to i32, !dbg !121
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %435, ptr addrspace(1) %428, i1 true) #6, !dbg !121
+  br i1 %211, label %210, label %436, !dbg !47
+
+436:                                              ; preds = %210
+  ret void, !dbg !122
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #3
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #3
+
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #4 = { convergent nocallback nounwind }
+attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!5 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", linkageName: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 23, column: 28, scope: !5)
+!9 = !DILocation(line: 23, column: 33, scope: !5)
+!10 = !DILocation(line: 24, column: 44, scope: !5)
+!11 = !DILocation(line: 24, column: 23, scope: !5)
+!12 = !DILocation(line: 26, column: 37, scope: !5)
+!13 = !DILocation(line: 29, column: 19, scope: !5)
+!14 = !DILocation(line: 33, column: 43, scope: !5)
+!15 = !DILocation(line: 39, column: 57, scope: !5)
+!16 = !DILocation(line: 39, column: 34, scope: !5)
+!17 = !DILocation(line: 39, column: 68, scope: !5)
+!18 = !DILocation(line: 39, column: 121, scope: !5)
+!19 = !DILocation(line: 40, column: 50, scope: !5)
+!20 = !DILocation(line: 40, column: 34, scope: !5)
+!21 = !DILocation(line: 40, column: 61, scope: !5)
+!22 = !DILocation(line: 40, column: 114, scope: !5)
+!23 = !DILocation(line: 42, column: 22, scope: !5)
+!24 = !DILocation(line: 47, column: 22, scope: !5)
+!25 = !DILocation(line: 34, column: 31, scope: !5)
+!26 = !DILocation(line: 44, column: 23, scope: !5)
+!27 = !DILocation(line: 49, column: 25, scope: !5)
+!28 = !DILocation(line: 263, column: 15, scope: !29, inlinedAt: !31)
+!29 = distinct !DILexicalBlockFile(scope: !5, file: !30, discriminator: 0)
+!30 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!31 = !DILocation(line: 293, column: 36, scope: !29, inlinedAt: !32)
+!32 = !DILocation(line: 51, column: 25, scope: !33)
+!33 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0)
+!34 = !DILocation(line: 263, column: 15, scope: !29, inlinedAt: !35)
+!35 = !DILocation(line: 293, column: 36, scope: !29, inlinedAt: !36)
+!36 = !DILocation(line: 52, column: 27, scope: !33)
+!37 = !DILocation(line: 63, column: 46, scope: !5)
+!38 = !DILocation(line: 75, column: 25, scope: !5)
+!39 = !DILocation(line: 77, column: 24, scope: !5)
+!40 = !DILocation(line: 78, column: 32, scope: !5)
+!41 = !DILocation(line: 79, column: 24, scope: !5)
+!42 = !DILocation(line: 123, column: 24, scope: !5)
+!43 = !DILocation(line: 124, column: 24, scope: !5)
+!44 = !DILocation(line: 125, column: 32, scope: !5)
+!45 = !DILocation(line: 126, column: 24, scope: !5)
+!46 = !DILocation(line: 161, column: 43, scope: !5)
+!47 = !DILocation(line: 53, column: 43, scope: !5)
+!48 = !DILocation(line: 54, column: 31, scope: !5)
+!49 = !DILocation(line: 72, column: 41, scope: !5)
+!50 = !DILocation(line: 61, column: 51, scope: !5)
+!51 = !DILocation(line: 61, column: 35, scope: !5)
+!52 = !DILocation(line: 61, column: 62, scope: !5)
+!53 = !DILocation(line: 61, column: 115, scope: !5)
+!54 = !DILocation(line: 62, column: 35, scope: !5)
+!55 = !DILocation(line: 62, column: 42, scope: !5)
+!56 = !DILocation(line: 62, column: 95, scope: !5)
+!57 = !DILocation(line: 63, column: 42, scope: !5)
+!58 = !DILocation(line: 63, column: 35, scope: !5)
+!59 = !DILocation(line: 63, column: 51, scope: !5)
+!60 = !DILocation(line: 64, column: 35, scope: !5)
+!61 = !DILocation(line: 64, column: 51, scope: !5)
+!62 = !DILocation(line: 65, column: 58, scope: !5)
+!63 = !DILocation(line: 65, column: 35, scope: !5)
+!64 = !DILocation(line: 65, column: 69, scope: !5)
+!65 = !DILocation(line: 65, column: 123, scope: !5)
+!66 = !DILocation(line: 66, column: 36, scope: !5)
+!67 = !DILocation(line: 66, column: 43, scope: !5)
+!68 = !DILocation(line: 66, column: 96, scope: !5)
+!69 = !DILocation(line: 72, column: 39, scope: !5)
+!70 = !DILocation(line: 72, column: 57, scope: !5)
+!71 = !DILocation(line: 72, column: 35, scope: !5)
+!72 = !DILocation(line: 72, column: 68, scope: !5)
+!73 = !DILocation(line: 72, column: 129, scope: !5)
+!74 = !DILocation(line: 80, column: 35, scope: !5)
+!75 = !DILocation(line: 80, column: 85, scope: !5)
+!76 = !DILocation(line: 80, column: 146, scope: !5)
+!77 = !DILocation(line: 82, column: 24, scope: !5)
+!78 = !DILocation(line: 84, column: 17, scope: !5)
+!79 = !DILocation(line: 90, column: 53, scope: !5)
+!80 = !DILocation(line: 90, column: 35, scope: !5)
+!81 = !DILocation(line: 90, column: 64, scope: !5)
+!82 = !DILocation(line: 90, column: 125, scope: !5)
+!83 = !DILocation(line: 97, column: 24, scope: !5)
+!84 = !DILocation(line: 98, column: 35, scope: !5)
+!85 = !DILocation(line: 98, column: 81, scope: !5)
+!86 = !DILocation(line: 98, column: 142, scope: !5)
+!87 = !DILocation(line: 100, column: 24, scope: !5)
+!88 = !DILocation(line: 0, scope: !5)
+!89 = !DILocation(line: 111, column: 24, scope: !5)
+!90 = !DILocation(line: 113, column: 24, scope: !5)
+!91 = !DILocation(line: 116, column: 24, scope: !5)
+!92 = !DILocation(line: 118, column: 24, scope: !5)
+!93 = !DILocation(line: 119, column: 24, scope: !5)
+!94 = !DILocation(line: 121, column: 60, scope: !5)
+!95 = !DILocation(line: 121, column: 35, scope: !5)
+!96 = !DILocation(line: 121, column: 71, scope: !5)
+!97 = !DILocation(line: 121, column: 132, scope: !5)
+!98 = !DILocation(line: 127, column: 35, scope: !5)
+!99 = !DILocation(line: 127, column: 85, scope: !5)
+!100 = !DILocation(line: 127, column: 146, scope: !5)
+!101 = !DILocation(line: 129, column: 24, scope: !5)
+!102 = !DILocation(line: 131, column: 17, scope: !5)
+!103 = !DILocation(line: 134, column: 60, scope: !5)
+!104 = !DILocation(line: 134, column: 35, scope: !5)
+!105 = !DILocation(line: 134, column: 71, scope: !5)
+!106 = !DILocation(line: 134, column: 132, scope: !5)
+!107 = !DILocation(line: 139, column: 24, scope: !5)
+!108 = !DILocation(line: 140, column: 35, scope: !5)
+!109 = !DILocation(line: 140, column: 81, scope: !5)
+!110 = !DILocation(line: 140, column: 142, scope: !5)
+!111 = !DILocation(line: 142, column: 24, scope: !5)
+!112 = !DILocation(line: 151, column: 25, scope: !5)
+!113 = !DILocation(line: 153, column: 26, scope: !5)
+!114 = !DILocation(line: 156, column: 26, scope: !5)
+!115 = !DILocation(line: 158, column: 26, scope: !5)
+!116 = !DILocation(line: 159, column: 26, scope: !5)
+!117 = !DILocation(line: 161, column: 39, scope: !5)
+!118 = !DILocation(line: 161, column: 32, scope: !5)
+!119 = !DILocation(line: 161, column: 55, scope: !5)
+!120 = !DILocation(line: 162, column: 32, scope: !5)
+!121 = !DILocation(line: 162, column: 56, scope: !5)
+!122 = !DILocation(line: 53, column: 4, scope: !5)
diff --git a/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..8ba494e4a77bc50970fb7c8a5d907621a088bb1e
--- /dev/null
+++ b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx
@@ -0,0 +1,1188 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 // -- Begin function triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0
+.extern .shared .align 16 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
+                                        // @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0
+.visible .entry triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6,
+	.param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_7,
+	.param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_8,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_9,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_10
+)
+.reqntid 128
+{
+	.reg .pred 	%p<6>;
+	.reg .b16 	%rs<42>;
+	.reg .b32 	%r<217>;
+	.reg .b64 	%rd<96>;
+	.loc	1 18 0                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0
+
+// %bb.0:                               // %__nv_rsqrtf.exit
+	ld.param.b64 	%rd11, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6];
+	ld.param.b64 	%rd10, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5];
+	ld.param.b64 	%rd9, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4];
+	ld.param.b64 	%rd8, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3];
+	ld.param.b64 	%rd7, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2];
+	ld.param.b64 	%rd6, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1];
+	ld.param.b64 	%rd5, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0];
+$L__tmp0:
+	.loc	1 23 28                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:28
+	mov.u32 	%r23, %ctaid.x;
+	.loc	1 23 33                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:33
+	shl.b32 	%r24, %r23, 2;
+	.loc	1 24 44                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44
+	mov.u32 	%r25, %tid.x;
+	and.b32 	%r26, %r25, 96;
+	bfe.u32 	%r27, %r25, 5, 2;
+	.loc	1 24 23                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23
+	or.b32 	%r28, %r27, %r24;
+	.loc	1 26 37                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37
+	shl.b32 	%r29, %r25, 1;
+	and.b32 	%r30, %r29, 62;
+	.loc	1 29 19                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19
+	bfe.s32 	%r31, %r23, 29, 1;
+	shr.u32 	%r32, %r31, 27;
+	add.s32 	%r33, %r28, %r32;
+	shr.s32 	%r34, %r33, 5;
+	shl.b32 	%r35, %r28, 7;
+	shl.b32 	%r36, %r34, 15;
+	add.s32 	%r1, %r36, %r35;
+	add.s32 	%r2, %r1, 4096;
+	.loc	1 33 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:33:43
+	cvt.u64.u32 	%rd1, %r30;
+	.loc	1 39 57                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:57
+	or.b32 	%r37, %r2, %r30;
+	.loc	1 39 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34
+	mad.wide.s32 	%rd12, %r37, 2, %rd7;
+	.loc	1 39 68                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68
+	// begin inline asm
+	mov.u64 %rd13, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd13, 1.0;
+	// end inline asm
+	mov.b32 	%r19, 0;
+	mov.pred 	%p2, -1;
+	// begin inline asm
+	mov.u32 %r18, %r19;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r18 }, [ %rd12 + 0 ], %rd13;
+	// end inline asm
+	mov.b32 	{%rs1, %rs2}, %r18;
+	.loc	1 39 121                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:121
+	cvt.f32.bf16 	%r38, %rs1;
+	cvt.f32.bf16 	%r39, %rs2;
+	.loc	1 40 50                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:50
+	or.b32 	%r40, %r1, %r30;
+	.loc	1 40 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34
+	mad.wide.s32 	%rd14, %r40, 2, %rd7;
+	.loc	1 40 61                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61
+	// begin inline asm
+	mov.u64 %rd15, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd15, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r20, %r19;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r20 }, [ %rd14 + 0 ], %rd15;
+	// end inline asm
+	mov.b32 	{%rs3, %rs4}, %r20;
+	.loc	1 40 114                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114
+	cvt.f32.bf16 	%r41, %rs3;
+	cvt.f32.bf16 	%r42, %rs4;
+	.loc	1 39 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34
+	cvt.s64.s32 	%rd20, %r2;
+	or.b64 	%rd21, %rd20, %rd1;
+	shl.b64 	%rd22, %rd21, 1;
+	add.s64 	%rd23, %rd7, %rd22;
+	add.s64 	%rd16, %rd23, 128;
+	.loc	1 39 68                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68
+	// begin inline asm
+	mov.u64 %rd17, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r21, %r19;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r21 }, [ %rd16 + 0 ], %rd17;
+	// end inline asm
+	mov.b32 	{%rs5, %rs6}, %r21;
+	.loc	1 39 121                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:121
+	cvt.f32.bf16 	%r43, %rs5;
+	cvt.f32.bf16 	%r44, %rs6;
+	.loc	1 40 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34
+	cvt.s64.s32 	%rd24, %r1;
+	or.b64 	%rd25, %rd24, %rd1;
+	shl.b64 	%rd26, %rd25, 1;
+	add.s64 	%rd27, %rd7, %rd26;
+	add.s64 	%rd18, %rd27, 128;
+	.loc	1 40 61                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61
+	// begin inline asm
+	mov.u64 %rd19, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd19, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r22, %r19;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r22 }, [ %rd18 + 0 ], %rd19;
+	// end inline asm
+	mov.b32 	{%rs7, %rs8}, %r22;
+	.loc	1 40 114                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114
+	cvt.f32.bf16 	%r45, %rs7;
+	cvt.f32.bf16 	%r46, %rs8;
+	.loc	1 42 22                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:42:22
+	mul.f32 	%r47, %r43, %r43;
+	mul.f32 	%r48, %r44, %r44;
+	.loc	1 44 23                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:44:23
+	fma.rn.f32 	%r49, %r38, %r38, %r47;
+	fma.rn.f32 	%r50, %r39, %r39, %r48;
+	.loc	1 47 22                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:47:22
+	mul.f32 	%r51, %r45, %r45;
+	mul.f32 	%r52, %r46, %r46;
+	.loc	1 49 25                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:49:25
+	fma.rn.f32 	%r53, %r41, %r41, %r51;
+	fma.rn.f32 	%r54, %r42, %r42, %r52;
+	.loc	1 24 44                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44
+	and.b32 	%r55, %r25, 3;
+	.loc	1 24 23                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23
+	or.b32 	%r56, %r24, %r55;
+	.loc	1 26 37                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37
+	and.b32 	%r57, %r25, 124;
+	bfe.u32 	%r58, %r25, 2, 5;
+	.loc	1 29 19                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19
+	add.s32 	%r59, %r56, %r32;
+$L__tmp1:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r60, %r49, %r50;
+$L__tmp2:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r61, %r60, 16, 31, -1;
+$L__tmp3:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r62, %r60, %r61;
+$L__tmp4:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r63, %r62, 8, 31, -1;
+$L__tmp5:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r64, %r62, %r63;
+$L__tmp6:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r65, %r64, 4, 31, -1;
+$L__tmp7:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r66, %r64, %r65;
+$L__tmp8:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r67, %r66, 2, 31, -1;
+$L__tmp9:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r68, %r66, %r67;
+$L__tmp10:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r69, %r68, 1, 31, -1;
+$L__tmp11:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r70, %r68, %r69;
+$L__tmp12:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r71, %r53, %r54;
+$L__tmp13:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r72, %r71, 16, 31, -1;
+$L__tmp14:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r73, %r71, %r72;
+$L__tmp15:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r74, %r73, 8, 31, -1;
+$L__tmp16:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r75, %r73, %r74;
+$L__tmp17:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r76, %r75, 4, 31, -1;
+$L__tmp18:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r77, %r75, %r76;
+$L__tmp19:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r78, %r77, 2, 31, -1;
+$L__tmp20:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r79, %r77, %r78;
+$L__tmp21:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r80, %r79, 1, 31, -1;
+$L__tmp22:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r81, %r79, %r80;
+$L__tmp23:
+	.loc	1 63 46                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:46
+	shl.b32 	%r82, %r34, 7;
+	mov.b32 	%r83, 0f43000000;
+	.loc	1 75 25                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:75:25
+	div.full.f32 	%r84, %r81, %r83;
+	.loc	1 77 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:77:24
+	add.f32 	%r85, %r84, 0f358637BD;
+	.loc	1 78 32                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:78:32
+	rsqrt.approx.ftz.f32 	%r3, %r85;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	shr.u32 	%r86, %r26, 3;
+	mov.b32 	%r87, global_smem;
+	add.s32 	%r88, %r87, %r86;
+	st.shared.b32 	[%r88], %r3;
+	bar.sync 	0;
+	shl.b32 	%r89, %r55, 2;
+	add.s32 	%r90, %r87, %r89;
+	ld.shared.b32 	%r4, [%r90];
+	.loc	1 123 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:123:24
+	div.full.f32 	%r91, %r70, %r83;
+	.loc	1 124 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:124:24
+	add.f32 	%r92, %r91, 0f358637BD;
+	.loc	1 125 32                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:125:32
+	rsqrt.approx.ftz.f32 	%r5, %r92;
+	.loc	1 126 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24
+	bar.sync 	0;
+	st.shared.b32 	[%r88], %r5;
+	bar.sync 	0;
+	ld.shared.b32 	%r6, [%r90];
+	bfe.u32 	%r7, %r57, 2, 1;
+	and.b32 	%r93, %r58, 30;
+	shl.b32 	%r94, %r25, 3;
+	and.b32 	%r95, %r94, 120;
+	shl.b32 	%r96, %r26, 2;
+	shr.u32 	%r97, %r25, 2;
+	and.b32 	%r98, %r97, 4;
+	or.b32 	%r99, %r95, %r96;
+	xor.b32 	%r100, %r99, %r26;
+	or.b32 	%r101, %r100, %r98;
+	add.s32 	%r8, %r87, %r101;
+	xor.b32 	%r102, %r101, 4;
+	add.s32 	%r9, %r87, %r102;
+	shl.b32 	%r103, %r25, 7;
+	and.b32 	%r104, %r103, 896;
+	shl.b32 	%r105, %r55, 5;
+	xor.b32 	%r106, %r105, %r57;
+	or.b32 	%r107, %r106, %r104;
+	add.s32 	%r10, %r87, %r107;
+	xor.b32 	%r108, %r107, 4;
+	add.s32 	%r11, %r87, %r108;
+	shl.b32 	%r109, %r56, 7;
+	shl.b32 	%r110, %r59, 10;
+	and.b32 	%r111, %r110, -32768;
+	add.s32 	%r12, %r111, %r109;
+	add.s32 	%r13, %r12, 4097;
+	add.s32 	%r14, %r12, 4096;
+	shl.b32 	%r112, %r55, 7;
+	and.b32 	%r113, %r25, 28;
+	shr.u32 	%r114, %r25, 4;
+	and.b32 	%r115, %r114, 2;
+	shr.u32 	%r116, %r25, 1;
+	and.b32 	%r117, %r116, 32;
+	or.b32 	%r118, %r112, %r115;
+	or.b32 	%r119, %r105, %r113;
+	xor.b32 	%r120, %r119, %r117;
+	or.b32 	%r121, %r120, %r118;
+	add.s32 	%r15, %r87, %r121;
+	xor.b32 	%r122, %r121, 64;
+	add.s32 	%r16, %r87, %r122;
+	shl.b32 	%r123, %r55, 3;
+	shl.b32 	%r124, %r25, 2;
+	and.b32 	%r125, %r124, 480;
+	and.b32 	%r126, %r116, 2;
+	or.b32 	%r127, %r123, %r125;
+	xor.b32 	%r128, %r127, %r26;
+	add.s32 	%r129, %r87, %r126;
+	add.s32 	%r17, %r129, %r128;
+	.loc	1 53 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43
+	cvt.u64.u32 	%rd2, %r93;
+	cvt.s64.s32 	%rd3, %r82;
+	cvt.s64.s32 	%rd4, %r35;
+	mov.b64 	%rd95, 0;
+	mov.pred 	%p5, %p2;
+$L__BB0_1:                              // =>This Inner Loop Header: Depth=1
+	.loc	1 0 43                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0:43
+	mov.pred 	%p1, %p5;
+	setp.ne.b32 	%p4, %r7, 0;
+	setp.eq.b32 	%p3, %r7, 0;
+	.loc	1 54 31                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:54:31
+	or.b64 	%rd74, %rd95, %rd1;
+	.loc	1 72 41                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:41
+	or.b64 	%rd75, %rd95, %rd2;
+	.loc	1 61 51                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:51
+	cvt.u32.u64 	%r140, %rd74;
+	or.b32 	%r141, %r1, %r140;
+	.loc	1 61 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:35
+	mad.wide.s32 	%rd29, %r141, 2, %rd7;
+	.loc	1 61 62                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:62
+	// begin inline asm
+	mov.u64 %rd28, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd28, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r130, %r19;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r130 }, [ %rd29 + 0 ], %rd28;
+	// end inline asm
+	mov.b32 	{%rs26, %rs27}, %r130;
+	.loc	1 61 115                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:115
+	cvt.f32.bf16 	%r142, %rs26;
+	cvt.f32.bf16 	%r143, %rs27;
+	.loc	1 62 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:35
+	shl.b64 	%rd76, %rd74, 1;
+	add.s64 	%rd31, %rd8, %rd76;
+	.loc	1 62 42                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:42
+	// begin inline asm
+	mov.u64 %rd30, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd30, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r131, %r19;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r131 }, [ %rd31 + 0 ], %rd30;
+	// end inline asm
+	mov.b32 	{%rs28, %rs29}, %r131;
+	.loc	1 62 95                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:95
+	cvt.f32.bf16 	%r144, %rs28;
+	cvt.f32.bf16 	%r145, %rs29;
+	.loc	1 63 42                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:42
+	or.b64 	%rd77, %rd74, %rd3;
+	.loc	1 63 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:35
+	shl.b64 	%rd78, %rd77, 2;
+	add.s64 	%rd33, %rd9, %rd78;
+	.loc	1 63 51                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:51
+	// begin inline asm
+	mov.u64 %rd32, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd32, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r132, %r19;
+	mov.u32 %r133, %r19;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r132, %r133 }, [ %rd33 + 0 ], %rd32;
+	// end inline asm
+	.loc	1 64 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:35
+	add.s64 	%rd35, %rd10, %rd78;
+	.loc	1 64 51                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:51
+	// begin inline asm
+	mov.u64 %rd34, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd34, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r134, %r19;
+	mov.u32 %r135, %r19;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r134, %r135 }, [ %rd35 + 0 ], %rd34;
+	// end inline asm
+	bar.sync 	0;
+	st.shared.b32 	[%r8], %r134;
+	st.shared.b32 	[%r9+512], %r135;
+	bar.sync 	0;
+	ld.shared.b32 	%r146, [%r10];
+	ld.shared.b32 	%r147, [%r11];
+	.loc	1 65 58                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:58
+	or.b32 	%r148, %r2, %r140;
+	.loc	1 65 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:35
+	mad.wide.s32 	%rd37, %r148, 2, %rd7;
+	.loc	1 65 69                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69
+	// begin inline asm
+	mov.u64 %rd36, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd36, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r136, %r19;
+	@%p2 ld.global.L1::evict_first.L2::cache_hint.b32 { %r136 }, [ %rd37 + 0 ], %rd36;
+	// end inline asm
+	mov.b32 	{%rs30, %rs31}, %r136;
+	.loc	1 65 123                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:123
+	cvt.f32.bf16 	%r149, %rs30;
+	cvt.f32.bf16 	%r150, %rs31;
+	.loc	1 66 36                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:36
+	add.s64 	%rd39, %rd11, %rd76;
+	.loc	1 66 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:43
+	// begin inline asm
+	mov.u64 %rd38, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd38, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r137, %r19;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r137 }, [ %rd39 + 0 ], %rd38;
+	// end inline asm
+	mov.b32 	{%rs32, %rs33}, %r137;
+	.loc	1 66 96                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:96
+	cvt.f32.bf16 	%r151, %rs32;
+	cvt.f32.bf16 	%r152, %rs33;
+	.loc	1 72 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35
+	cvt.s64.s32 	%rd79, %r12;
+	.loc	1 72 57                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:57
+	cvt.u32.u64 	%r153, %rd75;
+	.loc	1 72 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35
+	cvt.s64.s32 	%rd80, %rd75;
+	add.s64 	%rd81, %rd79, %rd80;
+	shl.b64 	%rd82, %rd81, 1;
+	add.s64 	%rd83, %rd7, %rd82;
+	add.s64 	%rd41, %rd83, 2;
+	add.s64 	%rd43, %rd83, 66;
+	.loc	1 72 68                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:68
+	// begin inline asm
+	mov.u64 %rd40, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd40, 1.0;
+	// end inline asm
+	mov.b16 	%rs10, 0;
+	// begin inline asm
+	mov.u16 %rs9, %rs10;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd41 + 0 ], %rd40;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd42, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd42, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs11, %rs10;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd43 + 0 ], %rd42;
+	// end inline asm
+	.loc	1 72 129                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129
+	cvt.f32.bf16 	%r154, %rs9;
+	cvt.f32.bf16 	%r155, %rs11;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	mul.f32 	%r156, %r4, %r154;
+	mul.f32 	%r157, %r4, %r155;
+	.loc	1 80 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:35
+	shl.b64 	%rd84, %rd75, 1;
+	add.s64 	%rd53, %rd8, %rd84;
+	add.s64 	%rd45, %rd53, 2;
+	add.s64 	%rd47, %rd53, 66;
+	.loc	1 80 85                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:85
+	// begin inline asm
+	mov.u64 %rd44, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd44, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs12, %rs10;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd45 + 0 ], %rd44;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd46, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd46, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs13, %rs10;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd47 + 0 ], %rd46;
+	// end inline asm
+	.loc	1 80 146                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146
+	cvt.f32.bf16 	%r158, %rs12;
+	cvt.f32.bf16 	%r159, %rs13;
+	.loc	1 84 17                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17
+	neg.f32 	%r160, %r156;
+	fma.rn.f32 	%r161, %r160, %r158, 0f00000000;
+	neg.f32 	%r162, %r157;
+	fma.rn.f32 	%r163, %r162, %r159, 0f00000000;
+	.loc	1 90 53                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:53
+	or.b32 	%r164, %r12, %r153;
+	.loc	1 90 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:35
+	mad.wide.s32 	%rd49, %r164, 2, %rd7;
+	add.s64 	%rd51, %rd83, 64;
+	.loc	1 90 64                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:64
+	// begin inline asm
+	mov.u64 %rd48, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd48, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs14, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs14 }, [ %rd49 + 0 ], %rd48;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd50, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd50, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs15, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd51 + 0 ], %rd50;
+	// end inline asm
+	.loc	1 90 125                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125
+	cvt.f32.bf16 	%r165, %rs14;
+	cvt.f32.bf16 	%r166, %rs15;
+	.loc	1 97 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24
+	mul.f32 	%r167, %r4, %r165;
+	mul.f32 	%r168, %r4, %r166;
+	.loc	1 98 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:35
+	add.s64 	%rd55, %rd53, 64;
+	.loc	1 98 81                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:81
+	// begin inline asm
+	mov.u64 %rd52, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd52, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs16, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs16 }, [ %rd53 + 0 ], %rd52;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd54, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd54, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs17, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd55 + 0 ], %rd54;
+	// end inline asm
+	.loc	1 98 142                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142
+	cvt.f32.bf16 	%r169, %rs16;
+	cvt.f32.bf16 	%r170, %rs17;
+	.loc	1 100 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24
+	mul.f32 	%r171, %r167, %r169;
+	mul.f32 	%r172, %r168, %r170;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r173, %r161, %r171, %p3;
+	selp.f32 	%r174, %r163, %r172, %p3;
+	.loc	1 111 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:111:24
+	mul.f32 	%r175, %r3, %r142;
+	mul.f32 	%r176, %r3, %r143;
+	.loc	1 113 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:113:24
+	mul.f32 	%r177, %r175, %r144;
+	mul.f32 	%r178, %r176, %r145;
+	.loc	1 116 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:116:24
+	mul.f32 	%r179, %r177, %r132;
+	mul.f32 	%r180, %r178, %r133;
+	bar.sync 	0;
+	st.shared.b32 	[%r8], %r179;
+	st.shared.b32 	[%r9+512], %r180;
+	bar.sync 	0;
+	ld.shared.b32 	%r181, [%r10];
+	ld.shared.b32 	%r182, [%r11];
+	.loc	1 119 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24
+	fma.rn.f32 	%r183, %r146, %r173, %r181;
+	fma.rn.f32 	%r184, %r147, %r174, %r182;
+	.loc	1 121 60                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:60
+	or.b32 	%r185, %r13, %r153;
+	.loc	1 121 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:35
+	mad.wide.s32 	%rd57, %r185, 2, %rd7;
+	cvt.s64.s32 	%rd85, %r13;
+	add.s64 	%rd86, %rd85, %rd80;
+	shl.b64 	%rd87, %rd86, 1;
+	add.s64 	%rd88, %rd7, %rd87;
+	add.s64 	%rd59, %rd88, 64;
+	.loc	1 121 71                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:71
+	// begin inline asm
+	mov.u64 %rd56, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd56, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs18, %rs10;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs18 }, [ %rd57 + 0 ], %rd56;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd58, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd58, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs19, %rs10;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd59 + 0 ], %rd58;
+	// end inline asm
+	.loc	1 121 132                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:132
+	cvt.f32.bf16 	%r186, %rs18;
+	cvt.f32.bf16 	%r187, %rs19;
+	.loc	1 126 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24
+	mul.f32 	%r188, %r6, %r186;
+	mul.f32 	%r189, %r6, %r187;
+	.loc	1 127 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:35
+	add.s64 	%rd69, %rd11, %rd84;
+	add.s64 	%rd61, %rd69, 2;
+	add.s64 	%rd63, %rd69, 66;
+	.loc	1 127 85                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:85
+	// begin inline asm
+	mov.u64 %rd60, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd60, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs20, %rs10;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs20 }, [ %rd61 + 0 ], %rd60;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd62, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd62, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs21, %rs10;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs21 }, [ %rd63 + 0 ], %rd62;
+	// end inline asm
+	.loc	1 127 146                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:146
+	cvt.f32.bf16 	%r190, %rs20;
+	cvt.f32.bf16 	%r191, %rs21;
+	.loc	1 131 17                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:131:17
+	neg.f32 	%r192, %r188;
+	fma.rn.f32 	%r193, %r192, %r190, 0f00000000;
+	neg.f32 	%r194, %r189;
+	fma.rn.f32 	%r195, %r194, %r191, 0f00000000;
+	.loc	1 134 60                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:60
+	or.b32 	%r196, %r14, %r153;
+	.loc	1 134 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:35
+	mad.wide.s32 	%rd65, %r196, 2, %rd7;
+	cvt.s64.s32 	%rd89, %r14;
+	add.s64 	%rd90, %rd89, %rd80;
+	shl.b64 	%rd91, %rd90, 1;
+	add.s64 	%rd92, %rd7, %rd91;
+	add.s64 	%rd67, %rd92, 64;
+	.loc	1 134 71                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:71
+	// begin inline asm
+	mov.u64 %rd64, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd64, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs22, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs22 }, [ %rd65 + 0 ], %rd64;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd66, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd66, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs23, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs23 }, [ %rd67 + 0 ], %rd66;
+	// end inline asm
+	.loc	1 134 132                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:132
+	cvt.f32.bf16 	%r197, %rs22;
+	cvt.f32.bf16 	%r198, %rs23;
+	.loc	1 139 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:139:24
+	mul.f32 	%r199, %r6, %r197;
+	mul.f32 	%r200, %r6, %r198;
+	.loc	1 140 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:35
+	add.s64 	%rd71, %rd69, 64;
+	.loc	1 140 81                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:81
+	// begin inline asm
+	mov.u64 %rd68, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd68, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs24, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs24 }, [ %rd69 + 0 ], %rd68;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd70, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd70, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs25, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs25 }, [ %rd71 + 0 ], %rd70;
+	// end inline asm
+	.loc	1 140 142                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:142
+	cvt.f32.bf16 	%r201, %rs24;
+	cvt.f32.bf16 	%r202, %rs25;
+	.loc	1 142 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:142:24
+	mul.f32 	%r203, %r199, %r201;
+	mul.f32 	%r204, %r200, %r202;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r205, %r193, %r203, %p3;
+	selp.f32 	%r206, %r195, %r204, %p3;
+	.loc	1 151 25                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:151:25
+	mul.f32 	%r207, %r5, %r149;
+	mul.f32 	%r208, %r5, %r150;
+	.loc	1 153 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:153:26
+	mul.f32 	%r209, %r207, %r151;
+	mul.f32 	%r210, %r208, %r152;
+	.loc	1 156 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26
+	mul.f32 	%r211, %r209, %r132;
+	mul.f32 	%r212, %r210, %r133;
+	bar.sync 	0;
+	st.shared.b32 	[%r8], %r211;
+	st.shared.b32 	[%r9+512], %r212;
+	bar.sync 	0;
+	ld.shared.b32 	%r213, [%r10];
+	ld.shared.b32 	%r214, [%r11];
+	.loc	1 159 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:159:26
+	fma.rn.f32 	%r215, %r146, %r205, %r213;
+	fma.rn.f32 	%r216, %r147, %r206, %r214;
+	.loc	1 161 39                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:39
+	or.b64 	%rd93, %rd74, %rd4;
+	.loc	1 161 32                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:32
+	shl.b64 	%rd94, %rd93, 1;
+	add.s64 	%rd72, %rd5, %rd94;
+	.loc	1 161 55                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:55
+	cvt.rn.bf16.f32 	%rs34, %r183;
+	cvt.rn.bf16.f32 	%rs35, %r184;
+	bar.sync 	0;
+	st.shared.b16 	[%r15], %rs34;
+	st.shared.b16 	[%r16], %rs35;
+	bar.sync 	0;
+	ld.shared.b16 	%rs36, [%r17];
+	ld.shared.b16 	%rs37, [%r17+4];
+	mov.b32 	%r138, {%rs36, %rs37};
+	// begin inline asm
+	@%p2 st.global.b32 [ %rd72 + 0 ], { %r138 };
+	// end inline asm
+	.loc	1 162 32                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:32
+	add.s64 	%rd73, %rd6, %rd94;
+	.loc	1 162 56                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:56
+	cvt.rn.bf16.f32 	%rs38, %r215;
+	cvt.rn.bf16.f32 	%rs39, %r216;
+	bar.sync 	0;
+	st.shared.b16 	[%r15], %rs38;
+	st.shared.b16 	[%r16], %rs39;
+	bar.sync 	0;
+	ld.shared.b16 	%rs40, [%r17];
+	ld.shared.b16 	%rs41, [%r17+4];
+	mov.b32 	%r139, {%rs40, %rs41};
+	// begin inline asm
+	@%p2 st.global.b32 [ %rd73 + 0 ], { %r139 };
+	// end inline asm
+	mov.b64 	%rd95, 64;
+	mov.pred 	%p5, 0;
+	.loc	1 53 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43
+	@%p1 bra 	$L__BB0_1;
+// %bb.2:
+	.loc	1 53 4                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:4
+	ret;
+$L__tmp24:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 5                                   // DW_FORM_data2
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 456                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x1c1 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 98
+.b8 118
+.b8 113
+.b8 104
+.b8 106
+.b8 116
+.b8 121
+.b8 103
+.b8 55
+.b8 102
+.b8 118
+.b8 120
+.b8 122
+.b8 119
+.b8 116
+.b8 98
+.b8 116
+.b8 116
+.b8 52
+.b8 118
+.b8 114
+.b8 100
+.b8 107
+.b8 98
+.b8 110
+.b8 98
+.b8 54
+.b8 110
+.b8 51
+.b8 50
+.b8 102
+.b8 110
+.b8 114
+.b8 105
+.b8 106
+.b8 106
+.b8 112
+.b8 108
+.b8 51
+.b8 118
+.b8 118
+.b8 52
+.b8 99
+.b8 102
+.b8 113
+.b8 100
+.b8 52
+.b8 109
+.b8 122
+.b8 110
+.b8 114
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 98
+.b8 118
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x6d DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 114
+.b8 109
+.b8 115
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 95
+.b8 116
+.b8 111
+.b8 95
+.b8 99
+.b8 111
+.b8 112
+.b8 121
+.b8 95
+.b8 97
+.b8 100
+.b8 100
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 110
+.b8 101
+.b8 103
+.b8 95
+.b8 115
+.b8 112
+.b8 108
+.b8 105
+.b8 116
+.b8 95
+.b8 115
+.b8 112
+.b8 108
+.b8 105
+.b8 116
+.b8 95
+.b8 119
+.b8 105
+.b8 116
+.b8 104
+.b8 95
+.b8 115
+.b8 105
+.b8 122
+.b8 101
+.b8 115
+.b8 95
+.b8 115
+.b8 116
+.b8 97
+.b8 99
+.b8 107
+.b8 95
+.b8 117
+.b8 110
+.b8 98
+.b8 105
+.b8 110
+.b8 100
+.b8 95
+.b8 117
+.b8 110
+.b8 115
+.b8 113
+.b8 117
+.b8 101
+.b8 101
+.b8 122
+.b8 101
+.b8 95
+.b8 118
+.b8 105
+.b8 101
+.b8 119
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x151:0x7a DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x166:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp12                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 51                                  // DW_AT_call_line
+.b8 25                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x17e:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp12                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 4                                   // Abbrev [4] 0x198:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp12                          // DW_AT_low_pc
+.b64 $L__tmp23                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 52                                  // DW_AT_call_line
+.b8 27                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x1b0:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp12                          // DW_AT_low_pc
+.b64 $L__tmp23                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..379b8a0b14f9ca7ecd9c4707e3e09d300cd2a4d8
--- /dev/null
+++ b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source
@@ -0,0 +1,972 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc213 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0)
+#loc215 = loc(unknown)
+#loc218 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0)
+#loc222 = loc("in_out_ptr0"(#loc))
+#loc223 = loc("in_out_ptr1"(#loc))
+#loc224 = loc("in_ptr0"(#loc))
+#loc225 = loc("in_ptr1"(#loc))
+#loc226 = loc("in_ptr2"(#loc))
+#loc227 = loc("in_ptr3"(#loc))
+#loc228 = loc("in_ptr4"(#loc))
+#loc229 = loc("xnumel"(#loc))
+#loc230 = loc("r0_numel"(#loc))
+#loc432 = loc("input"(#loc213))
+#loc433 = loc("a"(#loc218))
+#loc434 = loc("b"(#loc218))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 73728 : i32 loc(#loc231)
+    %r0_numel_1 = arith.constant 128 : i32 loc(#loc232)
+    %xoffset = tt.get_program_id x : i32 loc(#loc233)
+    %xoffset_2 = arith.constant 4 : i32 loc(#loc234)
+    %xoffset_3 = arith.constant 4 : i32 loc(#loc234)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc234)
+    %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc235)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc236)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<4x1xi32> loc(#loc237)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<4x1xi32> loc(#loc237)
+    %xmask = arith.constant true loc(#loc238)
+    %xmask_8 = arith.constant dense<true> : tensor<4x64xi1> loc(#loc238)
+    %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc239)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc240)
+    %x0 = arith.constant 32 : i32 loc(#loc241)
+    %x0_10 = arith.constant 32 : i32 loc(#loc241)
+    %x0_11 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc241)
+    %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<4x1xi32> loc(#loc241)
+    %x1 = arith.constant 32 : i32 loc(#loc242)
+    %x1_13 = arith.constant 32 : i32 loc(#loc242)
+    %x1_14 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc242)
+    %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<4x1xi32> loc(#loc242)
+    %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc243)
+    %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc243)
+    %_tmp10 = arith.constant 0.000000e+00 : f32 loc(#loc244)
+    %_tmp10_17 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc244)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc15)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc15)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15)
+    %2 = arith.bitcast %c64_i32 : i32 to i32 loc(#loc15)
+    %3 = ub.poison : i32 loc(#loc15)
+    %_tmp10_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_16, %_tmp10_24 = %_tmp10_17) -> (tensor<4x64xf32>, tensor<4x64xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc246)
+      %r0_index_25 = arith.addi %r0_index, %r0_base_9 : tensor<1x64xi32> loc(#loc246)
+      %r0_mask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc247)
+      %r0_mask_26 = arith.cmpi slt, %r0_index_25, %r0_mask : tensor<1x64xi32> loc(#loc247)
+      %tmp0 = arith.constant 4096 : i32 loc(#loc248)
+      %tmp0_27 = arith.constant 4096 : i32 loc(#loc248)
+      %tmp0_28 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc248)
+      %tmp0_29 = arith.addi %tmp0_28, %r0_index_25 : tensor<1x64xi32> loc(#loc248)
+      %tmp0_30 = arith.constant 128 : i32 loc(#loc249)
+      %tmp0_31 = arith.constant 128 : i32 loc(#loc249)
+      %tmp0_32 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc249)
+      %tmp0_33 = arith.muli %tmp0_32, %x0_12 : tensor<4x1xi32> loc(#loc249)
+      %tmp0_34 = tt.broadcast %tmp0_29 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc250)
+      %tmp0_35 = tt.broadcast %tmp0_33 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc250)
+      %tmp0_36 = arith.addi %tmp0_34, %tmp0_35 : tensor<4x64xi32> loc(#loc250)
+      %tmp0_37 = arith.constant 36864 : i32 loc(#loc251)
+      %tmp0_38 = arith.constant 36864 : i32 loc(#loc251)
+      %tmp0_39 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc251)
+      %tmp0_40 = arith.muli %tmp0_39, %x1_15 : tensor<4x1xi32> loc(#loc251)
+      %tmp0_41 = tt.broadcast %tmp0_40 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc252)
+      %tmp0_42 = arith.addi %tmp0_36, %tmp0_41 : tensor<4x64xi32> loc(#loc252)
+      %tmp0_43 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x64x!tt.ptr<bf16>> loc(#loc253)
+      %tmp0_44 = tt.addptr %tmp0_43, %tmp0_42 : tensor<4x64x!tt.ptr<bf16>>, tensor<4x64xi32> loc(#loc253)
+      %tmp0_45 = arith.constant 0.000000e+00 : f32 loc(#loc254)
+      %tmp0_46 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc254)
+      %tmp0_47 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc254)
+      %tmp0_48 = arith.truncf %tmp0_47 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc254)
+      %tmp0_49 = tt.load %tmp0_44, %tmp0_46, %tmp0_48 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>> loc(#loc254)
+      %tmp0_50 = arith.extf %tmp0_49 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc255)
+      %tmp6 = arith.constant 128 : i32 loc(#loc256)
+      %tmp6_51 = arith.constant 128 : i32 loc(#loc256)
+      %tmp6_52 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc256)
+      %tmp6_53 = arith.muli %tmp6_52, %x0_12 : tensor<4x1xi32> loc(#loc256)
+      %tmp6_54 = tt.broadcast %r0_index_25 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc257)
+      %tmp6_55 = tt.broadcast %tmp6_53 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc257)
+      %tmp6_56 = arith.addi %tmp6_54, %tmp6_55 : tensor<4x64xi32> loc(#loc257)
+      %tmp6_57 = arith.constant 36864 : i32 loc(#loc258)
+      %tmp6_58 = arith.constant 36864 : i32 loc(#loc258)
+      %tmp6_59 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc258)
+      %tmp6_60 = arith.muli %tmp6_59, %x1_15 : tensor<4x1xi32> loc(#loc258)
+      %tmp6_61 = tt.broadcast %tmp6_60 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc259)
+      %tmp6_62 = arith.addi %tmp6_56, %tmp6_61 : tensor<4x64xi32> loc(#loc259)
+      %tmp6_63 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x64x!tt.ptr<bf16>> loc(#loc260)
+      %tmp6_64 = tt.addptr %tmp6_63, %tmp6_62 : tensor<4x64x!tt.ptr<bf16>>, tensor<4x64xi32> loc(#loc260)
+      %tmp6_65 = arith.constant 0.000000e+00 : f32 loc(#loc261)
+      %tmp6_66 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc261)
+      %tmp6_67 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc261)
+      %tmp6_68 = arith.truncf %tmp6_67 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc261)
+      %tmp6_69 = tt.load %tmp6_64, %tmp6_66, %tmp6_68 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>> loc(#loc261)
+      %tmp6_70 = arith.extf %tmp6_69 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc262)
+      %tmp2 = arith.mulf %tmp0_50, %tmp0_50 : tensor<4x64xf32> loc(#loc263)
+      %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<4x64xf32> loc(#loc264)
+      %_tmp4_71 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc265)
+      %_tmp4_72 = arith.select %_tmp4_71, %tmp5, %_tmp4_23 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc265)
+      %tmp8 = arith.mulf %tmp6_70, %tmp6_70 : tensor<4x64xf32> loc(#loc266)
+      %tmp11 = arith.addf %_tmp10_24, %tmp8 : tensor<4x64xf32> loc(#loc267)
+      %_tmp10_73 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc268)
+      %_tmp10_74 = arith.select %_tmp10_73, %tmp11, %_tmp10_24 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc268)
+      scf.yield %_tmp4_72, %_tmp10_74 : tensor<4x64xf32>, tensor<4x64xf32> loc(#loc39)
+    } loc(#loc435)
+    %tmp4 = tt.call @"triton.language.standard.sum__fp32S4_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#0) : (tensor<4x64xf32>) -> tensor<4xf32> loc(#loc269)
+    %tmp4_19 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc270)
+    %tmp10 = tt.call @"triton.language.standard.sum__fp32S4_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#1) : (tensor<4x64xf32>) -> tensor<4xf32> loc(#loc271)
+    %tmp10_20 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc272)
+    %c0_i32_21 = arith.constant 0 : i32 loc(#loc44)
+    %c64_i32_22 = arith.constant 64 : i32 loc(#loc44)
+    %4 = arith.bitcast %c0_i32_21 : i32 to i32 loc(#loc44)
+    %5 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc44)
+    %6 = arith.bitcast %c64_i32_22 : i32 to i32 loc(#loc44)
+    %7 = ub.poison : i32 loc(#loc44)
+    scf.for %r0_offset = %4 to %5 step %6  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc273)
+      %r0_index_23 = arith.addi %r0_index, %r0_base_9 : tensor<1x64xi32> loc(#loc273)
+      %r0_mask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc274)
+      %r0_mask_24 = arith.cmpi slt, %r0_index_23, %r0_mask : tensor<1x64xi32> loc(#loc274)
+      %r0_3 = arith.constant 2 : i32 loc(#loc275)
+      %r0_3_25 = arith.constant 2 : i32 loc(#loc275)
+      %r0_3_26 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc275)
+      %r0_3_27 = arith.remsi %r0_index_23, %r0_3_26 : tensor<1x64xi32> loc(#loc275)
+      %r0_4 = arith.constant 2 : i32 loc(#loc276)
+      %r0_4_28 = arith.constant 2 : i32 loc(#loc276)
+      %r0_4_29 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc276)
+      %r0_4_30 = arith.divsi %r0_index_23, %r0_4_29 : tensor<1x64xi32> loc(#loc276)
+      %tmp50 = arith.constant 128 : i32 loc(#loc277)
+      %tmp50_31 = arith.constant 128 : i32 loc(#loc277)
+      %tmp50_32 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc277)
+      %tmp50_33 = arith.muli %tmp50_32, %x0_12 : tensor<4x1xi32> loc(#loc277)
+      %tmp50_34 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc278)
+      %tmp50_35 = tt.broadcast %tmp50_33 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc278)
+      %tmp50_36 = arith.addi %tmp50_34, %tmp50_35 : tensor<4x64xi32> loc(#loc278)
+      %tmp50_37 = arith.constant 36864 : i32 loc(#loc279)
+      %tmp50_38 = arith.constant 36864 : i32 loc(#loc279)
+      %tmp50_39 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc279)
+      %tmp50_40 = arith.muli %tmp50_39, %x1_15 : tensor<4x1xi32> loc(#loc279)
+      %tmp50_41 = tt.broadcast %tmp50_40 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc280)
+      %tmp50_42 = arith.addi %tmp50_36, %tmp50_41 : tensor<4x64xi32> loc(#loc280)
+      %tmp50_43 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x64x!tt.ptr<bf16>> loc(#loc281)
+      %tmp50_44 = tt.addptr %tmp50_43, %tmp50_42 : tensor<4x64x!tt.ptr<bf16>>, tensor<4x64xi32> loc(#loc281)
+      %tmp50_45 = arith.constant 0.000000e+00 : f32 loc(#loc282)
+      %tmp50_46 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc282)
+      %tmp50_47 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc282)
+      %tmp50_48 = arith.truncf %tmp50_47 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc282)
+      %tmp50_49 = tt.load %tmp50_44, %tmp50_46, %tmp50_48 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>> loc(#loc282)
+      %tmp50_50 = arith.extf %tmp50_49 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc283)
+      %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>> loc(#loc284)
+      %tmp58_51 = tt.addptr %tmp58, %r0_index_23 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc284)
+      %tmp58_52 = arith.constant 0.000000e+00 : f32 loc(#loc285)
+      %tmp58_53 = arith.constant dense<0.000000e+00> : tensor<1x64xf32> loc(#loc285)
+      %tmp58_54 = arith.truncf %tmp58_53 : tensor<1x64xf32> to tensor<1x64xbf16> loc(#loc285)
+      %tmp58_55 = tt.load %tmp58_51, %r0_mask_24, %tmp58_54 evictionPolicy = evict_last : tensor<1x64x!tt.ptr<bf16>> loc(#loc285)
+      %tmp58_56 = arith.extf %tmp58_55 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc286)
+      %tmp63 = arith.constant 128 : i32 loc(#loc287)
+      %tmp63_57 = arith.constant 128 : i32 loc(#loc287)
+      %tmp63_58 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc287)
+      %tmp63_59 = arith.muli %tmp63_58, %x1_15 : tensor<4x1xi32> loc(#loc287)
+      %tmp63_60 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc288)
+      %tmp63_61 = tt.broadcast %tmp63_59 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc288)
+      %tmp63_62 = arith.addi %tmp63_60, %tmp63_61 : tensor<4x64xi32> loc(#loc288)
+      %tmp63_63 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<4x64x!tt.ptr<f32>> loc(#loc289)
+      %tmp63_64 = tt.addptr %tmp63_63, %tmp63_62 : tensor<4x64x!tt.ptr<f32>>, tensor<4x64xi32> loc(#loc289)
+      %tmp63_65 = arith.constant 0.000000e+00 : f32 loc(#loc290)
+      %tmp63_66 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc290)
+      %tmp63_67 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc290)
+      %tmp63_68 = tt.load %tmp63_64, %tmp63_66, %tmp63_67 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<f32>> loc(#loc290)
+      %tmp66 = arith.constant 128 : i32 loc(#loc291)
+      %tmp66_69 = arith.constant 128 : i32 loc(#loc291)
+      %tmp66_70 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc291)
+      %tmp66_71 = arith.muli %tmp66_70, %x1_15 : tensor<4x1xi32> loc(#loc291)
+      %tmp66_72 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc292)
+      %tmp66_73 = tt.broadcast %tmp66_71 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc292)
+      %tmp66_74 = arith.addi %tmp66_72, %tmp66_73 : tensor<4x64xi32> loc(#loc292)
+      %tmp66_75 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<4x64x!tt.ptr<f32>> loc(#loc293)
+      %tmp66_76 = tt.addptr %tmp66_75, %tmp66_74 : tensor<4x64x!tt.ptr<f32>>, tensor<4x64xi32> loc(#loc293)
+      %tmp66_77 = arith.constant 0.000000e+00 : f32 loc(#loc294)
+      %tmp66_78 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc294)
+      %tmp66_79 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc294)
+      %tmp66_80 = tt.load %tmp66_76, %tmp66_78, %tmp66_79 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<f32>> loc(#loc294)
+      %tmp96 = arith.constant 4096 : i32 loc(#loc295)
+      %tmp96_81 = arith.constant 4096 : i32 loc(#loc295)
+      %tmp96_82 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc295)
+      %tmp96_83 = arith.addi %tmp96_82, %r0_index_23 : tensor<1x64xi32> loc(#loc295)
+      %tmp96_84 = arith.constant 128 : i32 loc(#loc296)
+      %tmp96_85 = arith.constant 128 : i32 loc(#loc296)
+      %tmp96_86 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc296)
+      %tmp96_87 = arith.muli %tmp96_86, %x0_12 : tensor<4x1xi32> loc(#loc296)
+      %tmp96_88 = tt.broadcast %tmp96_83 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc297)
+      %tmp96_89 = tt.broadcast %tmp96_87 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc297)
+      %tmp96_90 = arith.addi %tmp96_88, %tmp96_89 : tensor<4x64xi32> loc(#loc297)
+      %tmp96_91 = arith.constant 36864 : i32 loc(#loc298)
+      %tmp96_92 = arith.constant 36864 : i32 loc(#loc298)
+      %tmp96_93 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc298)
+      %tmp96_94 = arith.muli %tmp96_93, %x1_15 : tensor<4x1xi32> loc(#loc298)
+      %tmp96_95 = tt.broadcast %tmp96_94 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc299)
+      %tmp96_96 = arith.addi %tmp96_90, %tmp96_95 : tensor<4x64xi32> loc(#loc299)
+      %tmp96_97 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x64x!tt.ptr<bf16>> loc(#loc300)
+      %tmp96_98 = tt.addptr %tmp96_97, %tmp96_96 : tensor<4x64x!tt.ptr<bf16>>, tensor<4x64xi32> loc(#loc300)
+      %tmp96_99 = arith.constant 0.000000e+00 : f32 loc(#loc301)
+      %tmp96_100 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc301)
+      %tmp96_101 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc301)
+      %tmp96_102 = arith.truncf %tmp96_101 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc301)
+      %tmp96_103 = tt.load %tmp96_98, %tmp96_100, %tmp96_102 evictionPolicy = evict_first : tensor<4x64x!tt.ptr<bf16>> loc(#loc301)
+      %tmp96_104 = arith.extf %tmp96_103 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc302)
+      %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>> loc(#loc303)
+      %tmp102_105 = tt.addptr %tmp102, %r0_index_23 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc303)
+      %tmp102_106 = arith.constant 0.000000e+00 : f32 loc(#loc304)
+      %tmp102_107 = arith.constant dense<0.000000e+00> : tensor<1x64xf32> loc(#loc304)
+      %tmp102_108 = arith.truncf %tmp102_107 : tensor<1x64xf32> to tensor<1x64xbf16> loc(#loc304)
+      %tmp102_109 = tt.load %tmp102_105, %r0_mask_24, %tmp102_108 evictionPolicy = evict_last : tensor<1x64x!tt.ptr<bf16>> loc(#loc304)
+      %tmp102_110 = arith.extf %tmp102_109 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc305)
+      %tmp13 = arith.constant 0 : i64 loc(#loc306)
+      %tmp13_111 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc306)
+      %tmp14 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc307)
+      %tmp14_112 = arith.constant dense<0> : tensor<1x64xi64> loc(#loc307)
+      %tmp14_113 = arith.cmpi sge, %tmp14, %tmp14_112 : tensor<1x64xi64> loc(#loc307)
+      %tmp15 = arith.constant 1 : i64 loc(#loc308)
+      %tmp15_114 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc308)
+      %tmp16 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc309)
+      %tmp16_115 = arith.constant dense<1> : tensor<1x64xi64> loc(#loc309)
+      %tmp16_116 = arith.cmpi slt, %tmp16, %tmp16_115 : tensor<1x64xi64> loc(#loc309)
+      %tmp17 = arith.constant 2 : i32 loc(#loc310)
+      %tmp17_117 = arith.constant 2 : i32 loc(#loc310)
+      %tmp17_118 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc310)
+      %tmp17_119 = arith.muli %tmp17_118, %r0_4_30 : tensor<1x64xi32> loc(#loc310)
+      %tmp17_120 = arith.constant 1 : i32 loc(#loc311)
+      %tmp17_121 = arith.constant 1 : i32 loc(#loc311)
+      %tmp17_122 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc311)
+      %tmp17_123 = arith.addi %tmp17_122, %tmp17_119 : tensor<1x64xi32> loc(#loc311)
+      %tmp17_124 = arith.constant 128 : i32 loc(#loc312)
+      %tmp17_125 = arith.constant 128 : i32 loc(#loc312)
+      %tmp17_126 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc312)
+      %tmp17_127 = arith.muli %tmp17_126, %x0_12 : tensor<4x1xi32> loc(#loc312)
+      %tmp17_128 = tt.broadcast %tmp17_123 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc313)
+      %tmp17_129 = tt.broadcast %tmp17_127 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc313)
+      %tmp17_130 = arith.addi %tmp17_128, %tmp17_129 : tensor<4x64xi32> loc(#loc313)
+      %tmp17_131 = arith.constant 36864 : i32 loc(#loc314)
+      %tmp17_132 = arith.constant 36864 : i32 loc(#loc314)
+      %tmp17_133 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc314)
+      %tmp17_134 = arith.muli %tmp17_133, %x1_15 : tensor<4x1xi32> loc(#loc314)
+      %tmp17_135 = tt.broadcast %tmp17_134 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc315)
+      %tmp17_136 = arith.addi %tmp17_130, %tmp17_135 : tensor<4x64xi32> loc(#loc315)
+      %tmp17_137 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x64x!tt.ptr<bf16>> loc(#loc316)
+      %tmp17_138 = tt.addptr %tmp17_137, %tmp17_136 : tensor<4x64x!tt.ptr<bf16>>, tensor<4x64xi32> loc(#loc316)
+      %tmp17_139 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc317)
+      %tmp17_140 = arith.constant 0.000000e+00 : f32 loc(#loc318)
+      %tmp17_141 = tt.broadcast %tmp17_139 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc318)
+      %tmp17_142 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc318)
+      %tmp17_143 = arith.truncf %tmp17_142 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc318)
+      %tmp17_144 = tt.load %tmp17_138, %tmp17_141, %tmp17_143 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>> loc(#loc318)
+      %tmp17_145 = arith.extf %tmp17_144 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc319)
+      %tmp19 = arith.constant 1.280000e+02 : f32 loc(#loc320)
+      %tmp20 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc321)
+      %tmp20_146 = arith.divf %tmp10_20, %tmp20 : tensor<4x1xf32> loc(#loc321)
+      %tmp21 = arith.constant 9.99999997E-7 : f32 loc(#loc322)
+      %tmp22 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc323)
+      %tmp22_147 = arith.addf %tmp20_146, %tmp22 : tensor<4x1xf32> loc(#loc323)
+      %tmp23 = tt.extern_elementwise %tmp22_147 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc324)
+      %tmp24 = tt.broadcast %tmp23 : tensor<4x1xf32> -> tensor<4x64xf32> loc(#loc325)
+      %tmp24_148 = arith.mulf %tmp17_145, %tmp24 : tensor<4x64xf32> loc(#loc325)
+      %tmp25 = arith.constant 2 : i32 loc(#loc326)
+      %tmp25_149 = arith.constant 2 : i32 loc(#loc326)
+      %tmp25_150 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc326)
+      %tmp25_151 = arith.muli %tmp25_150, %r0_4_30 : tensor<1x64xi32> loc(#loc326)
+      %tmp25_152 = arith.constant 1 : i32 loc(#loc327)
+      %tmp25_153 = arith.constant 1 : i32 loc(#loc327)
+      %tmp25_154 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc327)
+      %tmp25_155 = arith.addi %tmp25_154, %tmp25_151 : tensor<1x64xi32> loc(#loc327)
+      %tmp25_156 = tt.broadcast %tmp25_155 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc328)
+      %tmp25_157 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<4x64x!tt.ptr<bf16>> loc(#loc329)
+      %tmp25_158 = tt.addptr %tmp25_157, %tmp25_156 : tensor<4x64x!tt.ptr<bf16>>, tensor<4x64xi32> loc(#loc329)
+      %tmp25_159 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc330)
+      %tmp25_160 = arith.constant 0.000000e+00 : f32 loc(#loc331)
+      %tmp25_161 = tt.broadcast %tmp25_159 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc331)
+      %tmp25_162 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc331)
+      %tmp25_163 = arith.truncf %tmp25_162 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc331)
+      %tmp25_164 = tt.load %tmp25_158, %tmp25_161, %tmp25_163 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>> loc(#loc331)
+      %tmp25_165 = arith.extf %tmp25_164 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc332)
+      %tmp27 = arith.mulf %tmp24_148, %tmp25_165 : tensor<4x64xf32> loc(#loc333)
+      %tmp29 = arith.constant 0.000000e+00 : f32 loc(#loc334)
+      %tmp29_166 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc334)
+      %tmp29_167 = arith.subf %tmp29_166, %tmp27 : tensor<4x64xf32> loc(#loc334)
+      %tmp30 = arith.constant 0.000000e+00 : f32 loc(#loc335)
+      %tmp30_168 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc335)
+      %tmp31 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc336)
+      %tmp31_169 = arith.select %tmp31, %tmp29_167, %tmp30_168 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc336)
+      %tmp32 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc337)
+      %tmp32_170 = arith.constant dense<1> : tensor<1x64xi64> loc(#loc337)
+      %tmp32_171 = arith.cmpi sge, %tmp32, %tmp32_170 : tensor<1x64xi64> loc(#loc337)
+      %tmp33 = arith.constant 2 : i64 loc(#loc338)
+      %tmp33_172 = arith.constant dense<2> : tensor<1x1xi64> loc(#loc338)
+      %tmp34 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc339)
+      %tmp34_173 = arith.constant dense<2> : tensor<1x64xi64> loc(#loc339)
+      %tmp34_174 = arith.cmpi slt, %tmp34, %tmp34_173 : tensor<1x64xi64> loc(#loc339)
+      %tmp35 = arith.constant 2 : i32 loc(#loc340)
+      %tmp35_175 = arith.constant 2 : i32 loc(#loc340)
+      %tmp35_176 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc340)
+      %tmp35_177 = arith.muli %tmp35_176, %r0_4_30 : tensor<1x64xi32> loc(#loc340)
+      %tmp35_178 = arith.constant 128 : i32 loc(#loc341)
+      %tmp35_179 = arith.constant 128 : i32 loc(#loc341)
+      %tmp35_180 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc341)
+      %tmp35_181 = arith.muli %tmp35_180, %x0_12 : tensor<4x1xi32> loc(#loc341)
+      %tmp35_182 = tt.broadcast %tmp35_177 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc342)
+      %tmp35_183 = tt.broadcast %tmp35_181 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc342)
+      %tmp35_184 = arith.addi %tmp35_182, %tmp35_183 : tensor<4x64xi32> loc(#loc342)
+      %tmp35_185 = arith.constant 36864 : i32 loc(#loc343)
+      %tmp35_186 = arith.constant 36864 : i32 loc(#loc343)
+      %tmp35_187 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc343)
+      %tmp35_188 = arith.muli %tmp35_187, %x1_15 : tensor<4x1xi32> loc(#loc343)
+      %tmp35_189 = tt.broadcast %tmp35_188 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc344)
+      %tmp35_190 = arith.addi %tmp35_184, %tmp35_189 : tensor<4x64xi32> loc(#loc344)
+      %tmp35_191 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x64x!tt.ptr<bf16>> loc(#loc345)
+      %tmp35_192 = tt.addptr %tmp35_191, %tmp35_190 : tensor<4x64x!tt.ptr<bf16>>, tensor<4x64xi32> loc(#loc345)
+      %tmp35_193 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc346)
+      %tmp35_194 = arith.constant 0.000000e+00 : f32 loc(#loc347)
+      %tmp35_195 = tt.broadcast %tmp35_193 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc347)
+      %tmp35_196 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc347)
+      %tmp35_197 = arith.truncf %tmp35_196 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc347)
+      %tmp35_198 = tt.load %tmp35_192, %tmp35_195, %tmp35_197 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>> loc(#loc347)
+      %tmp35_199 = arith.extf %tmp35_198 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc348)
+      %tmp37 = arith.constant 1.280000e+02 : f32 loc(#loc349)
+      %tmp38 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc350)
+      %tmp38_200 = arith.divf %tmp10_20, %tmp38 : tensor<4x1xf32> loc(#loc350)
+      %tmp39 = arith.constant 9.99999997E-7 : f32 loc(#loc351)
+      %tmp40 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc352)
+      %tmp40_201 = arith.addf %tmp38_200, %tmp40 : tensor<4x1xf32> loc(#loc352)
+      %tmp41 = tt.extern_elementwise %tmp40_201 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc353)
+      %tmp42 = tt.broadcast %tmp41 : tensor<4x1xf32> -> tensor<4x64xf32> loc(#loc354)
+      %tmp42_202 = arith.mulf %tmp35_199, %tmp42 : tensor<4x64xf32> loc(#loc354)
+      %tmp43 = arith.constant 2 : i32 loc(#loc355)
+      %tmp43_203 = arith.constant 2 : i32 loc(#loc355)
+      %tmp43_204 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc355)
+      %tmp43_205 = arith.muli %tmp43_204, %r0_4_30 : tensor<1x64xi32> loc(#loc355)
+      %tmp43_206 = tt.broadcast %tmp43_205 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc356)
+      %tmp43_207 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<4x64x!tt.ptr<bf16>> loc(#loc357)
+      %tmp43_208 = tt.addptr %tmp43_207, %tmp43_206 : tensor<4x64x!tt.ptr<bf16>>, tensor<4x64xi32> loc(#loc357)
+      %tmp43_209 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc358)
+      %tmp43_210 = arith.constant 0.000000e+00 : f32 loc(#loc359)
+      %tmp43_211 = tt.broadcast %tmp43_209 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc359)
+      %tmp43_212 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc359)
+      %tmp43_213 = arith.truncf %tmp43_212 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc359)
+      %tmp43_214 = tt.load %tmp43_208, %tmp43_211, %tmp43_213 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>> loc(#loc359)
+      %tmp43_215 = arith.extf %tmp43_214 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc360)
+      %tmp45 = arith.mulf %tmp42_202, %tmp43_215 : tensor<4x64xf32> loc(#loc361)
+      %tmp47 = arith.constant 0.000000e+00 : f32 loc(#loc362)
+      %tmp47_216 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc362)
+      %tmp48 = tt.broadcast %tmp32_171 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc363)
+      %tmp48_217 = arith.select %tmp48, %tmp45, %tmp47_216 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc363)
+      %tmp49 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc364)
+      %tmp49_218 = arith.select %tmp49, %tmp31_169, %tmp48_217 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc364)
+      %tmp52 = arith.constant 1.280000e+02 : f32 loc(#loc365)
+      %tmp53 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc366)
+      %tmp53_219 = arith.divf %tmp10_20, %tmp53 : tensor<4x1xf32> loc(#loc366)
+      %tmp54 = arith.constant 9.99999997E-7 : f32 loc(#loc367)
+      %tmp55 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc368)
+      %tmp55_220 = arith.addf %tmp53_219, %tmp55 : tensor<4x1xf32> loc(#loc368)
+      %tmp56 = tt.extern_elementwise %tmp55_220 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc369)
+      %tmp57 = tt.broadcast %tmp56 : tensor<4x1xf32> -> tensor<4x64xf32> loc(#loc370)
+      %tmp57_221 = arith.mulf %tmp50_50, %tmp57 : tensor<4x64xf32> loc(#loc370)
+      %tmp60 = tt.broadcast %tmp58_56 : tensor<1x64xf32> -> tensor<4x64xf32> loc(#loc371)
+      %tmp60_222 = arith.mulf %tmp57_221, %tmp60 : tensor<4x64xf32> loc(#loc371)
+      %tmp64 = arith.mulf %tmp60_222, %tmp63_68 : tensor<4x64xf32> loc(#loc372)
+      %tmp67 = arith.mulf %tmp49_218, %tmp66_80 : tensor<4x64xf32> loc(#loc373)
+      %tmp68 = arith.addf %tmp64, %tmp67 : tensor<4x64xf32> loc(#loc374)
+      %tmp70 = arith.constant 2 : i32 loc(#loc375)
+      %tmp70_223 = arith.constant 2 : i32 loc(#loc375)
+      %tmp70_224 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc375)
+      %tmp70_225 = arith.muli %tmp70_224, %r0_4_30 : tensor<1x64xi32> loc(#loc375)
+      %tmp70_226 = arith.constant 4097 : i32 loc(#loc376)
+      %tmp70_227 = arith.constant 4097 : i32 loc(#loc376)
+      %tmp70_228 = arith.constant dense<4097> : tensor<1x64xi32> loc(#loc376)
+      %tmp70_229 = arith.addi %tmp70_228, %tmp70_225 : tensor<1x64xi32> loc(#loc376)
+      %tmp70_230 = arith.constant 128 : i32 loc(#loc377)
+      %tmp70_231 = arith.constant 128 : i32 loc(#loc377)
+      %tmp70_232 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc377)
+      %tmp70_233 = arith.muli %tmp70_232, %x0_12 : tensor<4x1xi32> loc(#loc377)
+      %tmp70_234 = tt.broadcast %tmp70_229 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc378)
+      %tmp70_235 = tt.broadcast %tmp70_233 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc378)
+      %tmp70_236 = arith.addi %tmp70_234, %tmp70_235 : tensor<4x64xi32> loc(#loc378)
+      %tmp70_237 = arith.constant 36864 : i32 loc(#loc379)
+      %tmp70_238 = arith.constant 36864 : i32 loc(#loc379)
+      %tmp70_239 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc379)
+      %tmp70_240 = arith.muli %tmp70_239, %x1_15 : tensor<4x1xi32> loc(#loc379)
+      %tmp70_241 = tt.broadcast %tmp70_240 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc380)
+      %tmp70_242 = arith.addi %tmp70_236, %tmp70_241 : tensor<4x64xi32> loc(#loc380)
+      %tmp70_243 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x64x!tt.ptr<bf16>> loc(#loc381)
+      %tmp70_244 = tt.addptr %tmp70_243, %tmp70_242 : tensor<4x64x!tt.ptr<bf16>>, tensor<4x64xi32> loc(#loc381)
+      %tmp70_245 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc382)
+      %tmp70_246 = arith.constant 0.000000e+00 : f32 loc(#loc383)
+      %tmp70_247 = tt.broadcast %tmp70_245 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc383)
+      %tmp70_248 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc383)
+      %tmp70_249 = arith.truncf %tmp70_248 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc383)
+      %tmp70_250 = tt.load %tmp70_244, %tmp70_247, %tmp70_249 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>> loc(#loc383)
+      %tmp70_251 = arith.extf %tmp70_250 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc384)
+      %tmp72 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc385)
+      %tmp72_252 = arith.divf %tmp4_19, %tmp72 : tensor<4x1xf32> loc(#loc385)
+      %tmp73 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc386)
+      %tmp73_253 = arith.addf %tmp72_252, %tmp73 : tensor<4x1xf32> loc(#loc386)
+      %tmp74 = tt.extern_elementwise %tmp73_253 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc387)
+      %tmp75 = tt.broadcast %tmp74 : tensor<4x1xf32> -> tensor<4x64xf32> loc(#loc388)
+      %tmp75_254 = arith.mulf %tmp70_251, %tmp75 : tensor<4x64xf32> loc(#loc388)
+      %tmp76 = arith.constant 2 : i32 loc(#loc389)
+      %tmp76_255 = arith.constant 2 : i32 loc(#loc389)
+      %tmp76_256 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc389)
+      %tmp76_257 = arith.muli %tmp76_256, %r0_4_30 : tensor<1x64xi32> loc(#loc389)
+      %tmp76_258 = arith.constant 1 : i32 loc(#loc390)
+      %tmp76_259 = arith.constant 1 : i32 loc(#loc390)
+      %tmp76_260 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc390)
+      %tmp76_261 = arith.addi %tmp76_260, %tmp76_257 : tensor<1x64xi32> loc(#loc390)
+      %tmp76_262 = tt.broadcast %tmp76_261 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc391)
+      %tmp76_263 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<4x64x!tt.ptr<bf16>> loc(#loc392)
+      %tmp76_264 = tt.addptr %tmp76_263, %tmp76_262 : tensor<4x64x!tt.ptr<bf16>>, tensor<4x64xi32> loc(#loc392)
+      %tmp76_265 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc393)
+      %tmp76_266 = arith.constant 0.000000e+00 : f32 loc(#loc394)
+      %tmp76_267 = tt.broadcast %tmp76_265 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc394)
+      %tmp76_268 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc394)
+      %tmp76_269 = arith.truncf %tmp76_268 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc394)
+      %tmp76_270 = tt.load %tmp76_264, %tmp76_267, %tmp76_269 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>> loc(#loc394)
+      %tmp76_271 = arith.extf %tmp76_270 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc395)
+      %tmp78 = arith.mulf %tmp75_254, %tmp76_271 : tensor<4x64xf32> loc(#loc396)
+      %tmp80 = arith.constant 0.000000e+00 : f32 loc(#loc397)
+      %tmp80_272 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc397)
+      %tmp80_273 = arith.subf %tmp80_272, %tmp78 : tensor<4x64xf32> loc(#loc397)
+      %tmp81 = arith.constant 0.000000e+00 : f32 loc(#loc398)
+      %tmp81_274 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc398)
+      %tmp82 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc399)
+      %tmp82_275 = arith.select %tmp82, %tmp80_273, %tmp81_274 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc399)
+      %tmp83 = arith.constant 2 : i32 loc(#loc400)
+      %tmp83_276 = arith.constant 2 : i32 loc(#loc400)
+      %tmp83_277 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc400)
+      %tmp83_278 = arith.muli %tmp83_277, %r0_4_30 : tensor<1x64xi32> loc(#loc400)
+      %tmp83_279 = arith.constant 4096 : i32 loc(#loc401)
+      %tmp83_280 = arith.constant 4096 : i32 loc(#loc401)
+      %tmp83_281 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc401)
+      %tmp83_282 = arith.addi %tmp83_281, %tmp83_278 : tensor<1x64xi32> loc(#loc401)
+      %tmp83_283 = arith.constant 128 : i32 loc(#loc402)
+      %tmp83_284 = arith.constant 128 : i32 loc(#loc402)
+      %tmp83_285 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc402)
+      %tmp83_286 = arith.muli %tmp83_285, %x0_12 : tensor<4x1xi32> loc(#loc402)
+      %tmp83_287 = tt.broadcast %tmp83_282 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc403)
+      %tmp83_288 = tt.broadcast %tmp83_286 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc403)
+      %tmp83_289 = arith.addi %tmp83_287, %tmp83_288 : tensor<4x64xi32> loc(#loc403)
+      %tmp83_290 = arith.constant 36864 : i32 loc(#loc404)
+      %tmp83_291 = arith.constant 36864 : i32 loc(#loc404)
+      %tmp83_292 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc404)
+      %tmp83_293 = arith.muli %tmp83_292, %x1_15 : tensor<4x1xi32> loc(#loc404)
+      %tmp83_294 = tt.broadcast %tmp83_293 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc405)
+      %tmp83_295 = arith.addi %tmp83_289, %tmp83_294 : tensor<4x64xi32> loc(#loc405)
+      %tmp83_296 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x64x!tt.ptr<bf16>> loc(#loc406)
+      %tmp83_297 = tt.addptr %tmp83_296, %tmp83_295 : tensor<4x64x!tt.ptr<bf16>>, tensor<4x64xi32> loc(#loc406)
+      %tmp83_298 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc407)
+      %tmp83_299 = arith.constant 0.000000e+00 : f32 loc(#loc408)
+      %tmp83_300 = tt.broadcast %tmp83_298 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc408)
+      %tmp83_301 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc408)
+      %tmp83_302 = arith.truncf %tmp83_301 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc408)
+      %tmp83_303 = tt.load %tmp83_297, %tmp83_300, %tmp83_302 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>> loc(#loc408)
+      %tmp83_304 = arith.extf %tmp83_303 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc409)
+      %tmp85 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc410)
+      %tmp85_305 = arith.divf %tmp4_19, %tmp85 : tensor<4x1xf32> loc(#loc410)
+      %tmp86 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc411)
+      %tmp86_306 = arith.addf %tmp85_305, %tmp86 : tensor<4x1xf32> loc(#loc411)
+      %tmp87 = tt.extern_elementwise %tmp86_306 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc412)
+      %tmp88 = tt.broadcast %tmp87 : tensor<4x1xf32> -> tensor<4x64xf32> loc(#loc413)
+      %tmp88_307 = arith.mulf %tmp83_304, %tmp88 : tensor<4x64xf32> loc(#loc413)
+      %tmp89 = arith.constant 2 : i32 loc(#loc414)
+      %tmp89_308 = arith.constant 2 : i32 loc(#loc414)
+      %tmp89_309 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc414)
+      %tmp89_310 = arith.muli %tmp89_309, %r0_4_30 : tensor<1x64xi32> loc(#loc414)
+      %tmp89_311 = tt.broadcast %tmp89_310 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc415)
+      %tmp89_312 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<4x64x!tt.ptr<bf16>> loc(#loc416)
+      %tmp89_313 = tt.addptr %tmp89_312, %tmp89_311 : tensor<4x64x!tt.ptr<bf16>>, tensor<4x64xi32> loc(#loc416)
+      %tmp89_314 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc417)
+      %tmp89_315 = arith.constant 0.000000e+00 : f32 loc(#loc418)
+      %tmp89_316 = tt.broadcast %tmp89_314 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc418)
+      %tmp89_317 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc418)
+      %tmp89_318 = arith.truncf %tmp89_317 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc418)
+      %tmp89_319 = tt.load %tmp89_313, %tmp89_316, %tmp89_318 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>> loc(#loc418)
+      %tmp89_320 = arith.extf %tmp89_319 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc419)
+      %tmp91 = arith.mulf %tmp88_307, %tmp89_320 : tensor<4x64xf32> loc(#loc420)
+      %tmp93 = arith.constant 0.000000e+00 : f32 loc(#loc421)
+      %tmp93_321 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc421)
+      %tmp94 = tt.broadcast %tmp32_171 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc422)
+      %tmp94_322 = arith.select %tmp94, %tmp91, %tmp93_321 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc422)
+      %tmp95 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc423)
+      %tmp95_323 = arith.select %tmp95, %tmp82_275, %tmp94_322 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc423)
+      %tmp98 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc424)
+      %tmp98_324 = arith.divf %tmp4_19, %tmp98 : tensor<4x1xf32> loc(#loc424)
+      %tmp99 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc425)
+      %tmp99_325 = arith.addf %tmp98_324, %tmp99 : tensor<4x1xf32> loc(#loc425)
+      %tmp100 = tt.extern_elementwise %tmp99_325 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc426)
+      %tmp101 = tt.broadcast %tmp100 : tensor<4x1xf32> -> tensor<4x64xf32> loc(#loc427)
+      %tmp101_326 = arith.mulf %tmp96_104, %tmp101 : tensor<4x64xf32> loc(#loc427)
+      %tmp104 = tt.broadcast %tmp102_110 : tensor<1x64xf32> -> tensor<4x64xf32> loc(#loc428)
+      %tmp104_327 = arith.mulf %tmp101_326, %tmp104 : tensor<4x64xf32> loc(#loc428)
+      %tmp107 = arith.mulf %tmp104_327, %tmp63_68 : tensor<4x64xf32> loc(#loc429)
+      %tmp109 = arith.mulf %tmp95_323, %tmp66_80 : tensor<4x64xf32> loc(#loc430)
+      %tmp110 = arith.addf %tmp107, %tmp109 : tensor<4x64xf32> loc(#loc431)
+      %c128_i32 = arith.constant 128 : i32 loc(#loc204)
+      %c128_i32_328 = arith.constant 128 : i32 loc(#loc204)
+      %cst = arith.constant dense<128> : tensor<4x1xi32> loc(#loc204)
+      %8 = arith.muli %cst, %xindex_7 : tensor<4x1xi32> loc(#loc204)
+      %9 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc205)
+      %10 = tt.broadcast %8 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc205)
+      %11 = arith.addi %9, %10 : tensor<4x64xi32> loc(#loc205)
+      %12 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<4x64x!tt.ptr<bf16>> loc(#loc206)
+      %13 = tt.addptr %12, %11 : tensor<4x64x!tt.ptr<bf16>>, tensor<4x64xi32> loc(#loc206)
+      %14 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc207)
+      %15 = arith.truncf %tmp68 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc207)
+      tt.store %13, %15, %14 : tensor<4x64x!tt.ptr<bf16>> loc(#loc207)
+      %c128_i32_329 = arith.constant 128 : i32 loc(#loc208)
+      %c128_i32_330 = arith.constant 128 : i32 loc(#loc208)
+      %cst_331 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc208)
+      %16 = arith.muli %cst_331, %xindex_7 : tensor<4x1xi32> loc(#loc208)
+      %17 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc209)
+      %18 = tt.broadcast %16 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc209)
+      %19 = arith.addi %17, %18 : tensor<4x64xi32> loc(#loc209)
+      %20 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<4x64x!tt.ptr<bf16>> loc(#loc210)
+      %21 = tt.addptr %20, %19 : tensor<4x64x!tt.ptr<bf16>>, tensor<4x64xi32> loc(#loc210)
+      %22 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc211)
+      %23 = arith.truncf %tmp110 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc211)
+      tt.store %21, %23, %22 : tensor<4x64x!tt.ptr<bf16>> loc(#loc211)
+    } loc(#loc44)
+    tt.return loc(#loc212)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.sum__fp32S4_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<4x64xf32> loc("input"(#loc213))) -> tensor<4xf32> attributes {noinline = false} {
+    %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc214)
+      tt.reduce.return %2 : f32 loc(#loc214)
+    }) : (tensor<4x64xf32>) -> tensor<4xf32> loc(#loc214)
+    tt.return %0 : tensor<4xf32> loc(#loc216)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<4xf32> loc(#loc217)
+    tt.return %1 : tensor<4xf32> loc(#loc217)
+  } loc(#loc213)
+  tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc218)), %b: f32 loc("b"(#loc218))) -> f32 attributes {noinline = false} {
+    %0 = arith.addf %a, %b : f32 loc(#loc219)
+    tt.return %0 : f32 loc(#loc220)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : f32 loc(#loc221)
+    tt.return %1 : f32 loc(#loc221)
+  } loc(#loc218)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":25:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":30:43)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":32:44)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:45)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:56)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:46)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:42)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:53)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:64)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":68:35)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":69:25)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":70:35)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:52)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:63)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":74:16)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":76:16)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:57)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:55)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:63)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:95)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":85:42)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":88:35)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":89:24)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:37)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:48)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:59)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":92:16)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":93:25)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":94:16)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":95:24)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":96:32)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:53)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:59)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:91)
+#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":102:42)
+#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":106:16)
+#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":107:25)
+#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":108:16)
+#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":109:24)
+#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":110:32)
+#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:44)
+#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc149 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:55)
+#loc150 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc151 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:66)
+#loc152 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc153 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc154 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:81)
+#loc155 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc156 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc157 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc158 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc159 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc160 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc161 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:57)
+#loc162 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:55)
+#loc163 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:63)
+#loc164 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc165 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:95)
+#loc166 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc167 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc168 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc169 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc170 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":132:42)
+#loc171 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc172 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:44)
+#loc173 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc174 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:55)
+#loc175 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc176 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:66)
+#loc177 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc178 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc179 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:81)
+#loc180 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc181 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc182 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":136:24)
+#loc183 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":137:24)
+#loc184 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":138:32)
+#loc185 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc186 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:53)
+#loc187 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:59)
+#loc188 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc189 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:91)
+#loc190 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc191 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc192 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc193 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":144:42)
+#loc194 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc195 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc196 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":148:24)
+#loc197 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":149:24)
+#loc198 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":150:33)
+#loc199 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc200 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc201 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc202 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc203 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc204 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc205 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc206 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc207 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc208 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:43)
+#loc209 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:39)
+#loc210 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc211 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc212 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc214 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc216 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11)
+#loc217 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4)
+#loc219 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc220 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11)
+#loc221 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4)
+#loc231 = loc("xnumel"(#loc1))
+#loc232 = loc("r0_numel"(#loc2))
+#loc233 = loc("xoffset"(#loc3))
+#loc234 = loc("xoffset"(#loc4))
+#loc235 = loc("xindex"(#loc5))
+#loc236 = loc("xindex"(#loc6))
+#loc237 = loc("xindex"(#loc7))
+#loc238 = loc("xmask"(#loc8))
+#loc239 = loc("r0_base"(#loc9))
+#loc240 = loc("r0_base"(#loc10))
+#loc241 = loc("x0"(#loc11))
+#loc242 = loc("x1"(#loc12))
+#loc243 = loc("_tmp4"(#loc13))
+#loc244 = loc("_tmp10"(#loc14))
+#loc245 = loc("_tmp4"(#loc15))
+#loc246 = loc("r0_index"(#loc16))
+#loc247 = loc("r0_mask"(#loc17))
+#loc248 = loc("tmp0"(#loc18))
+#loc249 = loc("tmp0"(#loc19))
+#loc250 = loc("tmp0"(#loc20))
+#loc251 = loc("tmp0"(#loc21))
+#loc252 = loc("tmp0"(#loc22))
+#loc253 = loc("tmp0"(#loc23))
+#loc254 = loc("tmp0"(#loc24))
+#loc255 = loc("tmp0"(#loc25))
+#loc256 = loc("tmp6"(#loc26))
+#loc257 = loc("tmp6"(#loc27))
+#loc258 = loc("tmp6"(#loc28))
+#loc259 = loc("tmp6"(#loc29))
+#loc260 = loc("tmp6"(#loc30))
+#loc261 = loc("tmp6"(#loc31))
+#loc262 = loc("tmp6"(#loc32))
+#loc263 = loc("tmp2"(#loc33))
+#loc264 = loc("tmp5"(#loc34))
+#loc265 = loc("_tmp4"(#loc35))
+#loc266 = loc("tmp8"(#loc36))
+#loc267 = loc("tmp11"(#loc37))
+#loc268 = loc("_tmp10"(#loc38))
+#loc269 = loc("tmp4"(#loc40))
+#loc270 = loc("tmp4"(#loc41))
+#loc271 = loc("tmp10"(#loc42))
+#loc272 = loc("tmp10"(#loc43))
+#loc273 = loc("r0_index"(#loc45))
+#loc274 = loc("r0_mask"(#loc46))
+#loc275 = loc("r0_3"(#loc47))
+#loc276 = loc("r0_4"(#loc48))
+#loc277 = loc("tmp50"(#loc49))
+#loc278 = loc("tmp50"(#loc50))
+#loc279 = loc("tmp50"(#loc51))
+#loc280 = loc("tmp50"(#loc52))
+#loc281 = loc("tmp50"(#loc53))
+#loc282 = loc("tmp50"(#loc54))
+#loc283 = loc("tmp50"(#loc55))
+#loc284 = loc("tmp58"(#loc56))
+#loc285 = loc("tmp58"(#loc57))
+#loc286 = loc("tmp58"(#loc58))
+#loc287 = loc("tmp63"(#loc59))
+#loc288 = loc("tmp63"(#loc60))
+#loc289 = loc("tmp63"(#loc61))
+#loc290 = loc("tmp63"(#loc62))
+#loc291 = loc("tmp66"(#loc63))
+#loc292 = loc("tmp66"(#loc64))
+#loc293 = loc("tmp66"(#loc65))
+#loc294 = loc("tmp66"(#loc66))
+#loc295 = loc("tmp96"(#loc67))
+#loc296 = loc("tmp96"(#loc68))
+#loc297 = loc("tmp96"(#loc69))
+#loc298 = loc("tmp96"(#loc70))
+#loc299 = loc("tmp96"(#loc71))
+#loc300 = loc("tmp96"(#loc72))
+#loc301 = loc("tmp96"(#loc73))
+#loc302 = loc("tmp96"(#loc74))
+#loc303 = loc("tmp102"(#loc75))
+#loc304 = loc("tmp102"(#loc76))
+#loc305 = loc("tmp102"(#loc77))
+#loc306 = loc("tmp13"(#loc78))
+#loc307 = loc("tmp14"(#loc79))
+#loc308 = loc("tmp15"(#loc80))
+#loc309 = loc("tmp16"(#loc81))
+#loc310 = loc("tmp17"(#loc82))
+#loc311 = loc("tmp17"(#loc83))
+#loc312 = loc("tmp17"(#loc84))
+#loc313 = loc("tmp17"(#loc85))
+#loc314 = loc("tmp17"(#loc86))
+#loc315 = loc("tmp17"(#loc87))
+#loc316 = loc("tmp17"(#loc88))
+#loc317 = loc("tmp17"(#loc89))
+#loc318 = loc("tmp17"(#loc90))
+#loc319 = loc("tmp17"(#loc91))
+#loc320 = loc("tmp19"(#loc92))
+#loc321 = loc("tmp20"(#loc93))
+#loc322 = loc("tmp21"(#loc94))
+#loc323 = loc("tmp22"(#loc95))
+#loc324 = loc("tmp23"(#loc96))
+#loc325 = loc("tmp24"(#loc97))
+#loc326 = loc("tmp25"(#loc98))
+#loc327 = loc("tmp25"(#loc99))
+#loc328 = loc("tmp25"(#loc100))
+#loc329 = loc("tmp25"(#loc101))
+#loc330 = loc("tmp25"(#loc102))
+#loc331 = loc("tmp25"(#loc103))
+#loc332 = loc("tmp25"(#loc104))
+#loc333 = loc("tmp27"(#loc105))
+#loc334 = loc("tmp29"(#loc106))
+#loc335 = loc("tmp30"(#loc107))
+#loc336 = loc("tmp31"(#loc108))
+#loc337 = loc("tmp32"(#loc109))
+#loc338 = loc("tmp33"(#loc110))
+#loc339 = loc("tmp34"(#loc111))
+#loc340 = loc("tmp35"(#loc112))
+#loc341 = loc("tmp35"(#loc113))
+#loc342 = loc("tmp35"(#loc114))
+#loc343 = loc("tmp35"(#loc115))
+#loc344 = loc("tmp35"(#loc116))
+#loc345 = loc("tmp35"(#loc117))
+#loc346 = loc("tmp35"(#loc118))
+#loc347 = loc("tmp35"(#loc119))
+#loc348 = loc("tmp35"(#loc120))
+#loc349 = loc("tmp37"(#loc121))
+#loc350 = loc("tmp38"(#loc122))
+#loc351 = loc("tmp39"(#loc123))
+#loc352 = loc("tmp40"(#loc124))
+#loc353 = loc("tmp41"(#loc125))
+#loc354 = loc("tmp42"(#loc126))
+#loc355 = loc("tmp43"(#loc127))
+#loc356 = loc("tmp43"(#loc128))
+#loc357 = loc("tmp43"(#loc129))
+#loc358 = loc("tmp43"(#loc130))
+#loc359 = loc("tmp43"(#loc131))
+#loc360 = loc("tmp43"(#loc132))
+#loc361 = loc("tmp45"(#loc133))
+#loc362 = loc("tmp47"(#loc134))
+#loc363 = loc("tmp48"(#loc135))
+#loc364 = loc("tmp49"(#loc136))
+#loc365 = loc("tmp52"(#loc137))
+#loc366 = loc("tmp53"(#loc138))
+#loc367 = loc("tmp54"(#loc139))
+#loc368 = loc("tmp55"(#loc140))
+#loc369 = loc("tmp56"(#loc141))
+#loc370 = loc("tmp57"(#loc142))
+#loc371 = loc("tmp60"(#loc143))
+#loc372 = loc("tmp64"(#loc144))
+#loc373 = loc("tmp67"(#loc145))
+#loc374 = loc("tmp68"(#loc146))
+#loc375 = loc("tmp70"(#loc147))
+#loc376 = loc("tmp70"(#loc148))
+#loc377 = loc("tmp70"(#loc149))
+#loc378 = loc("tmp70"(#loc150))
+#loc379 = loc("tmp70"(#loc151))
+#loc380 = loc("tmp70"(#loc152))
+#loc381 = loc("tmp70"(#loc153))
+#loc382 = loc("tmp70"(#loc154))
+#loc383 = loc("tmp70"(#loc155))
+#loc384 = loc("tmp70"(#loc156))
+#loc385 = loc("tmp72"(#loc157))
+#loc386 = loc("tmp73"(#loc158))
+#loc387 = loc("tmp74"(#loc159))
+#loc388 = loc("tmp75"(#loc160))
+#loc389 = loc("tmp76"(#loc161))
+#loc390 = loc("tmp76"(#loc162))
+#loc391 = loc("tmp76"(#loc163))
+#loc392 = loc("tmp76"(#loc164))
+#loc393 = loc("tmp76"(#loc165))
+#loc394 = loc("tmp76"(#loc166))
+#loc395 = loc("tmp76"(#loc167))
+#loc396 = loc("tmp78"(#loc168))
+#loc397 = loc("tmp80"(#loc169))
+#loc398 = loc("tmp81"(#loc170))
+#loc399 = loc("tmp82"(#loc171))
+#loc400 = loc("tmp83"(#loc172))
+#loc401 = loc("tmp83"(#loc173))
+#loc402 = loc("tmp83"(#loc174))
+#loc403 = loc("tmp83"(#loc175))
+#loc404 = loc("tmp83"(#loc176))
+#loc405 = loc("tmp83"(#loc177))
+#loc406 = loc("tmp83"(#loc178))
+#loc407 = loc("tmp83"(#loc179))
+#loc408 = loc("tmp83"(#loc180))
+#loc409 = loc("tmp83"(#loc181))
+#loc410 = loc("tmp85"(#loc182))
+#loc411 = loc("tmp86"(#loc183))
+#loc412 = loc("tmp87"(#loc184))
+#loc413 = loc("tmp88"(#loc185))
+#loc414 = loc("tmp89"(#loc186))
+#loc415 = loc("tmp89"(#loc187))
+#loc416 = loc("tmp89"(#loc188))
+#loc417 = loc("tmp89"(#loc189))
+#loc418 = loc("tmp89"(#loc190))
+#loc419 = loc("tmp89"(#loc191))
+#loc420 = loc("tmp91"(#loc192))
+#loc421 = loc("tmp93"(#loc193))
+#loc422 = loc("tmp94"(#loc194))
+#loc423 = loc("tmp95"(#loc195))
+#loc424 = loc("tmp98"(#loc196))
+#loc425 = loc("tmp99"(#loc197))
+#loc426 = loc("tmp100"(#loc198))
+#loc427 = loc("tmp101"(#loc199))
+#loc428 = loc("tmp104"(#loc200))
+#loc429 = loc("tmp107"(#loc201))
+#loc430 = loc("tmp109"(#loc202))
+#loc431 = loc("tmp110"(#loc203))
+#loc435 = loc("_tmp10"(#loc245))
diff --git a/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..8115df19745abae3957c66ab1adb50400f930743
--- /dev/null
+++ b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir
@@ -0,0 +1,547 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 4], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc1 = loc(unknown)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc147 = loc("in_out_ptr0"(#loc))
+#loc148 = loc("in_out_ptr1"(#loc))
+#loc149 = loc("in_ptr0"(#loc))
+#loc150 = loc("in_ptr1"(#loc))
+#loc151 = loc("in_ptr2"(#loc))
+#loc152 = loc("in_ptr3"(#loc))
+#loc153 = loc("in_ptr4"(#loc))
+#loc154 = loc("xnumel"(#loc))
+#loc155 = loc("r0_numel"(#loc))
+#loc185 = loc("tmp4"(#loc33))
+#loc187 = loc("tmp10"(#loc36))
+#loc292 = loc(callsite(#loc1 at #loc185))
+#loc294 = loc(callsite(#loc1 at #loc187))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4097> : tensor<1x64xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x64xbf16, #blocked1> loc(#loc1)
+    %cst_1 = arith.constant dense<1> : tensor<1x64xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<1> : tensor<1x64xi64, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<2> : tensor<1x64xi32, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<36864> : tensor<4x1xi32, #blocked> loc(#loc1)
+    %cst_5 = arith.constant dense<36864> : tensor<4x1xi32, #blocked1> loc(#loc1)
+    %cst_6 = arith.constant dense<128> : tensor<4x1xi32, #blocked> loc(#loc1)
+    %cst_7 = arith.constant dense<128> : tensor<4x1xi32, #blocked1> loc(#loc1)
+    %cst_8 = arith.constant dense<4096> : tensor<1x64xi32, #blocked> loc(#loc1)
+    %cst_9 = arith.constant dense<4096> : tensor<1x64xi32, #blocked1> loc(#loc1)
+    %cst_10 = arith.constant dense<128> : tensor<1x64xi32, #blocked> loc(#loc1)
+    %cst_11 = arith.constant dense<128> : tensor<1x64xi32, #blocked1> loc(#loc1)
+    %cst_12 = arith.constant dense<32> : tensor<4x1xi32, #blocked> loc(#loc1)
+    %cst_13 = arith.constant dense<32> : tensor<4x1xi32, #blocked1> loc(#loc1)
+    %c4_i32 = arith.constant 4 : i32 loc(#loc1)
+    %cst_14 = arith.constant dense<0.000000e+00> : tensor<4x64xbf16, #blocked1> loc(#loc1)
+    %cst_15 = arith.constant dense<0.000000e+00> : tensor<4x64xbf16, #blocked> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %cst_16 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32, #blocked1> loc(#loc1)
+    %cst_17 = arith.constant dense<1.280000e+02> : tensor<4x1xf32, #blocked1> loc(#loc1)
+    %cst_18 = arith.constant dense<0.000000e+00> : tensor<4x64xf32, #blocked> loc(#loc1)
+    %cst_19 = arith.constant dense<0.000000e+00> : tensor<4x64xf32, #blocked1> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc156)
+    %xoffset_20 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc157)
+    %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc158)
+    %xindex_21 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc158)
+    %xindex_22 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc158)
+    %xindex_23 = tt.expand_dims %xindex_21 {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4x1xi32, #blocked> loc(#loc158)
+    %xindex_24 = tt.splat %xoffset_20 : i32 -> tensor<4x1xi32, #blocked1> loc(#loc159)
+    %xindex_25 = tt.splat %xoffset_20 : i32 -> tensor<4x1xi32, #blocked> loc(#loc159)
+    %xindex_26 = arith.addi %xindex_24, %xindex_22 : tensor<4x1xi32, #blocked1> loc(#loc159)
+    %xindex_27 = arith.addi %xindex_25, %xindex_23 : tensor<4x1xi32, #blocked> loc(#loc159)
+    %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc160)
+    %r0_base_28 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc160)
+    %r0_base_29 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc160)
+    %r0_base_30 = tt.expand_dims %r0_base_28 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> loc(#loc160)
+    %x0 = arith.remsi %xindex_26, %cst_13 : tensor<4x1xi32, #blocked1> loc(#loc161)
+    %x0_31 = arith.remsi %xindex_27, %cst_12 : tensor<4x1xi32, #blocked> loc(#loc161)
+    %x1 = arith.divsi %xindex_26, %cst_13 : tensor<4x1xi32, #blocked1> loc(#loc162)
+    %x1_32 = arith.divsi %xindex_27, %cst_12 : tensor<4x1xi32, #blocked> loc(#loc162)
+    %tmp0 = arith.muli %x0, %cst_7 : tensor<4x1xi32, #blocked1> loc(#loc163)
+    %tmp0_33 = tt.broadcast %tmp0 : tensor<4x1xi32, #blocked1> -> tensor<4x64xi32, #blocked1> loc(#loc164)
+    %tmp0_34 = arith.muli %x1, %cst_5 : tensor<4x1xi32, #blocked1> loc(#loc165)
+    %tmp0_35 = tt.broadcast %tmp0_34 : tensor<4x1xi32, #blocked1> -> tensor<4x64xi32, #blocked1> loc(#loc166)
+    %tmp0_36 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x64x!tt.ptr<bf16>, #blocked1> loc(#loc167)
+    %_tmp10:2 = scf.for %_tmp10_51 = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%arg10 = %cst_19, %arg11 = %cst_19) -> (tensor<4x64xf32, #blocked1>, tensor<4x64xf32, #blocked1>)  : i32 {
+      %r0_index = tt.splat %_tmp10_51 : i32 -> tensor<1x64xi32, #blocked1> loc(#loc169)
+      %r0_index_52 = arith.addi %r0_index, %r0_base_29 : tensor<1x64xi32, #blocked1> loc(#loc169)
+      %r0_mask = arith.cmpi slt, %r0_index_52, %cst_11 : tensor<1x64xi32, #blocked1> loc(#loc170)
+      %tmp0_53 = arith.addi %r0_index_52, %cst_9 : tensor<1x64xi32, #blocked1> loc(#loc171)
+      %tmp0_54 = tt.broadcast %tmp0_53 : tensor<1x64xi32, #blocked1> -> tensor<4x64xi32, #blocked1> loc(#loc164)
+      %tmp0_55 = arith.addi %tmp0_54, %tmp0_33 : tensor<4x64xi32, #blocked1> loc(#loc164)
+      %tmp0_56 = arith.addi %tmp0_55, %tmp0_35 : tensor<4x64xi32, #blocked1> loc(#loc166)
+      %tmp0_57 = tt.addptr %tmp0_36, %tmp0_56 : tensor<4x64x!tt.ptr<bf16>, #blocked1>, tensor<4x64xi32, #blocked1> loc(#loc167)
+      %tmp0_58 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked1> -> tensor<4x64xi1, #blocked1> loc(#loc172)
+      %tmp0_59 = tt.load %tmp0_57, %tmp0_58, %cst_14 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>, #blocked1> loc(#loc172)
+      %tmp0_60 = arith.extf %tmp0_59 : tensor<4x64xbf16, #blocked1> to tensor<4x64xf32, #blocked1> loc(#loc173)
+      %tmp6 = tt.broadcast %r0_index_52 : tensor<1x64xi32, #blocked1> -> tensor<4x64xi32, #blocked1> loc(#loc174)
+      %tmp6_61 = arith.addi %tmp6, %tmp0_33 : tensor<4x64xi32, #blocked1> loc(#loc174)
+      %tmp6_62 = arith.addi %tmp6_61, %tmp0_35 : tensor<4x64xi32, #blocked1> loc(#loc175)
+      %tmp6_63 = tt.addptr %tmp0_36, %tmp6_62 : tensor<4x64x!tt.ptr<bf16>, #blocked1>, tensor<4x64xi32, #blocked1> loc(#loc176)
+      %tmp6_64 = tt.load %tmp6_63, %tmp0_58, %cst_14 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>, #blocked1> loc(#loc177)
+      %tmp6_65 = arith.extf %tmp6_64 : tensor<4x64xbf16, #blocked1> to tensor<4x64xf32, #blocked1> loc(#loc178)
+      %tmp2 = arith.mulf %tmp0_60, %tmp0_60 : tensor<4x64xf32, #blocked1> loc(#loc179)
+      %tmp5 = arith.addf %arg10, %tmp2 : tensor<4x64xf32, #blocked1> loc(#loc180)
+      %_tmp4 = arith.select %tmp0_58, %tmp5, %arg10 : tensor<4x64xi1, #blocked1>, tensor<4x64xf32, #blocked1> loc(#loc181)
+      %tmp8 = arith.mulf %tmp6_65, %tmp6_65 : tensor<4x64xf32, #blocked1> loc(#loc182)
+      %tmp11 = arith.addf %arg11, %tmp8 : tensor<4x64xf32, #blocked1> loc(#loc183)
+      %_tmp10_66 = arith.select %tmp0_58, %tmp11, %arg11 : tensor<4x64xi1, #blocked1>, tensor<4x64xf32, #blocked1> loc(#loc184)
+      scf.yield %_tmp4, %_tmp10_66 : tensor<4x64xf32, #blocked1>, tensor<4x64xf32, #blocked1> loc(#loc31)
+    } loc(#loc290)
+    %tmp4 = "tt.reduce"(%_tmp10#0) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_51: f32 loc(callsite(#loc1 at #loc185)), %tmp4_52: f32 loc(callsite(#loc1 at #loc185))):
+      %tmp4_53 = arith.addf %tmp4_51, %tmp4_52 : f32 loc(#loc297)
+      tt.reduce.return %tmp4_53 : f32 loc(#loc291)
+    }) : (tensor<4x64xf32, #blocked1>) -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc291)
+    %tmp4_37 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xf32, #blocked1> loc(#loc186)
+    %tmp10 = "tt.reduce"(%_tmp10#1) <{axis = 1 : i32}> ({
+    ^bb0(%tmp10_51: f32 loc(callsite(#loc1 at #loc187)), %tmp10_52: f32 loc(callsite(#loc1 at #loc187))):
+      %tmp10_53 = arith.addf %tmp10_51, %tmp10_52 : f32 loc(#loc298)
+      tt.reduce.return %tmp10_53 : f32 loc(#loc293)
+    }) : (tensor<4x64xf32, #blocked1>) -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc293)
+    %tmp10_38 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xf32, #blocked1> loc(#loc188)
+    %tmp50 = arith.muli %x0_31, %cst_6 : tensor<4x1xi32, #blocked> loc(#loc189)
+    %tmp50_39 = tt.broadcast %tmp50 : tensor<4x1xi32, #blocked> -> tensor<4x64xi32, #blocked> loc(#loc190)
+    %tmp50_40 = arith.muli %x1_32, %cst_4 : tensor<4x1xi32, #blocked> loc(#loc191)
+    %tmp50_41 = tt.broadcast %tmp50_40 : tensor<4x1xi32, #blocked> -> tensor<4x64xi32, #blocked> loc(#loc192)
+    %tmp50_42 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x64x!tt.ptr<bf16>, #blocked> loc(#loc193)
+    %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>, #blocked> loc(#loc194)
+    %tmp58_43 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>, #blocked1> loc(#loc194)
+    %tmp63 = arith.muli %x1, %cst_7 : tensor<4x1xi32, #blocked1> loc(#loc195)
+    %tmp63_44 = tt.broadcast %tmp63 : tensor<4x1xi32, #blocked1> -> tensor<4x64xi32, #blocked1> loc(#loc196)
+    %tmp63_45 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<4x64x!tt.ptr<f32>, #blocked1> loc(#loc197)
+    %tmp66 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<4x64x!tt.ptr<f32>, #blocked1> loc(#loc198)
+    %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>, #blocked> loc(#loc199)
+    %tmp102_46 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>, #blocked1> loc(#loc199)
+    %tmp20 = arith.divf %tmp10_38, %cst_17 : tensor<4x1xf32, #blocked1> loc(#loc200)
+    %tmp22 = arith.addf %tmp20, %cst_16 : tensor<4x1xf32, #blocked1> loc(#loc201)
+    %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32, #blocked1>) -> tensor<4x1xf32, #blocked1> loc(#loc202)
+    %tmp24 = ttg.convert_layout %tmp23 : tensor<4x1xf32, #blocked1> -> tensor<4x1xf32, #blocked> loc(#loc203)
+    %tmp24_47 = tt.broadcast %tmp24 : tensor<4x1xf32, #blocked> -> tensor<4x64xf32, #blocked> loc(#loc203)
+    %tmp24_48 = tt.broadcast %tmp23 : tensor<4x1xf32, #blocked1> -> tensor<4x64xf32, #blocked1> loc(#loc203)
+    %tmp72 = arith.divf %tmp4_37, %cst_17 : tensor<4x1xf32, #blocked1> loc(#loc204)
+    %tmp73 = arith.addf %tmp72, %cst_16 : tensor<4x1xf32, #blocked1> loc(#loc205)
+    %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32, #blocked1>) -> tensor<4x1xf32, #blocked1> loc(#loc206)
+    %tmp75 = ttg.convert_layout %tmp74 : tensor<4x1xf32, #blocked1> -> tensor<4x1xf32, #blocked> loc(#loc207)
+    %tmp75_49 = tt.broadcast %tmp75 : tensor<4x1xf32, #blocked> -> tensor<4x64xf32, #blocked> loc(#loc207)
+    %tmp75_50 = tt.broadcast %tmp74 : tensor<4x1xf32, #blocked1> -> tensor<4x64xf32, #blocked1> loc(#loc207)
+    %0 = arith.muli %xindex_26, %cst_7 : tensor<4x1xi32, #blocked1> loc(#loc57)
+    %1 = tt.broadcast %0 : tensor<4x1xi32, #blocked1> -> tensor<4x64xi32, #blocked1> loc(#loc58)
+    %2 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<4x64x!tt.ptr<bf16>, #blocked1> loc(#loc59)
+    %3 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<4x64x!tt.ptr<bf16>, #blocked1> loc(#loc60)
+    scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32, #blocked1> loc(#loc208)
+      %r0_index_51 = tt.splat %r0_offset : i32 -> tensor<1x64xi32, #blocked> loc(#loc208)
+      %r0_index_52 = arith.addi %r0_index, %r0_base_29 : tensor<1x64xi32, #blocked1> loc(#loc208)
+      %r0_index_53 = arith.addi %r0_index_51, %r0_base_30 : tensor<1x64xi32, #blocked> loc(#loc208)
+      %r0_mask = arith.cmpi slt, %r0_index_52, %cst_11 : tensor<1x64xi32, #blocked1> loc(#loc209)
+      %r0_mask_54 = arith.cmpi slt, %r0_index_53, %cst_10 : tensor<1x64xi32, #blocked> loc(#loc209)
+      %r0_3 = arith.remsi %r0_index_53, %cst_3 : tensor<1x64xi32, #blocked> loc(#loc210)
+      %r0_4 = arith.divsi %r0_index_53, %cst_3 : tensor<1x64xi32, #blocked> loc(#loc211)
+      %tmp50_55 = tt.broadcast %r0_index_52 : tensor<1x64xi32, #blocked1> -> tensor<4x64xi32, #blocked1> loc(#loc190)
+      %tmp50_56 = arith.addi %tmp50_55, %tmp0_33 : tensor<4x64xi32, #blocked1> loc(#loc190)
+      %tmp50_57 = arith.addi %tmp50_56, %tmp0_35 : tensor<4x64xi32, #blocked1> loc(#loc192)
+      %tmp50_58 = tt.addptr %tmp0_36, %tmp50_57 : tensor<4x64x!tt.ptr<bf16>, #blocked1>, tensor<4x64xi32, #blocked1> loc(#loc193)
+      %tmp50_59 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked1> -> tensor<4x64xi1, #blocked1> loc(#loc212)
+      %tmp50_60 = tt.load %tmp50_58, %tmp50_59, %cst_14 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>, #blocked1> loc(#loc212)
+      %tmp50_61 = arith.extf %tmp50_60 : tensor<4x64xbf16, #blocked1> to tensor<4x64xf32, #blocked1> loc(#loc213)
+      %tmp58_62 = tt.addptr %tmp58_43, %r0_index_52 : tensor<1x64x!tt.ptr<bf16>, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc194)
+      %tmp58_63 = tt.load %tmp58_62, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x64x!tt.ptr<bf16>, #blocked1> loc(#loc214)
+      %tmp58_64 = arith.extf %tmp58_63 : tensor<1x64xbf16, #blocked1> to tensor<1x64xf32, #blocked1> loc(#loc215)
+      %tmp63_65 = arith.addi %tmp50_55, %tmp63_44 : tensor<4x64xi32, #blocked1> loc(#loc196)
+      %tmp63_66 = tt.addptr %tmp63_45, %tmp63_65 : tensor<4x64x!tt.ptr<f32>, #blocked1>, tensor<4x64xi32, #blocked1> loc(#loc197)
+      %tmp63_67 = tt.load %tmp63_66, %tmp50_59, %cst_19 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<f32>, #blocked1> loc(#loc216)
+      %tmp66_68 = tt.addptr %tmp66, %tmp63_65 : tensor<4x64x!tt.ptr<f32>, #blocked1>, tensor<4x64xi32, #blocked1> loc(#loc198)
+      %tmp66_69 = tt.load %tmp66_68, %tmp50_59, %cst_19 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<f32>, #blocked1> loc(#loc217)
+      %tmp66_70 = ttg.convert_layout %tmp66_69 : tensor<4x64xf32, #blocked1> -> tensor<4x64xf32, #blocked> loc(#loc217)
+      %tmp96 = arith.addi %r0_index_52, %cst_9 : tensor<1x64xi32, #blocked1> loc(#loc218)
+      %tmp96_71 = tt.broadcast %tmp96 : tensor<1x64xi32, #blocked1> -> tensor<4x64xi32, #blocked1> loc(#loc219)
+      %tmp96_72 = arith.addi %tmp96_71, %tmp0_33 : tensor<4x64xi32, #blocked1> loc(#loc219)
+      %tmp96_73 = arith.addi %tmp96_72, %tmp0_35 : tensor<4x64xi32, #blocked1> loc(#loc220)
+      %tmp96_74 = tt.addptr %tmp0_36, %tmp96_73 : tensor<4x64x!tt.ptr<bf16>, #blocked1>, tensor<4x64xi32, #blocked1> loc(#loc221)
+      %tmp96_75 = tt.load %tmp96_74, %tmp50_59, %cst_14 evictionPolicy = evict_first : tensor<4x64x!tt.ptr<bf16>, #blocked1> loc(#loc222)
+      %tmp96_76 = arith.extf %tmp96_75 : tensor<4x64xbf16, #blocked1> to tensor<4x64xf32, #blocked1> loc(#loc223)
+      %tmp102_77 = tt.addptr %tmp102_46, %r0_index_52 : tensor<1x64x!tt.ptr<bf16>, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc199)
+      %tmp102_78 = tt.load %tmp102_77, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x64x!tt.ptr<bf16>, #blocked1> loc(#loc224)
+      %tmp102_79 = arith.extf %tmp102_78 : tensor<1x64xbf16, #blocked1> to tensor<1x64xf32, #blocked1> loc(#loc225)
+      %tmp16 = arith.extsi %r0_3 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked> loc(#loc226)
+      %tmp16_80 = arith.cmpi slt, %tmp16, %cst_2 : tensor<1x64xi64, #blocked> loc(#loc226)
+      %tmp17 = arith.muli %r0_4, %cst_3 : tensor<1x64xi32, #blocked> loc(#loc227)
+      %tmp17_81 = arith.addi %tmp17, %cst_1 : tensor<1x64xi32, #blocked> loc(#loc228)
+      %tmp17_82 = tt.broadcast %tmp17_81 : tensor<1x64xi32, #blocked> -> tensor<4x64xi32, #blocked> loc(#loc229)
+      %tmp17_83 = arith.addi %tmp17_82, %tmp50_39 : tensor<4x64xi32, #blocked> loc(#loc229)
+      %tmp17_84 = arith.addi %tmp17_83, %tmp50_41 : tensor<4x64xi32, #blocked> loc(#loc230)
+      %tmp17_85 = tt.addptr %tmp50_42, %tmp17_84 : tensor<4x64x!tt.ptr<bf16>, #blocked>, tensor<4x64xi32, #blocked> loc(#loc231)
+      %tmp17_86 = arith.andi %r0_mask_54, %tmp16_80 : tensor<1x64xi1, #blocked> loc(#loc232)
+      %tmp17_87 = tt.broadcast %tmp17_86 : tensor<1x64xi1, #blocked> -> tensor<4x64xi1, #blocked> loc(#loc233)
+      %tmp17_88 = tt.load %tmp17_85, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>, #blocked> loc(#loc233)
+      %tmp17_89 = arith.extf %tmp17_88 : tensor<4x64xbf16, #blocked> to tensor<4x64xf32, #blocked> loc(#loc234)
+      %tmp24_90 = arith.mulf %tmp17_89, %tmp24_47 : tensor<4x64xf32, #blocked> loc(#loc203)
+      %tmp25 = tt.addptr %tmp58, %tmp17_81 : tensor<1x64x!tt.ptr<bf16>, #blocked>, tensor<1x64xi32, #blocked> loc(#loc235)
+      %tmp25_91 = tt.broadcast %tmp25 : tensor<1x64x!tt.ptr<bf16>, #blocked> -> tensor<4x64x!tt.ptr<bf16>, #blocked> loc(#loc235)
+      %tmp25_92 = tt.load %tmp25_91, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>, #blocked> loc(#loc236)
+      %tmp25_93 = arith.extf %tmp25_92 : tensor<4x64xbf16, #blocked> to tensor<4x64xf32, #blocked> loc(#loc237)
+      %tmp27 = arith.mulf %tmp24_90, %tmp25_93 : tensor<4x64xf32, #blocked> loc(#loc238)
+      %tmp29 = arith.subf %cst_18, %tmp27 : tensor<4x64xf32, #blocked> loc(#loc239)
+      %tmp31 = tt.broadcast %tmp16_80 : tensor<1x64xi1, #blocked> -> tensor<4x64xi1, #blocked> loc(#loc240)
+      %tmp32 = arith.cmpi sge, %tmp16, %cst_2 : tensor<1x64xi64, #blocked> loc(#loc241)
+      %tmp35 = tt.broadcast %tmp17 : tensor<1x64xi32, #blocked> -> tensor<4x64xi32, #blocked> loc(#loc242)
+      %tmp35_94 = arith.addi %tmp35, %tmp50_39 : tensor<4x64xi32, #blocked> loc(#loc242)
+      %tmp35_95 = arith.addi %tmp35_94, %tmp50_41 : tensor<4x64xi32, #blocked> loc(#loc243)
+      %tmp35_96 = tt.addptr %tmp50_42, %tmp35_95 : tensor<4x64x!tt.ptr<bf16>, #blocked>, tensor<4x64xi32, #blocked> loc(#loc244)
+      %tmp35_97 = arith.andi %r0_mask_54, %tmp32 : tensor<1x64xi1, #blocked> loc(#loc245)
+      %tmp35_98 = tt.broadcast %tmp35_97 : tensor<1x64xi1, #blocked> -> tensor<4x64xi1, #blocked> loc(#loc246)
+      %tmp35_99 = tt.load %tmp35_96, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>, #blocked> loc(#loc246)
+      %tmp35_100 = arith.extf %tmp35_99 : tensor<4x64xbf16, #blocked> to tensor<4x64xf32, #blocked> loc(#loc247)
+      %tmp42 = arith.mulf %tmp35_100, %tmp24_47 : tensor<4x64xf32, #blocked> loc(#loc248)
+      %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x64x!tt.ptr<bf16>, #blocked>, tensor<1x64xi32, #blocked> loc(#loc249)
+      %tmp43_101 = tt.broadcast %tmp43 : tensor<1x64x!tt.ptr<bf16>, #blocked> -> tensor<4x64x!tt.ptr<bf16>, #blocked> loc(#loc249)
+      %tmp43_102 = tt.load %tmp43_101, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>, #blocked> loc(#loc250)
+      %tmp43_103 = arith.extf %tmp43_102 : tensor<4x64xbf16, #blocked> to tensor<4x64xf32, #blocked> loc(#loc251)
+      %tmp45 = arith.mulf %tmp42, %tmp43_103 : tensor<4x64xf32, #blocked> loc(#loc252)
+      %tmp48 = tt.broadcast %tmp32 : tensor<1x64xi1, #blocked> -> tensor<4x64xi1, #blocked> loc(#loc253)
+      %tmp48_104 = arith.select %tmp48, %tmp45, %cst_18 : tensor<4x64xi1, #blocked>, tensor<4x64xf32, #blocked> loc(#loc253)
+      %tmp49 = arith.select %tmp31, %tmp29, %tmp48_104 : tensor<4x64xi1, #blocked>, tensor<4x64xf32, #blocked> loc(#loc295)
+      %tmp57 = arith.mulf %tmp50_61, %tmp24_48 : tensor<4x64xf32, #blocked1> loc(#loc255)
+      %tmp60 = tt.broadcast %tmp58_64 : tensor<1x64xf32, #blocked1> -> tensor<4x64xf32, #blocked1> loc(#loc256)
+      %tmp60_105 = arith.mulf %tmp57, %tmp60 : tensor<4x64xf32, #blocked1> loc(#loc256)
+      %tmp64 = arith.mulf %tmp60_105, %tmp63_67 : tensor<4x64xf32, #blocked1> loc(#loc257)
+      %tmp64_106 = ttg.convert_layout %tmp64 : tensor<4x64xf32, #blocked1> -> tensor<4x64xf32, #blocked> loc(#loc257)
+      %tmp67 = arith.mulf %tmp49, %tmp66_70 : tensor<4x64xf32, #blocked> loc(#loc258)
+      %tmp68 = arith.addf %tmp64_106, %tmp67 : tensor<4x64xf32, #blocked> loc(#loc259)
+      %tmp70 = arith.addi %tmp17, %cst : tensor<1x64xi32, #blocked> loc(#loc260)
+      %tmp70_107 = tt.broadcast %tmp70 : tensor<1x64xi32, #blocked> -> tensor<4x64xi32, #blocked> loc(#loc261)
+      %tmp70_108 = arith.addi %tmp70_107, %tmp50_39 : tensor<4x64xi32, #blocked> loc(#loc261)
+      %tmp70_109 = arith.addi %tmp70_108, %tmp50_41 : tensor<4x64xi32, #blocked> loc(#loc262)
+      %tmp70_110 = tt.addptr %tmp50_42, %tmp70_109 : tensor<4x64x!tt.ptr<bf16>, #blocked>, tensor<4x64xi32, #blocked> loc(#loc263)
+      %tmp70_111 = tt.load %tmp70_110, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>, #blocked> loc(#loc264)
+      %tmp70_112 = arith.extf %tmp70_111 : tensor<4x64xbf16, #blocked> to tensor<4x64xf32, #blocked> loc(#loc265)
+      %tmp75_113 = arith.mulf %tmp70_112, %tmp75_49 : tensor<4x64xf32, #blocked> loc(#loc207)
+      %tmp76 = tt.addptr %tmp102, %tmp17_81 : tensor<1x64x!tt.ptr<bf16>, #blocked>, tensor<1x64xi32, #blocked> loc(#loc266)
+      %tmp76_114 = tt.broadcast %tmp76 : tensor<1x64x!tt.ptr<bf16>, #blocked> -> tensor<4x64x!tt.ptr<bf16>, #blocked> loc(#loc266)
+      %tmp76_115 = tt.load %tmp76_114, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>, #blocked> loc(#loc267)
+      %tmp76_116 = arith.extf %tmp76_115 : tensor<4x64xbf16, #blocked> to tensor<4x64xf32, #blocked> loc(#loc268)
+      %tmp78 = arith.mulf %tmp75_113, %tmp76_116 : tensor<4x64xf32, #blocked> loc(#loc269)
+      %tmp80 = arith.subf %cst_18, %tmp78 : tensor<4x64xf32, #blocked> loc(#loc270)
+      %tmp83 = arith.addi %tmp17, %cst_8 : tensor<1x64xi32, #blocked> loc(#loc271)
+      %tmp83_117 = tt.broadcast %tmp83 : tensor<1x64xi32, #blocked> -> tensor<4x64xi32, #blocked> loc(#loc272)
+      %tmp83_118 = arith.addi %tmp83_117, %tmp50_39 : tensor<4x64xi32, #blocked> loc(#loc272)
+      %tmp83_119 = arith.addi %tmp83_118, %tmp50_41 : tensor<4x64xi32, #blocked> loc(#loc273)
+      %tmp83_120 = tt.addptr %tmp50_42, %tmp83_119 : tensor<4x64x!tt.ptr<bf16>, #blocked>, tensor<4x64xi32, #blocked> loc(#loc274)
+      %tmp83_121 = tt.load %tmp83_120, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>, #blocked> loc(#loc275)
+      %tmp83_122 = arith.extf %tmp83_121 : tensor<4x64xbf16, #blocked> to tensor<4x64xf32, #blocked> loc(#loc276)
+      %tmp88 = arith.mulf %tmp83_122, %tmp75_49 : tensor<4x64xf32, #blocked> loc(#loc277)
+      %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x64x!tt.ptr<bf16>, #blocked>, tensor<1x64xi32, #blocked> loc(#loc278)
+      %tmp89_123 = tt.broadcast %tmp89 : tensor<1x64x!tt.ptr<bf16>, #blocked> -> tensor<4x64x!tt.ptr<bf16>, #blocked> loc(#loc278)
+      %tmp89_124 = tt.load %tmp89_123, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>, #blocked> loc(#loc279)
+      %tmp89_125 = arith.extf %tmp89_124 : tensor<4x64xbf16, #blocked> to tensor<4x64xf32, #blocked> loc(#loc280)
+      %tmp91 = arith.mulf %tmp88, %tmp89_125 : tensor<4x64xf32, #blocked> loc(#loc281)
+      %tmp94 = arith.select %tmp48, %tmp91, %cst_18 : tensor<4x64xi1, #blocked>, tensor<4x64xf32, #blocked> loc(#loc282)
+      %tmp95 = arith.select %tmp31, %tmp80, %tmp94 : tensor<4x64xi1, #blocked>, tensor<4x64xf32, #blocked> loc(#loc296)
+      %tmp101 = arith.mulf %tmp96_76, %tmp75_50 : tensor<4x64xf32, #blocked1> loc(#loc285)
+      %tmp104 = tt.broadcast %tmp102_79 : tensor<1x64xf32, #blocked1> -> tensor<4x64xf32, #blocked1> loc(#loc286)
+      %tmp104_126 = arith.mulf %tmp101, %tmp104 : tensor<4x64xf32, #blocked1> loc(#loc286)
+      %tmp107 = arith.mulf %tmp104_126, %tmp63_67 : tensor<4x64xf32, #blocked1> loc(#loc287)
+      %tmp107_127 = ttg.convert_layout %tmp107 : tensor<4x64xf32, #blocked1> -> tensor<4x64xf32, #blocked> loc(#loc287)
+      %tmp109 = arith.mulf %tmp95, %tmp66_70 : tensor<4x64xf32, #blocked> loc(#loc288)
+      %tmp110 = arith.addf %tmp107_127, %tmp109 : tensor<4x64xf32, #blocked> loc(#loc289)
+      %4 = arith.addi %tmp50_55, %1 : tensor<4x64xi32, #blocked1> loc(#loc58)
+      %5 = tt.addptr %2, %4 : tensor<4x64x!tt.ptr<bf16>, #blocked1>, tensor<4x64xi32, #blocked1> loc(#loc59)
+      %6 = arith.truncf %tmp68 : tensor<4x64xf32, #blocked> to tensor<4x64xbf16, #blocked> loc(#loc144)
+      %7 = ttg.convert_layout %6 : tensor<4x64xbf16, #blocked> -> tensor<4x64xbf16, #blocked1> loc(#loc144)
+      tt.store %5, %7, %tmp50_59 : tensor<4x64x!tt.ptr<bf16>, #blocked1> loc(#loc144)
+      %8 = tt.addptr %3, %4 : tensor<4x64x!tt.ptr<bf16>, #blocked1>, tensor<4x64xi32, #blocked1> loc(#loc60)
+      %9 = arith.truncf %tmp110 : tensor<4x64xf32, #blocked> to tensor<4x64xbf16, #blocked> loc(#loc145)
+      %10 = ttg.convert_layout %9 : tensor<4x64xbf16, #blocked> -> tensor<4x64xbf16, #blocked1> loc(#loc145)
+      tt.store %8, %10, %tmp50_59 : tensor<4x64x!tt.ptr<bf16>, #blocked1> loc(#loc145)
+    } loc(#loc61)
+    tt.return loc(#loc146)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8)
+#loc32 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc156 = loc("xoffset"(#loc2))
+#loc157 = loc("xoffset"(#loc3))
+#loc158 = loc("xindex"(#loc4))
+#loc159 = loc("xindex"(#loc5))
+#loc160 = loc("r0_base"(#loc6))
+#loc161 = loc("x0"(#loc7))
+#loc162 = loc("x1"(#loc8))
+#loc163 = loc("tmp0"(#loc9))
+#loc164 = loc("tmp0"(#loc10))
+#loc165 = loc("tmp0"(#loc11))
+#loc166 = loc("tmp0"(#loc12))
+#loc167 = loc("tmp0"(#loc13))
+#loc168 = loc("_tmp4"(#loc14))
+#loc169 = loc("r0_index"(#loc15))
+#loc170 = loc("r0_mask"(#loc16))
+#loc171 = loc("tmp0"(#loc17))
+#loc172 = loc("tmp0"(#loc18))
+#loc173 = loc("tmp0"(#loc19))
+#loc174 = loc("tmp6"(#loc20))
+#loc175 = loc("tmp6"(#loc21))
+#loc176 = loc("tmp6"(#loc22))
+#loc177 = loc("tmp6"(#loc23))
+#loc178 = loc("tmp6"(#loc24))
+#loc179 = loc("tmp2"(#loc25))
+#loc180 = loc("tmp5"(#loc26))
+#loc181 = loc("_tmp4"(#loc27))
+#loc182 = loc("tmp8"(#loc28))
+#loc183 = loc("tmp11"(#loc29))
+#loc184 = loc("_tmp10"(#loc30))
+#loc186 = loc("tmp4"(#loc35))
+#loc188 = loc("tmp10"(#loc37))
+#loc189 = loc("tmp50"(#loc38))
+#loc190 = loc("tmp50"(#loc39))
+#loc191 = loc("tmp50"(#loc40))
+#loc192 = loc("tmp50"(#loc41))
+#loc193 = loc("tmp50"(#loc42))
+#loc194 = loc("tmp58"(#loc43))
+#loc195 = loc("tmp63"(#loc44))
+#loc196 = loc("tmp63"(#loc45))
+#loc197 = loc("tmp63"(#loc46))
+#loc198 = loc("tmp66"(#loc47))
+#loc199 = loc("tmp102"(#loc48))
+#loc200 = loc("tmp20"(#loc49))
+#loc201 = loc("tmp22"(#loc50))
+#loc202 = loc("tmp23"(#loc51))
+#loc203 = loc("tmp24"(#loc52))
+#loc204 = loc("tmp72"(#loc53))
+#loc205 = loc("tmp73"(#loc54))
+#loc206 = loc("tmp74"(#loc55))
+#loc207 = loc("tmp75"(#loc56))
+#loc208 = loc("r0_index"(#loc62))
+#loc209 = loc("r0_mask"(#loc63))
+#loc210 = loc("r0_3"(#loc64))
+#loc211 = loc("r0_4"(#loc65))
+#loc212 = loc("tmp50"(#loc66))
+#loc213 = loc("tmp50"(#loc67))
+#loc214 = loc("tmp58"(#loc68))
+#loc215 = loc("tmp58"(#loc69))
+#loc216 = loc("tmp63"(#loc70))
+#loc217 = loc("tmp66"(#loc71))
+#loc218 = loc("tmp96"(#loc72))
+#loc219 = loc("tmp96"(#loc73))
+#loc220 = loc("tmp96"(#loc74))
+#loc221 = loc("tmp96"(#loc75))
+#loc222 = loc("tmp96"(#loc76))
+#loc223 = loc("tmp96"(#loc77))
+#loc224 = loc("tmp102"(#loc78))
+#loc225 = loc("tmp102"(#loc79))
+#loc226 = loc("tmp16"(#loc80))
+#loc227 = loc("tmp17"(#loc81))
+#loc228 = loc("tmp17"(#loc82))
+#loc229 = loc("tmp17"(#loc83))
+#loc230 = loc("tmp17"(#loc84))
+#loc231 = loc("tmp17"(#loc85))
+#loc232 = loc("tmp17"(#loc86))
+#loc233 = loc("tmp17"(#loc87))
+#loc234 = loc("tmp17"(#loc88))
+#loc235 = loc("tmp25"(#loc89))
+#loc236 = loc("tmp25"(#loc90))
+#loc237 = loc("tmp25"(#loc91))
+#loc238 = loc("tmp27"(#loc92))
+#loc239 = loc("tmp29"(#loc93))
+#loc240 = loc("tmp31"(#loc94))
+#loc241 = loc("tmp32"(#loc95))
+#loc242 = loc("tmp35"(#loc96))
+#loc243 = loc("tmp35"(#loc97))
+#loc244 = loc("tmp35"(#loc98))
+#loc245 = loc("tmp35"(#loc99))
+#loc246 = loc("tmp35"(#loc100))
+#loc247 = loc("tmp35"(#loc101))
+#loc248 = loc("tmp42"(#loc102))
+#loc249 = loc("tmp43"(#loc103))
+#loc250 = loc("tmp43"(#loc104))
+#loc251 = loc("tmp43"(#loc105))
+#loc252 = loc("tmp45"(#loc106))
+#loc253 = loc("tmp48"(#loc107))
+#loc254 = loc("tmp49"(#loc108))
+#loc255 = loc("tmp57"(#loc109))
+#loc256 = loc("tmp60"(#loc110))
+#loc257 = loc("tmp64"(#loc111))
+#loc258 = loc("tmp67"(#loc112))
+#loc259 = loc("tmp68"(#loc113))
+#loc260 = loc("tmp70"(#loc114))
+#loc261 = loc("tmp70"(#loc115))
+#loc262 = loc("tmp70"(#loc116))
+#loc263 = loc("tmp70"(#loc117))
+#loc264 = loc("tmp70"(#loc118))
+#loc265 = loc("tmp70"(#loc119))
+#loc266 = loc("tmp76"(#loc120))
+#loc267 = loc("tmp76"(#loc121))
+#loc268 = loc("tmp76"(#loc122))
+#loc269 = loc("tmp78"(#loc123))
+#loc270 = loc("tmp80"(#loc124))
+#loc271 = loc("tmp83"(#loc125))
+#loc272 = loc("tmp83"(#loc126))
+#loc273 = loc("tmp83"(#loc127))
+#loc274 = loc("tmp83"(#loc128))
+#loc275 = loc("tmp83"(#loc129))
+#loc276 = loc("tmp83"(#loc130))
+#loc277 = loc("tmp88"(#loc131))
+#loc278 = loc("tmp89"(#loc132))
+#loc279 = loc("tmp89"(#loc133))
+#loc280 = loc("tmp89"(#loc134))
+#loc281 = loc("tmp91"(#loc135))
+#loc282 = loc("tmp94"(#loc136))
+#loc283 = loc("tmp95"(#loc137))
+#loc284 = loc("tmp82"(#loc138))
+#loc285 = loc("tmp101"(#loc139))
+#loc286 = loc("tmp104"(#loc140))
+#loc287 = loc("tmp107"(#loc141))
+#loc288 = loc("tmp109"(#loc142))
+#loc289 = loc("tmp110"(#loc143))
+#loc290 = loc("_tmp10"(#loc168))
+#loc291 = loc(callsite(#loc32 at #loc185))
+#loc293 = loc(callsite(#loc32 at #loc187))
+#loc295 = loc(fused[#loc254, #loc240])
+#loc296 = loc(fused[#loc283, #loc284])
+#loc297 = loc(callsite(#loc34 at #loc291))
+#loc298 = loc(callsite(#loc34 at #loc293))
diff --git a/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..f6dcf3e5eba5525eb089d70ff0a8d18a14ef8875
--- /dev/null
+++ b/triton/AYSZDJBZHGD4X4V6Y5K2ZI7KV3J3O3ME6MPETVGALPCMDZX3DFIA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir
@@ -0,0 +1,520 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc1 = loc(unknown)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc149 = loc("in_out_ptr0"(#loc))
+#loc150 = loc("in_out_ptr1"(#loc))
+#loc151 = loc("in_ptr0"(#loc))
+#loc152 = loc("in_ptr1"(#loc))
+#loc153 = loc("in_ptr2"(#loc))
+#loc154 = loc("in_ptr3"(#loc))
+#loc155 = loc("in_ptr4"(#loc))
+#loc156 = loc("xnumel"(#loc))
+#loc157 = loc("r0_numel"(#loc))
+#loc189 = loc("tmp4"(#loc35))
+#loc191 = loc("tmp10"(#loc38))
+#loc296 = loc(callsite(#loc1 at #loc189))
+#loc298 = loc(callsite(#loc1 at #loc191))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<1x64xbf16> loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<4x64xbf16> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %cst_1 = arith.constant dense<4097> : tensor<1x64xi32> loc(#loc1)
+    %cst_2 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc1)
+    %cst_3 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc1)
+    %cst_5 = arith.constant dense<1> : tensor<1x64xi64> loc(#loc1)
+    %cst_6 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc1)
+    %cst_7 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc1)
+    %cst_8 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc1)
+    %cst_9 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc1)
+    %cst_10 = arith.constant dense<128> : tensor<1x64xi32> loc(#loc1)
+    %cst_11 = arith.constant dense<0.000000e+00> : tensor<4x64xf32> loc(#loc1)
+    %cst_12 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc1)
+    %c4_i32 = arith.constant 4 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc158)
+    %xoffset_13 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc159)
+    %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc160)
+    %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc161)
+    %xindex_15 = tt.splat %xoffset_13 : i32 -> tensor<4x1xi32> loc(#loc162)
+    %xindex_16 = arith.addi %xindex_15, %xindex_14 : tensor<4x1xi32> loc(#loc162)
+    %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc163)
+    %r0_base_17 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc164)
+    %x0 = arith.remsi %xindex_16, %cst_12 : tensor<4x1xi32> loc(#loc165)
+    %x1 = arith.divsi %xindex_16, %cst_12 : tensor<4x1xi32> loc(#loc166)
+    %_tmp10:2 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%_tmp4 = %cst_11, %_tmp10_20 = %cst_11) -> (tensor<4x64xf32>, tensor<4x64xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc168)
+      %r0_index_21 = arith.addi %r0_index, %r0_base_17 : tensor<1x64xi32> loc(#loc168)
+      %r0_mask = arith.cmpi slt, %r0_index_21, %cst_10 : tensor<1x64xi32> loc(#loc169)
+      %tmp0 = arith.addi %r0_index_21, %cst_9 : tensor<1x64xi32> loc(#loc170)
+      %tmp0_22 = arith.muli %x0, %cst_8 : tensor<4x1xi32> loc(#loc171)
+      %tmp0_23 = tt.broadcast %tmp0 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc172)
+      %tmp0_24 = tt.broadcast %tmp0_22 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc172)
+      %tmp0_25 = arith.addi %tmp0_23, %tmp0_24 : tensor<4x64xi32> loc(#loc172)
+      %tmp0_26 = arith.muli %x1, %cst_7 : tensor<4x1xi32> loc(#loc173)
+      %tmp0_27 = tt.broadcast %tmp0_26 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc174)
+      %tmp0_28 = arith.addi %tmp0_25, %tmp0_27 : tensor<4x64xi32> loc(#loc174)
+      %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x64x!tt.ptr<bf16>> loc(#loc175)
+      %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<4x64x!tt.ptr<bf16>>, tensor<4x64xi32> loc(#loc175)
+      %tmp0_31 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc176)
+      %tmp0_32 = tt.load %tmp0_30, %tmp0_31, %cst_0 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>> loc(#loc176)
+      %tmp0_33 = arith.extf %tmp0_32 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc177)
+      %tmp6 = tt.broadcast %r0_index_21 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc178)
+      %tmp6_34 = arith.addi %tmp6, %tmp0_24 : tensor<4x64xi32> loc(#loc178)
+      %tmp6_35 = arith.addi %tmp6_34, %tmp0_27 : tensor<4x64xi32> loc(#loc179)
+      %tmp6_36 = tt.addptr %tmp0_29, %tmp6_35 : tensor<4x64x!tt.ptr<bf16>>, tensor<4x64xi32> loc(#loc180)
+      %tmp6_37 = tt.load %tmp6_36, %tmp0_31, %cst_0 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>> loc(#loc181)
+      %tmp6_38 = arith.extf %tmp6_37 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc182)
+      %tmp2 = arith.mulf %tmp0_33, %tmp0_33 : tensor<4x64xf32> loc(#loc183)
+      %tmp5 = arith.addf %_tmp4, %tmp2 : tensor<4x64xf32> loc(#loc184)
+      %_tmp4_39 = arith.select %tmp0_31, %tmp5, %_tmp4 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc185)
+      %tmp8 = arith.mulf %tmp6_38, %tmp6_38 : tensor<4x64xf32> loc(#loc186)
+      %tmp11 = arith.addf %_tmp10_20, %tmp8 : tensor<4x64xf32> loc(#loc187)
+      %_tmp10_40 = arith.select %tmp0_31, %tmp11, %_tmp10_20 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc188)
+      scf.yield %_tmp4_39, %_tmp10_40 : tensor<4x64xf32>, tensor<4x64xf32> loc(#loc33)
+    } loc(#loc294)
+    %tmp4 = "tt.reduce"(%_tmp10#0) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_20: f32 loc(callsite(#loc1 at #loc189)), %tmp4_21: f32 loc(callsite(#loc1 at #loc189))):
+      %tmp4_22 = arith.addf %tmp4_20, %tmp4_21 : f32 loc(#loc299)
+      tt.reduce.return %tmp4_22 : f32 loc(#loc295)
+    }) : (tensor<4x64xf32>) -> tensor<4xf32> loc(#loc295)
+    %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc190)
+    %tmp10 = "tt.reduce"(%_tmp10#1) <{axis = 1 : i32}> ({
+    ^bb0(%tmp10_20: f32 loc(callsite(#loc1 at #loc191)), %tmp10_21: f32 loc(callsite(#loc1 at #loc191))):
+      %tmp10_22 = arith.addf %tmp10_20, %tmp10_21 : f32 loc(#loc300)
+      tt.reduce.return %tmp10_22 : f32 loc(#loc297)
+    }) : (tensor<4x64xf32>) -> tensor<4xf32> loc(#loc297)
+    %tmp10_19 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc192)
+    scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc193)
+      %r0_index_20 = arith.addi %r0_index, %r0_base_17 : tensor<1x64xi32> loc(#loc193)
+      %r0_mask = arith.cmpi slt, %r0_index_20, %cst_10 : tensor<1x64xi32> loc(#loc194)
+      %r0_3 = arith.remsi %r0_index_20, %cst_6 : tensor<1x64xi32> loc(#loc195)
+      %r0_4 = arith.divsi %r0_index_20, %cst_6 : tensor<1x64xi32> loc(#loc196)
+      %tmp50 = arith.muli %x0, %cst_8 : tensor<4x1xi32> loc(#loc197)
+      %tmp50_21 = tt.broadcast %r0_index_20 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc198)
+      %tmp50_22 = tt.broadcast %tmp50 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc198)
+      %tmp50_23 = arith.addi %tmp50_21, %tmp50_22 : tensor<4x64xi32> loc(#loc198)
+      %tmp50_24 = arith.muli %x1, %cst_7 : tensor<4x1xi32> loc(#loc199)
+      %tmp50_25 = tt.broadcast %tmp50_24 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc200)
+      %tmp50_26 = arith.addi %tmp50_23, %tmp50_25 : tensor<4x64xi32> loc(#loc200)
+      %tmp50_27 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x64x!tt.ptr<bf16>> loc(#loc201)
+      %tmp50_28 = tt.addptr %tmp50_27, %tmp50_26 : tensor<4x64x!tt.ptr<bf16>>, tensor<4x64xi32> loc(#loc201)
+      %tmp50_29 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc202)
+      %tmp50_30 = tt.load %tmp50_28, %tmp50_29, %cst_0 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>> loc(#loc202)
+      %tmp50_31 = arith.extf %tmp50_30 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc203)
+      %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>> loc(#loc204)
+      %tmp58_32 = tt.addptr %tmp58, %r0_index_20 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc204)
+      %tmp58_33 = tt.load %tmp58_32, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x64x!tt.ptr<bf16>> loc(#loc205)
+      %tmp58_34 = arith.extf %tmp58_33 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc206)
+      %tmp63 = arith.muli %x1, %cst_8 : tensor<4x1xi32> loc(#loc207)
+      %tmp63_35 = tt.broadcast %tmp63 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc208)
+      %tmp63_36 = arith.addi %tmp50_21, %tmp63_35 : tensor<4x64xi32> loc(#loc208)
+      %tmp63_37 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<4x64x!tt.ptr<f32>> loc(#loc209)
+      %tmp63_38 = tt.addptr %tmp63_37, %tmp63_36 : tensor<4x64x!tt.ptr<f32>>, tensor<4x64xi32> loc(#loc209)
+      %tmp63_39 = tt.load %tmp63_38, %tmp50_29, %cst_11 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<f32>> loc(#loc210)
+      %tmp66 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<4x64x!tt.ptr<f32>> loc(#loc211)
+      %tmp66_40 = tt.addptr %tmp66, %tmp63_36 : tensor<4x64x!tt.ptr<f32>>, tensor<4x64xi32> loc(#loc211)
+      %tmp66_41 = tt.load %tmp66_40, %tmp50_29, %cst_11 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<f32>> loc(#loc212)
+      %tmp96 = arith.addi %r0_index_20, %cst_9 : tensor<1x64xi32> loc(#loc213)
+      %tmp96_42 = tt.broadcast %tmp96 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc214)
+      %tmp96_43 = arith.addi %tmp96_42, %tmp50_22 : tensor<4x64xi32> loc(#loc214)
+      %tmp96_44 = arith.addi %tmp96_43, %tmp50_25 : tensor<4x64xi32> loc(#loc215)
+      %tmp96_45 = tt.addptr %tmp50_27, %tmp96_44 : tensor<4x64x!tt.ptr<bf16>>, tensor<4x64xi32> loc(#loc216)
+      %tmp96_46 = tt.load %tmp96_45, %tmp50_29, %cst_0 evictionPolicy = evict_first : tensor<4x64x!tt.ptr<bf16>> loc(#loc217)
+      %tmp96_47 = arith.extf %tmp96_46 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc218)
+      %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>> loc(#loc219)
+      %tmp102_48 = tt.addptr %tmp102, %r0_index_20 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc219)
+      %tmp102_49 = tt.load %tmp102_48, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x64x!tt.ptr<bf16>> loc(#loc220)
+      %tmp102_50 = arith.extf %tmp102_49 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc221)
+      %tmp16 = arith.extsi %r0_3 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc222)
+      %tmp16_51 = arith.cmpi slt, %tmp16, %cst_5 : tensor<1x64xi64> loc(#loc222)
+      %tmp17 = arith.muli %r0_4, %cst_6 : tensor<1x64xi32> loc(#loc223)
+      %tmp17_52 = arith.addi %tmp17, %cst_4 : tensor<1x64xi32> loc(#loc224)
+      %tmp17_53 = tt.broadcast %tmp17_52 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc225)
+      %tmp17_54 = arith.addi %tmp17_53, %tmp50_22 : tensor<4x64xi32> loc(#loc225)
+      %tmp17_55 = arith.addi %tmp17_54, %tmp50_25 : tensor<4x64xi32> loc(#loc226)
+      %tmp17_56 = tt.addptr %tmp50_27, %tmp17_55 : tensor<4x64x!tt.ptr<bf16>>, tensor<4x64xi32> loc(#loc227)
+      %tmp17_57 = arith.andi %r0_mask, %tmp16_51 : tensor<1x64xi1> loc(#loc228)
+      %tmp17_58 = tt.broadcast %tmp17_57 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc229)
+      %tmp17_59 = tt.load %tmp17_56, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>> loc(#loc229)
+      %tmp17_60 = arith.extf %tmp17_59 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc230)
+      %tmp20 = arith.divf %tmp10_19, %cst_3 : tensor<4x1xf32> loc(#loc231)
+      %tmp22 = arith.addf %tmp20, %cst_2 : tensor<4x1xf32> loc(#loc232)
+      %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc233)
+      %tmp24 = tt.broadcast %tmp23 : tensor<4x1xf32> -> tensor<4x64xf32> loc(#loc234)
+      %tmp24_61 = arith.mulf %tmp17_60, %tmp24 : tensor<4x64xf32> loc(#loc234)
+      %tmp25 = tt.addptr %tmp58, %tmp17_52 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc235)
+      %tmp25_62 = tt.broadcast %tmp25 : tensor<1x64x!tt.ptr<bf16>> -> tensor<4x64x!tt.ptr<bf16>> loc(#loc235)
+      %tmp25_63 = tt.load %tmp25_62, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>> loc(#loc236)
+      %tmp25_64 = arith.extf %tmp25_63 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc237)
+      %tmp27 = arith.mulf %tmp24_61, %tmp25_64 : tensor<4x64xf32> loc(#loc238)
+      %tmp29 = arith.subf %cst_11, %tmp27 : tensor<4x64xf32> loc(#loc239)
+      %tmp31 = tt.broadcast %tmp16_51 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc240)
+      %tmp31_65 = arith.select %tmp31, %tmp29, %cst_11 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc240)
+      %tmp32 = arith.cmpi sge, %tmp16, %cst_5 : tensor<1x64xi64> loc(#loc241)
+      %tmp35 = tt.broadcast %tmp17 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc242)
+      %tmp35_66 = arith.addi %tmp35, %tmp50_22 : tensor<4x64xi32> loc(#loc242)
+      %tmp35_67 = arith.addi %tmp35_66, %tmp50_25 : tensor<4x64xi32> loc(#loc243)
+      %tmp35_68 = tt.addptr %tmp50_27, %tmp35_67 : tensor<4x64x!tt.ptr<bf16>>, tensor<4x64xi32> loc(#loc244)
+      %tmp35_69 = arith.andi %r0_mask, %tmp32 : tensor<1x64xi1> loc(#loc245)
+      %tmp35_70 = tt.broadcast %tmp35_69 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc246)
+      %tmp35_71 = tt.load %tmp35_68, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>> loc(#loc246)
+      %tmp35_72 = arith.extf %tmp35_71 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc247)
+      %tmp42 = arith.mulf %tmp35_72, %tmp24 : tensor<4x64xf32> loc(#loc248)
+      %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc249)
+      %tmp43_73 = tt.broadcast %tmp43 : tensor<1x64x!tt.ptr<bf16>> -> tensor<4x64x!tt.ptr<bf16>> loc(#loc249)
+      %tmp43_74 = tt.load %tmp43_73, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>> loc(#loc250)
+      %tmp43_75 = arith.extf %tmp43_74 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc251)
+      %tmp45 = arith.mulf %tmp42, %tmp43_75 : tensor<4x64xf32> loc(#loc252)
+      %tmp48 = tt.broadcast %tmp32 : tensor<1x64xi1> -> tensor<4x64xi1> loc(#loc253)
+      %tmp48_76 = arith.select %tmp48, %tmp45, %cst_11 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc253)
+      %tmp49 = arith.select %tmp31, %tmp31_65, %tmp48_76 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc254)
+      %tmp57 = arith.mulf %tmp50_31, %tmp24 : tensor<4x64xf32> loc(#loc255)
+      %tmp60 = tt.broadcast %tmp58_34 : tensor<1x64xf32> -> tensor<4x64xf32> loc(#loc256)
+      %tmp60_77 = arith.mulf %tmp57, %tmp60 : tensor<4x64xf32> loc(#loc256)
+      %tmp64 = arith.mulf %tmp60_77, %tmp63_39 : tensor<4x64xf32> loc(#loc257)
+      %tmp67 = arith.mulf %tmp49, %tmp66_41 : tensor<4x64xf32> loc(#loc258)
+      %tmp68 = arith.addf %tmp64, %tmp67 : tensor<4x64xf32> loc(#loc259)
+      %tmp70 = arith.addi %tmp17, %cst_1 : tensor<1x64xi32> loc(#loc260)
+      %tmp70_78 = tt.broadcast %tmp70 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc261)
+      %tmp70_79 = arith.addi %tmp70_78, %tmp50_22 : tensor<4x64xi32> loc(#loc261)
+      %tmp70_80 = arith.addi %tmp70_79, %tmp50_25 : tensor<4x64xi32> loc(#loc262)
+      %tmp70_81 = tt.addptr %tmp50_27, %tmp70_80 : tensor<4x64x!tt.ptr<bf16>>, tensor<4x64xi32> loc(#loc263)
+      %tmp70_82 = tt.load %tmp70_81, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>> loc(#loc264)
+      %tmp70_83 = arith.extf %tmp70_82 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc265)
+      %tmp72 = arith.divf %tmp4_18, %cst_3 : tensor<4x1xf32> loc(#loc266)
+      %tmp73 = arith.addf %tmp72, %cst_2 : tensor<4x1xf32> loc(#loc267)
+      %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc268)
+      %tmp75 = tt.broadcast %tmp74 : tensor<4x1xf32> -> tensor<4x64xf32> loc(#loc269)
+      %tmp75_84 = arith.mulf %tmp70_83, %tmp75 : tensor<4x64xf32> loc(#loc269)
+      %tmp76 = tt.addptr %tmp102, %tmp17_52 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc270)
+      %tmp76_85 = tt.broadcast %tmp76 : tensor<1x64x!tt.ptr<bf16>> -> tensor<4x64x!tt.ptr<bf16>> loc(#loc270)
+      %tmp76_86 = tt.load %tmp76_85, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>> loc(#loc271)
+      %tmp76_87 = arith.extf %tmp76_86 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc272)
+      %tmp78 = arith.mulf %tmp75_84, %tmp76_87 : tensor<4x64xf32> loc(#loc273)
+      %tmp80 = arith.subf %cst_11, %tmp78 : tensor<4x64xf32> loc(#loc274)
+      %tmp82 = arith.select %tmp31, %tmp80, %cst_11 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc275)
+      %tmp83 = arith.addi %tmp17, %cst_9 : tensor<1x64xi32> loc(#loc276)
+      %tmp83_88 = tt.broadcast %tmp83 : tensor<1x64xi32> -> tensor<4x64xi32> loc(#loc277)
+      %tmp83_89 = arith.addi %tmp83_88, %tmp50_22 : tensor<4x64xi32> loc(#loc277)
+      %tmp83_90 = arith.addi %tmp83_89, %tmp50_25 : tensor<4x64xi32> loc(#loc278)
+      %tmp83_91 = tt.addptr %tmp50_27, %tmp83_90 : tensor<4x64x!tt.ptr<bf16>>, tensor<4x64xi32> loc(#loc279)
+      %tmp83_92 = tt.load %tmp83_91, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>> loc(#loc280)
+      %tmp83_93 = arith.extf %tmp83_92 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc281)
+      %tmp88 = arith.mulf %tmp83_93, %tmp75 : tensor<4x64xf32> loc(#loc282)
+      %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc283)
+      %tmp89_94 = tt.broadcast %tmp89 : tensor<1x64x!tt.ptr<bf16>> -> tensor<4x64x!tt.ptr<bf16>> loc(#loc283)
+      %tmp89_95 = tt.load %tmp89_94, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<4x64x!tt.ptr<bf16>> loc(#loc284)
+      %tmp89_96 = arith.extf %tmp89_95 : tensor<4x64xbf16> to tensor<4x64xf32> loc(#loc285)
+      %tmp91 = arith.mulf %tmp88, %tmp89_96 : tensor<4x64xf32> loc(#loc286)
+      %tmp94 = arith.select %tmp48, %tmp91, %cst_11 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc287)
+      %tmp95 = arith.select %tmp31, %tmp82, %tmp94 : tensor<4x64xi1>, tensor<4x64xf32> loc(#loc288)
+      %tmp101 = arith.mulf %tmp96_47, %tmp75 : tensor<4x64xf32> loc(#loc289)
+      %tmp104 = tt.broadcast %tmp102_50 : tensor<1x64xf32> -> tensor<4x64xf32> loc(#loc290)
+      %tmp104_97 = arith.mulf %tmp101, %tmp104 : tensor<4x64xf32> loc(#loc290)
+      %tmp107 = arith.mulf %tmp104_97, %tmp63_39 : tensor<4x64xf32> loc(#loc291)
+      %tmp109 = arith.mulf %tmp95, %tmp66_41 : tensor<4x64xf32> loc(#loc292)
+      %tmp110 = arith.addf %tmp107, %tmp109 : tensor<4x64xf32> loc(#loc293)
+      %0 = arith.muli %xindex_16, %cst_8 : tensor<4x1xi32> loc(#loc142)
+      %1 = tt.broadcast %0 : tensor<4x1xi32> -> tensor<4x64xi32> loc(#loc143)
+      %2 = arith.addi %tmp50_21, %1 : tensor<4x64xi32> loc(#loc143)
+      %3 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<4x64x!tt.ptr<bf16>> loc(#loc144)
+      %4 = tt.addptr %3, %2 : tensor<4x64x!tt.ptr<bf16>>, tensor<4x64xi32> loc(#loc144)
+      %5 = arith.truncf %tmp68 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc145)
+      tt.store %4, %5, %tmp50_29 : tensor<4x64x!tt.ptr<bf16>> loc(#loc145)
+      %6 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<4x64x!tt.ptr<bf16>> loc(#loc146)
+      %7 = tt.addptr %6, %2 : tensor<4x64x!tt.ptr<bf16>>, tensor<4x64xi32> loc(#loc146)
+      %8 = arith.truncf %tmp110 : tensor<4x64xf32> to tensor<4x64xbf16> loc(#loc147)
+      tt.store %7, %8, %tmp50_29 : tensor<4x64x!tt.ptr<bf16>> loc(#loc147)
+    } loc(#loc40)
+    tt.return loc(#loc148)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc158 = loc("xoffset"(#loc2))
+#loc159 = loc("xoffset"(#loc3))
+#loc160 = loc("xindex"(#loc4))
+#loc161 = loc("xindex"(#loc5))
+#loc162 = loc("xindex"(#loc6))
+#loc163 = loc("r0_base"(#loc7))
+#loc164 = loc("r0_base"(#loc8))
+#loc165 = loc("x0"(#loc9))
+#loc166 = loc("x1"(#loc10))
+#loc167 = loc("_tmp4"(#loc11))
+#loc168 = loc("r0_index"(#loc12))
+#loc169 = loc("r0_mask"(#loc13))
+#loc170 = loc("tmp0"(#loc14))
+#loc171 = loc("tmp0"(#loc15))
+#loc172 = loc("tmp0"(#loc16))
+#loc173 = loc("tmp0"(#loc17))
+#loc174 = loc("tmp0"(#loc18))
+#loc175 = loc("tmp0"(#loc19))
+#loc176 = loc("tmp0"(#loc20))
+#loc177 = loc("tmp0"(#loc21))
+#loc178 = loc("tmp6"(#loc22))
+#loc179 = loc("tmp6"(#loc23))
+#loc180 = loc("tmp6"(#loc24))
+#loc181 = loc("tmp6"(#loc25))
+#loc182 = loc("tmp6"(#loc26))
+#loc183 = loc("tmp2"(#loc27))
+#loc184 = loc("tmp5"(#loc28))
+#loc185 = loc("_tmp4"(#loc29))
+#loc186 = loc("tmp8"(#loc30))
+#loc187 = loc("tmp11"(#loc31))
+#loc188 = loc("_tmp10"(#loc32))
+#loc190 = loc("tmp4"(#loc37))
+#loc192 = loc("tmp10"(#loc39))
+#loc193 = loc("r0_index"(#loc41))
+#loc194 = loc("r0_mask"(#loc42))
+#loc195 = loc("r0_3"(#loc43))
+#loc196 = loc("r0_4"(#loc44))
+#loc197 = loc("tmp50"(#loc45))
+#loc198 = loc("tmp50"(#loc46))
+#loc199 = loc("tmp50"(#loc47))
+#loc200 = loc("tmp50"(#loc48))
+#loc201 = loc("tmp50"(#loc49))
+#loc202 = loc("tmp50"(#loc50))
+#loc203 = loc("tmp50"(#loc51))
+#loc204 = loc("tmp58"(#loc52))
+#loc205 = loc("tmp58"(#loc53))
+#loc206 = loc("tmp58"(#loc54))
+#loc207 = loc("tmp63"(#loc55))
+#loc208 = loc("tmp63"(#loc56))
+#loc209 = loc("tmp63"(#loc57))
+#loc210 = loc("tmp63"(#loc58))
+#loc211 = loc("tmp66"(#loc59))
+#loc212 = loc("tmp66"(#loc60))
+#loc213 = loc("tmp96"(#loc61))
+#loc214 = loc("tmp96"(#loc62))
+#loc215 = loc("tmp96"(#loc63))
+#loc216 = loc("tmp96"(#loc64))
+#loc217 = loc("tmp96"(#loc65))
+#loc218 = loc("tmp96"(#loc66))
+#loc219 = loc("tmp102"(#loc67))
+#loc220 = loc("tmp102"(#loc68))
+#loc221 = loc("tmp102"(#loc69))
+#loc222 = loc("tmp16"(#loc70))
+#loc223 = loc("tmp17"(#loc71))
+#loc224 = loc("tmp17"(#loc72))
+#loc225 = loc("tmp17"(#loc73))
+#loc226 = loc("tmp17"(#loc74))
+#loc227 = loc("tmp17"(#loc75))
+#loc228 = loc("tmp17"(#loc76))
+#loc229 = loc("tmp17"(#loc77))
+#loc230 = loc("tmp17"(#loc78))
+#loc231 = loc("tmp20"(#loc79))
+#loc232 = loc("tmp22"(#loc80))
+#loc233 = loc("tmp23"(#loc81))
+#loc234 = loc("tmp24"(#loc82))
+#loc235 = loc("tmp25"(#loc83))
+#loc236 = loc("tmp25"(#loc84))
+#loc237 = loc("tmp25"(#loc85))
+#loc238 = loc("tmp27"(#loc86))
+#loc239 = loc("tmp29"(#loc87))
+#loc240 = loc("tmp31"(#loc88))
+#loc241 = loc("tmp32"(#loc89))
+#loc242 = loc("tmp35"(#loc90))
+#loc243 = loc("tmp35"(#loc91))
+#loc244 = loc("tmp35"(#loc92))
+#loc245 = loc("tmp35"(#loc93))
+#loc246 = loc("tmp35"(#loc94))
+#loc247 = loc("tmp35"(#loc95))
+#loc248 = loc("tmp42"(#loc96))
+#loc249 = loc("tmp43"(#loc97))
+#loc250 = loc("tmp43"(#loc98))
+#loc251 = loc("tmp43"(#loc99))
+#loc252 = loc("tmp45"(#loc100))
+#loc253 = loc("tmp48"(#loc101))
+#loc254 = loc("tmp49"(#loc102))
+#loc255 = loc("tmp57"(#loc103))
+#loc256 = loc("tmp60"(#loc104))
+#loc257 = loc("tmp64"(#loc105))
+#loc258 = loc("tmp67"(#loc106))
+#loc259 = loc("tmp68"(#loc107))
+#loc260 = loc("tmp70"(#loc108))
+#loc261 = loc("tmp70"(#loc109))
+#loc262 = loc("tmp70"(#loc110))
+#loc263 = loc("tmp70"(#loc111))
+#loc264 = loc("tmp70"(#loc112))
+#loc265 = loc("tmp70"(#loc113))
+#loc266 = loc("tmp72"(#loc114))
+#loc267 = loc("tmp73"(#loc115))
+#loc268 = loc("tmp74"(#loc116))
+#loc269 = loc("tmp75"(#loc117))
+#loc270 = loc("tmp76"(#loc118))
+#loc271 = loc("tmp76"(#loc119))
+#loc272 = loc("tmp76"(#loc120))
+#loc273 = loc("tmp78"(#loc121))
+#loc274 = loc("tmp80"(#loc122))
+#loc275 = loc("tmp82"(#loc123))
+#loc276 = loc("tmp83"(#loc124))
+#loc277 = loc("tmp83"(#loc125))
+#loc278 = loc("tmp83"(#loc126))
+#loc279 = loc("tmp83"(#loc127))
+#loc280 = loc("tmp83"(#loc128))
+#loc281 = loc("tmp83"(#loc129))
+#loc282 = loc("tmp88"(#loc130))
+#loc283 = loc("tmp89"(#loc131))
+#loc284 = loc("tmp89"(#loc132))
+#loc285 = loc("tmp89"(#loc133))
+#loc286 = loc("tmp91"(#loc134))
+#loc287 = loc("tmp94"(#loc135))
+#loc288 = loc("tmp95"(#loc136))
+#loc289 = loc("tmp101"(#loc137))
+#loc290 = loc("tmp104"(#loc138))
+#loc291 = loc("tmp107"(#loc139))
+#loc292 = loc("tmp109"(#loc140))
+#loc293 = loc("tmp110"(#loc141))
+#loc294 = loc("_tmp10"(#loc167))
+#loc295 = loc(callsite(#loc34 at #loc189))
+#loc297 = loc(callsite(#loc34 at #loc191))
+#loc299 = loc(callsite(#loc36 at #loc295))
+#loc300 = loc(callsite(#loc36 at #loc297))
diff --git a/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..a420e8cd9e9ae76456d920f45dd4b423201c2a9b
--- /dev/null
+++ b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused__fused_rms_norm_cat_view_2.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.source", "triton_poi_fused__fused_rms_norm_cat_view_2.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttir", "triton_poi_fused__fused_rms_norm_cat_view_2.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir", "triton_poi_fused__fused_rms_norm_cat_view_2.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.llir", "triton_poi_fused__fused_rms_norm_cat_view_2.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ptx", "triton_poi_fused__fused_rms_norm_cat_view_2.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.cubin", "triton_poi_fused__fused_rms_norm_cat_view_2.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.json"}}
\ No newline at end of file
diff --git a/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.cubin b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..f293897e93aa63d1726bb0aee45921e1aa487f5e
Binary files /dev/null and b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.cubin differ
diff --git a/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.json b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..e7c8affebb3d3cd055f0e1d6e44bb48abe7d50b6
--- /dev/null
+++ b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.json
@@ -0,0 +1 @@
+{"hash": "0ef7a0ff688a6afafd44bc049e07d545eb9567d298d7625d0e59aada3a6fc149", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 2048, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__fused_rms_norm_cat_view_2"}
\ No newline at end of file
diff --git a/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.llir b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.llir
new file mode 100644
index 0000000000000000000000000000000000000000..071e1b38becc22671c480565256e1ffc7bf92768
--- /dev/null
+++ b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.llir
@@ -0,0 +1,795 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external addrspace(3) global [0 x i8], align 16
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused__fused_rms_norm_cat_view_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 {
+  %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !dbg !8
+  %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z(), !dbg !9
+  %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y(), !dbg !10
+  %15 = mul nuw i32 %13, %14, !dbg !11
+  %16 = add nuw i32 %15, %12, !dbg !12
+  %17 = shl i32 %16, 10, !dbg !13
+  %18 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !14
+  %19 = and i32 %18, 127, !dbg !14
+  %20 = shl nuw nsw i32 %19, 2, !dbg !14
+  %21 = or disjoint i32 %17, %20, !dbg !15
+  %22 = or disjoint i32 %21, 1, !dbg !15
+  %23 = or disjoint i32 %21, 2, !dbg !15
+  %24 = or disjoint i32 %21, 3, !dbg !15
+  %25 = or disjoint i32 %21, 512, !dbg !15
+  %26 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !16
+  %27 = icmp samesign ult i32 %26, 128, !dbg !17
+  %28 = sdiv i32 %21, 32, !dbg !18
+  %29 = sdiv i32 %25, 32, !dbg !18
+  %30 = mul i32 %28, 32, !dbg !19
+  %.decomposed = sub i32 %21, %30, !dbg !19
+  %31 = srem i32 %22, 32, !dbg !19
+  %32 = srem i32 %23, 32, !dbg !19
+  %33 = srem i32 %24, 32, !dbg !19
+  %34 = icmp slt i32 %21, 8192, !dbg !20
+  %35 = shl nsw i32 %.decomposed, 7, !dbg !21
+  %36 = shl nsw i32 %31, 7, !dbg !21
+  %37 = shl nsw i32 %32, 7, !dbg !21
+  %38 = shl nsw i32 %33, 7, !dbg !21
+  %39 = add i32 %35, %26, !dbg !22
+  %40 = add i32 %36, %26, !dbg !22
+  %41 = add i32 %37, %26, !dbg !22
+  %42 = add i32 %38, %26, !dbg !22
+  %43 = mul i32 %28, 12288, !dbg !23
+  %44 = mul i32 %29, 12288, !dbg !23
+  %45 = add i32 %39, %43, !dbg !24
+  %46 = add i32 %40, %43, !dbg !24
+  %47 = add i32 %41, %43, !dbg !24
+  %48 = add i32 %42, %43, !dbg !24
+  %49 = add i32 %39, %44, !dbg !24
+  %50 = add i32 %40, %44, !dbg !24
+  %51 = add i32 %41, %44, !dbg !24
+  %52 = add i32 %42, %44, !dbg !24
+  %53 = sext i32 %45 to i64, !dbg !25
+  %54 = getelementptr bfloat, ptr addrspace(1) %0, i64 %53, !dbg !25
+  %55 = sext i32 %46 to i64, !dbg !25
+  %56 = getelementptr bfloat, ptr addrspace(1) %0, i64 %55, !dbg !25
+  %57 = sext i32 %47 to i64, !dbg !25
+  %58 = getelementptr bfloat, ptr addrspace(1) %0, i64 %57, !dbg !25
+  %59 = sext i32 %48 to i64, !dbg !25
+  %60 = getelementptr bfloat, ptr addrspace(1) %0, i64 %59, !dbg !25
+  %61 = sext i32 %49 to i64, !dbg !25
+  %62 = getelementptr bfloat, ptr addrspace(1) %0, i64 %61, !dbg !25
+  %63 = sext i32 %50 to i64, !dbg !25
+  %64 = getelementptr bfloat, ptr addrspace(1) %0, i64 %63, !dbg !25
+  %65 = sext i32 %51 to i64, !dbg !25
+  %66 = getelementptr bfloat, ptr addrspace(1) %0, i64 %65, !dbg !25
+  %67 = sext i32 %52 to i64, !dbg !25
+  %68 = getelementptr bfloat, ptr addrspace(1) %0, i64 %67, !dbg !25
+  %69 = and i1 %27, %34, !dbg !26
+  %70 = icmp slt i32 %21, 7680, !dbg !27
+  %71 = and i1 %27, %70, !dbg !27
+  %72 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !28
+  %73 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %54, i64 %72, i1 %69) #6, !dbg !28
+  %74 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !28
+  %75 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %56, i64 %74, i1 %69) #6, !dbg !28
+  %76 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !28
+  %77 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %58, i64 %76, i1 %69) #6, !dbg !28
+  %78 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !28
+  %79 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %60, i64 %78, i1 %69) #6, !dbg !28
+  %80 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !28
+  %81 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %62, i64 %80, i1 %71) #6, !dbg !28
+  %82 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !28
+  %83 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %64, i64 %82, i1 %71) #6, !dbg !28
+  %84 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !28
+  %85 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %66, i64 %84, i1 %71) #6, !dbg !28
+  %86 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !28
+  %87 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %68, i64 %86, i1 %71) #6, !dbg !28
+  %88 = sext i32 %21 to i64, !dbg !29
+  %89 = getelementptr float, ptr addrspace(1) %1, i64 %88, !dbg !29
+  %90 = sext i32 %25 to i64, !dbg !29
+  %91 = getelementptr float, ptr addrspace(1) %1, i64 %90, !dbg !29
+  %92 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !30
+  %93 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %89, i64 %92, i1 %69) #6, !dbg !30
+  %94 = extractvalue { i32, i32, i32, i32 } %93, 0, !dbg !30
+  %95 = extractvalue { i32, i32, i32, i32 } %93, 1, !dbg !30
+  %96 = extractvalue { i32, i32, i32, i32 } %93, 2, !dbg !30
+  %97 = extractvalue { i32, i32, i32, i32 } %93, 3, !dbg !30
+  %98 = bitcast i32 %94 to float, !dbg !30
+  %99 = bitcast i32 %95 to float, !dbg !30
+  %100 = bitcast i32 %96 to float, !dbg !30
+  %101 = bitcast i32 %97 to float, !dbg !30
+  %102 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !30
+  %103 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %91, i64 %102, i1 %71) #6, !dbg !30
+  %104 = extractvalue { i32, i32, i32, i32 } %103, 0, !dbg !30
+  %105 = extractvalue { i32, i32, i32, i32 } %103, 1, !dbg !30
+  %106 = extractvalue { i32, i32, i32, i32 } %103, 2, !dbg !30
+  %107 = extractvalue { i32, i32, i32, i32 } %103, 3, !dbg !30
+  %108 = bitcast i32 %104 to float, !dbg !30
+  %109 = bitcast i32 %105 to float, !dbg !30
+  %110 = bitcast i32 %106 to float, !dbg !30
+  %111 = bitcast i32 %107 to float, !dbg !30
+  %112 = tail call float @llvm.nvvm.div.full(float %98, float 1.280000e+02), !dbg !31
+  %113 = tail call float @llvm.nvvm.div.full(float %99, float 1.280000e+02), !dbg !31
+  %114 = tail call float @llvm.nvvm.div.full(float %100, float 1.280000e+02), !dbg !31
+  %115 = tail call float @llvm.nvvm.div.full(float %101, float 1.280000e+02), !dbg !31
+  %116 = tail call float @llvm.nvvm.div.full(float %108, float 1.280000e+02), !dbg !31
+  %117 = tail call float @llvm.nvvm.div.full(float %109, float 1.280000e+02), !dbg !31
+  %118 = tail call float @llvm.nvvm.div.full(float %110, float 1.280000e+02), !dbg !31
+  %119 = tail call float @llvm.nvvm.div.full(float %111, float 1.280000e+02), !dbg !31
+  %120 = fadd float %112, 0x3EB0C6F7A0000000, !dbg !32
+  %121 = fadd float %113, 0x3EB0C6F7A0000000, !dbg !32
+  %122 = fadd float %114, 0x3EB0C6F7A0000000, !dbg !32
+  %123 = fadd float %115, 0x3EB0C6F7A0000000, !dbg !32
+  %124 = fadd float %116, 0x3EB0C6F7A0000000, !dbg !32
+  %125 = fadd float %117, 0x3EB0C6F7A0000000, !dbg !32
+  %126 = fadd float %118, 0x3EB0C6F7A0000000, !dbg !32
+  %127 = fadd float %119, 0x3EB0C6F7A0000000, !dbg !32
+  %128 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33
+  %.not.i = icmp eq i32 %128, 0, !dbg !33
+  br i1 %.not.i, label %131, label %129, !dbg !33
+
+129:                                              ; preds = %11
+  %130 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %120), !dbg !33
+  br label %__nv_rsqrtf.exit, !dbg !33
+
+131:                                              ; preds = %11
+  %132 = tail call float @llvm.nvvm.rsqrt.approx.f(float %120), !dbg !33
+  br label %__nv_rsqrtf.exit, !dbg !33
+
+__nv_rsqrtf.exit:                                 ; preds = %129, %131
+  %.0.i = phi float [ %130, %129 ], [ %132, %131 ], !dbg !33
+  %133 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33
+  %.not.i7 = icmp eq i32 %133, 0, !dbg !33
+  br i1 %.not.i7, label %136, label %134, !dbg !33
+
+134:                                              ; preds = %__nv_rsqrtf.exit
+  %135 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %121), !dbg !33
+  br label %__nv_rsqrtf.exit9, !dbg !33
+
+136:                                              ; preds = %__nv_rsqrtf.exit
+  %137 = tail call float @llvm.nvvm.rsqrt.approx.f(float %121), !dbg !33
+  br label %__nv_rsqrtf.exit9, !dbg !33
+
+__nv_rsqrtf.exit9:                                ; preds = %134, %136
+  %.0.i8 = phi float [ %135, %134 ], [ %137, %136 ], !dbg !33
+  %138 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33
+  %.not.i10 = icmp eq i32 %138, 0, !dbg !33
+  br i1 %.not.i10, label %141, label %139, !dbg !33
+
+139:                                              ; preds = %__nv_rsqrtf.exit9
+  %140 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %122), !dbg !33
+  br label %__nv_rsqrtf.exit12, !dbg !33
+
+141:                                              ; preds = %__nv_rsqrtf.exit9
+  %142 = tail call float @llvm.nvvm.rsqrt.approx.f(float %122), !dbg !33
+  br label %__nv_rsqrtf.exit12, !dbg !33
+
+__nv_rsqrtf.exit12:                               ; preds = %139, %141
+  %.0.i11 = phi float [ %140, %139 ], [ %142, %141 ], !dbg !33
+  %143 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33
+  %.not.i13 = icmp eq i32 %143, 0, !dbg !33
+  br i1 %.not.i13, label %146, label %144, !dbg !33
+
+144:                                              ; preds = %__nv_rsqrtf.exit12
+  %145 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %123), !dbg !33
+  br label %__nv_rsqrtf.exit15, !dbg !33
+
+146:                                              ; preds = %__nv_rsqrtf.exit12
+  %147 = tail call float @llvm.nvvm.rsqrt.approx.f(float %123), !dbg !33
+  br label %__nv_rsqrtf.exit15, !dbg !33
+
+__nv_rsqrtf.exit15:                               ; preds = %144, %146
+  %.0.i14 = phi float [ %145, %144 ], [ %147, %146 ], !dbg !33
+  %148 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33
+  %.not.i16 = icmp eq i32 %148, 0, !dbg !33
+  br i1 %.not.i16, label %151, label %149, !dbg !33
+
+149:                                              ; preds = %__nv_rsqrtf.exit15
+  %150 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %124), !dbg !33
+  br label %__nv_rsqrtf.exit18, !dbg !33
+
+151:                                              ; preds = %__nv_rsqrtf.exit15
+  %152 = tail call float @llvm.nvvm.rsqrt.approx.f(float %124), !dbg !33
+  br label %__nv_rsqrtf.exit18, !dbg !33
+
+__nv_rsqrtf.exit18:                               ; preds = %149, %151
+  %.0.i17 = phi float [ %150, %149 ], [ %152, %151 ], !dbg !33
+  %153 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33
+  %.not.i19 = icmp eq i32 %153, 0, !dbg !33
+  br i1 %.not.i19, label %156, label %154, !dbg !33
+
+154:                                              ; preds = %__nv_rsqrtf.exit18
+  %155 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %125), !dbg !33
+  br label %__nv_rsqrtf.exit21, !dbg !33
+
+156:                                              ; preds = %__nv_rsqrtf.exit18
+  %157 = tail call float @llvm.nvvm.rsqrt.approx.f(float %125), !dbg !33
+  br label %__nv_rsqrtf.exit21, !dbg !33
+
+__nv_rsqrtf.exit21:                               ; preds = %154, %156
+  %.0.i20 = phi float [ %155, %154 ], [ %157, %156 ], !dbg !33
+  %158 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33
+  %.not.i22 = icmp eq i32 %158, 0, !dbg !33
+  br i1 %.not.i22, label %161, label %159, !dbg !33
+
+159:                                              ; preds = %__nv_rsqrtf.exit21
+  %160 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %126), !dbg !33
+  br label %__nv_rsqrtf.exit24, !dbg !33
+
+161:                                              ; preds = %__nv_rsqrtf.exit21
+  %162 = tail call float @llvm.nvvm.rsqrt.approx.f(float %126), !dbg !33
+  br label %__nv_rsqrtf.exit24, !dbg !33
+
+__nv_rsqrtf.exit24:                               ; preds = %159, %161
+  %.0.i23 = phi float [ %160, %159 ], [ %162, %161 ], !dbg !33
+  %163 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !33
+  %.not.i25 = icmp eq i32 %163, 0, !dbg !33
+  br i1 %.not.i25, label %166, label %164, !dbg !33
+
+164:                                              ; preds = %__nv_rsqrtf.exit24
+  %165 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %127), !dbg !33
+  br label %__nv_rsqrtf.exit27, !dbg !33
+
+166:                                              ; preds = %__nv_rsqrtf.exit24
+  %167 = tail call float @llvm.nvvm.rsqrt.approx.f(float %127), !dbg !33
+  br label %__nv_rsqrtf.exit27, !dbg !33
+
+__nv_rsqrtf.exit27:                               ; preds = %164, %166
+  %.0.i26 = phi float [ %165, %164 ], [ %167, %166 ], !dbg !33
+  %168 = zext nneg i32 %26 to i64, !dbg !34
+  %169 = getelementptr bfloat, ptr addrspace(1) %2, i64 %168, !dbg !34
+  %170 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !35
+  %171 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %169, i64 %170, i1 %69) #6, !dbg !35
+  %172 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !35
+  %173 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %169, i64 %172, i1 %69) #6, !dbg !35
+  %174 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !35
+  %175 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %169, i64 %174, i1 %69) #6, !dbg !35
+  %176 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !35
+  %177 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %169, i64 %176, i1 %69) #6, !dbg !35
+  %178 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !35
+  %179 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %169, i64 %178, i1 %71) #6, !dbg !35
+  %180 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !35
+  %181 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %169, i64 %180, i1 %71) #6, !dbg !35
+  %182 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !35
+  %183 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %169, i64 %182, i1 %71) #6, !dbg !35
+  %184 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !35
+  %185 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %169, i64 %184, i1 %71) #6, !dbg !35
+  %186 = add nsw i32 %28, -256, !dbg !36
+  %187 = add nsw i32 %29, -256, !dbg !36
+  %188 = mul i32 %186, 12288, !dbg !37
+  %189 = mul i32 %187, 12288, !dbg !37
+  %190 = add i32 %39, %188, !dbg !38
+  %191 = add i32 %40, %188, !dbg !38
+  %192 = add i32 %41, %188, !dbg !38
+  %193 = add i32 %42, %188, !dbg !38
+  %194 = add i32 %39, %189, !dbg !38
+  %195 = add i32 %40, %189, !dbg !38
+  %196 = add i32 %41, %189, !dbg !38
+  %197 = add i32 %42, %189, !dbg !38
+  %198 = sext i32 %190 to i64, !dbg !39
+  %199 = getelementptr bfloat, ptr addrspace(1) %3, i64 %198, !dbg !39
+  %200 = sext i32 %191 to i64, !dbg !39
+  %201 = getelementptr bfloat, ptr addrspace(1) %3, i64 %200, !dbg !39
+  %202 = sext i32 %192 to i64, !dbg !39
+  %203 = getelementptr bfloat, ptr addrspace(1) %3, i64 %202, !dbg !39
+  %204 = sext i32 %193 to i64, !dbg !39
+  %205 = getelementptr bfloat, ptr addrspace(1) %3, i64 %204, !dbg !39
+  %206 = sext i32 %194 to i64, !dbg !39
+  %207 = getelementptr bfloat, ptr addrspace(1) %3, i64 %206, !dbg !39
+  %208 = sext i32 %195 to i64, !dbg !39
+  %209 = getelementptr bfloat, ptr addrspace(1) %3, i64 %208, !dbg !39
+  %210 = sext i32 %196 to i64, !dbg !39
+  %211 = getelementptr bfloat, ptr addrspace(1) %3, i64 %210, !dbg !39
+  %212 = sext i32 %197 to i64, !dbg !39
+  %213 = getelementptr bfloat, ptr addrspace(1) %3, i64 %212, !dbg !39
+  %214 = add i32 %17, -8192, !dbg !40
+  %215 = icmp ult i32 %214, 65536, !dbg !40
+  %216 = and i1 %27, %215, !dbg !40
+  %217 = add i32 %17, -7680, !dbg !40
+  %218 = icmp ult i32 %217, 66048, !dbg !40
+  %219 = and i1 %27, %218, !dbg !40
+  %220 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !41
+  %221 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %199, i64 %220, i1 %216) #6, !dbg !41
+  %222 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !41
+  %223 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %201, i64 %222, i1 %216) #6, !dbg !41
+  %224 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !41
+  %225 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %203, i64 %224, i1 %216) #6, !dbg !41
+  %226 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !41
+  %227 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %205, i64 %226, i1 %216) #6, !dbg !41
+  %228 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !41
+  %229 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %207, i64 %228, i1 %219) #6, !dbg !41
+  %230 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !41
+  %231 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %209, i64 %230, i1 %219) #6, !dbg !41
+  %232 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !41
+  %233 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %211, i64 %232, i1 %219) #6, !dbg !41
+  %234 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !41
+  %235 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %213, i64 %234, i1 %219) #6, !dbg !41
+  %236 = shl i32 %186, 5, !dbg !42
+  %237 = shl i32 %187, 5, !dbg !42
+  %238 = add i32 %236, %.decomposed, !dbg !43
+  %239 = add i32 %237, %.decomposed, !dbg !43
+  %240 = sext i32 %238 to i64, !dbg !44
+  %241 = getelementptr float, ptr addrspace(1) %4, i64 %240, !dbg !44
+  %242 = sext i32 %239 to i64, !dbg !44
+  %243 = getelementptr float, ptr addrspace(1) %4, i64 %242, !dbg !44
+  %244 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !45
+  %245 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %241, i64 %244, i1 %216) #6, !dbg !45
+  %246 = extractvalue { i32, i32, i32, i32 } %245, 0, !dbg !45
+  %247 = extractvalue { i32, i32, i32, i32 } %245, 1, !dbg !45
+  %248 = extractvalue { i32, i32, i32, i32 } %245, 2, !dbg !45
+  %249 = extractvalue { i32, i32, i32, i32 } %245, 3, !dbg !45
+  %250 = bitcast i32 %246 to float, !dbg !45
+  %251 = bitcast i32 %247 to float, !dbg !45
+  %252 = bitcast i32 %248 to float, !dbg !45
+  %253 = bitcast i32 %249 to float, !dbg !45
+  %254 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !45
+  %255 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %243, i64 %254, i1 %219) #6, !dbg !45
+  %256 = extractvalue { i32, i32, i32, i32 } %255, 0, !dbg !45
+  %257 = extractvalue { i32, i32, i32, i32 } %255, 1, !dbg !45
+  %258 = extractvalue { i32, i32, i32, i32 } %255, 2, !dbg !45
+  %259 = extractvalue { i32, i32, i32, i32 } %255, 3, !dbg !45
+  %260 = bitcast i32 %256 to float, !dbg !45
+  %261 = bitcast i32 %257 to float, !dbg !45
+  %262 = bitcast i32 %258 to float, !dbg !45
+  %263 = bitcast i32 %259 to float, !dbg !45
+  %264 = tail call float @llvm.nvvm.div.full(float %250, float 1.280000e+02), !dbg !46
+  %265 = tail call float @llvm.nvvm.div.full(float %251, float 1.280000e+02), !dbg !46
+  %266 = tail call float @llvm.nvvm.div.full(float %252, float 1.280000e+02), !dbg !46
+  %267 = tail call float @llvm.nvvm.div.full(float %253, float 1.280000e+02), !dbg !46
+  %268 = tail call float @llvm.nvvm.div.full(float %260, float 1.280000e+02), !dbg !46
+  %269 = tail call float @llvm.nvvm.div.full(float %261, float 1.280000e+02), !dbg !46
+  %270 = tail call float @llvm.nvvm.div.full(float %262, float 1.280000e+02), !dbg !46
+  %271 = tail call float @llvm.nvvm.div.full(float %263, float 1.280000e+02), !dbg !46
+  %272 = fadd float %264, 0x3EB0C6F7A0000000, !dbg !47
+  %273 = fadd float %265, 0x3EB0C6F7A0000000, !dbg !47
+  %274 = fadd float %266, 0x3EB0C6F7A0000000, !dbg !47
+  %275 = fadd float %267, 0x3EB0C6F7A0000000, !dbg !47
+  %276 = fadd float %268, 0x3EB0C6F7A0000000, !dbg !47
+  %277 = fadd float %269, 0x3EB0C6F7A0000000, !dbg !47
+  %278 = fadd float %270, 0x3EB0C6F7A0000000, !dbg !47
+  %279 = fadd float %271, 0x3EB0C6F7A0000000, !dbg !47
+  %280 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !48
+  %.not.i28 = icmp eq i32 %280, 0, !dbg !48
+  br i1 %.not.i28, label %283, label %281, !dbg !48
+
+281:                                              ; preds = %__nv_rsqrtf.exit27
+  %282 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %272), !dbg !48
+  br label %__nv_rsqrtf.exit30, !dbg !48
+
+283:                                              ; preds = %__nv_rsqrtf.exit27
+  %284 = tail call float @llvm.nvvm.rsqrt.approx.f(float %272), !dbg !48
+  br label %__nv_rsqrtf.exit30, !dbg !48
+
+__nv_rsqrtf.exit30:                               ; preds = %281, %283
+  %.0.i29 = phi float [ %282, %281 ], [ %284, %283 ], !dbg !48
+  %285 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !48
+  %.not.i31 = icmp eq i32 %285, 0, !dbg !48
+  br i1 %.not.i31, label %288, label %286, !dbg !48
+
+286:                                              ; preds = %__nv_rsqrtf.exit30
+  %287 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %273), !dbg !48
+  br label %__nv_rsqrtf.exit33, !dbg !48
+
+288:                                              ; preds = %__nv_rsqrtf.exit30
+  %289 = tail call float @llvm.nvvm.rsqrt.approx.f(float %273), !dbg !48
+  br label %__nv_rsqrtf.exit33, !dbg !48
+
+__nv_rsqrtf.exit33:                               ; preds = %286, %288
+  %.0.i32 = phi float [ %287, %286 ], [ %289, %288 ], !dbg !48
+  %290 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !48
+  %.not.i34 = icmp eq i32 %290, 0, !dbg !48
+  br i1 %.not.i34, label %293, label %291, !dbg !48
+
+291:                                              ; preds = %__nv_rsqrtf.exit33
+  %292 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %274), !dbg !48
+  br label %__nv_rsqrtf.exit36, !dbg !48
+
+293:                                              ; preds = %__nv_rsqrtf.exit33
+  %294 = tail call float @llvm.nvvm.rsqrt.approx.f(float %274), !dbg !48
+  br label %__nv_rsqrtf.exit36, !dbg !48
+
+__nv_rsqrtf.exit36:                               ; preds = %291, %293
+  %.0.i35 = phi float [ %292, %291 ], [ %294, %293 ], !dbg !48
+  %295 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !48
+  %.not.i37 = icmp eq i32 %295, 0, !dbg !48
+  br i1 %.not.i37, label %298, label %296, !dbg !48
+
+296:                                              ; preds = %__nv_rsqrtf.exit36
+  %297 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %275), !dbg !48
+  br label %__nv_rsqrtf.exit39, !dbg !48
+
+298:                                              ; preds = %__nv_rsqrtf.exit36
+  %299 = tail call float @llvm.nvvm.rsqrt.approx.f(float %275), !dbg !48
+  br label %__nv_rsqrtf.exit39, !dbg !48
+
+__nv_rsqrtf.exit39:                               ; preds = %296, %298
+  %.0.i38 = phi float [ %297, %296 ], [ %299, %298 ], !dbg !48
+  %300 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !48
+  %.not.i40 = icmp eq i32 %300, 0, !dbg !48
+  br i1 %.not.i40, label %303, label %301, !dbg !48
+
+301:                                              ; preds = %__nv_rsqrtf.exit39
+  %302 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %276), !dbg !48
+  br label %__nv_rsqrtf.exit42, !dbg !48
+
+303:                                              ; preds = %__nv_rsqrtf.exit39
+  %304 = tail call float @llvm.nvvm.rsqrt.approx.f(float %276), !dbg !48
+  br label %__nv_rsqrtf.exit42, !dbg !48
+
+__nv_rsqrtf.exit42:                               ; preds = %301, %303
+  %.0.i41 = phi float [ %302, %301 ], [ %304, %303 ], !dbg !48
+  %305 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !48
+  %.not.i43 = icmp eq i32 %305, 0, !dbg !48
+  br i1 %.not.i43, label %308, label %306, !dbg !48
+
+306:                                              ; preds = %__nv_rsqrtf.exit42
+  %307 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %277), !dbg !48
+  br label %__nv_rsqrtf.exit45, !dbg !48
+
+308:                                              ; preds = %__nv_rsqrtf.exit42
+  %309 = tail call float @llvm.nvvm.rsqrt.approx.f(float %277), !dbg !48
+  br label %__nv_rsqrtf.exit45, !dbg !48
+
+__nv_rsqrtf.exit45:                               ; preds = %306, %308
+  %.0.i44 = phi float [ %307, %306 ], [ %309, %308 ], !dbg !48
+  %310 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !48
+  %.not.i46 = icmp eq i32 %310, 0, !dbg !48
+  br i1 %.not.i46, label %313, label %311, !dbg !48
+
+311:                                              ; preds = %__nv_rsqrtf.exit45
+  %312 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %278), !dbg !48
+  br label %__nv_rsqrtf.exit48, !dbg !48
+
+313:                                              ; preds = %__nv_rsqrtf.exit45
+  %314 = tail call float @llvm.nvvm.rsqrt.approx.f(float %278), !dbg !48
+  br label %__nv_rsqrtf.exit48, !dbg !48
+
+__nv_rsqrtf.exit48:                               ; preds = %311, %313
+  %.0.i47 = phi float [ %312, %311 ], [ %314, %313 ], !dbg !48
+  %315 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !48
+  %.not.i49 = icmp eq i32 %315, 0, !dbg !48
+  br i1 %.not.i49, label %318, label %316, !dbg !48
+
+316:                                              ; preds = %__nv_rsqrtf.exit48
+  %317 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %279), !dbg !48
+  br label %__nv_rsqrtf.exit51, !dbg !48
+
+318:                                              ; preds = %__nv_rsqrtf.exit48
+  %319 = tail call float @llvm.nvvm.rsqrt.approx.f(float %279), !dbg !48
+  br label %__nv_rsqrtf.exit51, !dbg !48
+
+__nv_rsqrtf.exit51:                               ; preds = %316, %318
+  %.0.i50 = phi float [ %317, %316 ], [ %319, %318 ], !dbg !48
+  %320 = icmp slt i32 %25, 8192, !dbg !20
+  %321 = insertelement <2 x i16> poison, i16 %227, i64 0, !dbg !41
+  %322 = insertelement <2 x i16> %321, i16 %235, i64 1, !dbg !41
+  %323 = bitcast <2 x i16> %322 to <2 x bfloat>, !dbg !41
+  %324 = insertelement <2 x i16> poison, i16 %225, i64 0, !dbg !41
+  %325 = insertelement <2 x i16> %324, i16 %233, i64 1, !dbg !41
+  %326 = bitcast <2 x i16> %325 to <2 x bfloat>, !dbg !41
+  %327 = insertelement <2 x i16> poison, i16 %223, i64 0, !dbg !41
+  %328 = insertelement <2 x i16> %327, i16 %231, i64 1, !dbg !41
+  %329 = bitcast <2 x i16> %328 to <2 x bfloat>, !dbg !41
+  %330 = insertelement <2 x i16> poison, i16 %221, i64 0, !dbg !41
+  %331 = insertelement <2 x i16> %330, i16 %229, i64 1, !dbg !41
+  %332 = bitcast <2 x i16> %331 to <2 x bfloat>, !dbg !41
+  %333 = insertelement <2 x i16> poison, i16 %79, i64 0, !dbg !28
+  %334 = insertelement <2 x i16> %333, i16 %87, i64 1, !dbg !28
+  %335 = bitcast <2 x i16> %334 to <2 x bfloat>, !dbg !28
+  %336 = insertelement <2 x i16> poison, i16 %177, i64 0, !dbg !35
+  %337 = insertelement <2 x i16> %336, i16 %185, i64 1, !dbg !35
+  %338 = bitcast <2 x i16> %337 to <2 x bfloat>, !dbg !35
+  %339 = insertelement <2 x i16> poison, i16 %77, i64 0, !dbg !28
+  %340 = insertelement <2 x i16> %339, i16 %85, i64 1, !dbg !28
+  %341 = bitcast <2 x i16> %340 to <2 x bfloat>, !dbg !28
+  %342 = insertelement <2 x i16> poison, i16 %175, i64 0, !dbg !35
+  %343 = insertelement <2 x i16> %342, i16 %183, i64 1, !dbg !35
+  %344 = bitcast <2 x i16> %343 to <2 x bfloat>, !dbg !35
+  %345 = insertelement <2 x i16> poison, i16 %75, i64 0, !dbg !28
+  %346 = insertelement <2 x i16> %345, i16 %83, i64 1, !dbg !28
+  %347 = bitcast <2 x i16> %346 to <2 x bfloat>, !dbg !28
+  %348 = insertelement <2 x i16> poison, i16 %173, i64 0, !dbg !35
+  %349 = insertelement <2 x i16> %348, i16 %181, i64 1, !dbg !35
+  %350 = bitcast <2 x i16> %349 to <2 x bfloat>, !dbg !35
+  %351 = insertelement <2 x i16> poison, i16 %73, i64 0, !dbg !28
+  %352 = insertelement <2 x i16> %351, i16 %81, i64 1, !dbg !28
+  %353 = bitcast <2 x i16> %352 to <2 x bfloat>, !dbg !28
+  %354 = insertelement <2 x i16> poison, i16 %171, i64 0, !dbg !35
+  %355 = insertelement <2 x i16> %354, i16 %179, i64 1, !dbg !35
+  %356 = bitcast <2 x i16> %355 to <2 x bfloat>, !dbg !35
+  %357 = or disjoint i32 %17, %19, !dbg !15
+  %358 = icmp slt i32 %357, 73728, !dbg !49
+  %359 = or i32 %18, 896, !dbg !14
+  %360 = or disjoint i32 %17, %359, !dbg !15
+  %361 = or disjoint i32 %19, 768, !dbg !14
+  %362 = or disjoint i32 %17, %361, !dbg !15
+  %363 = or disjoint i32 %19, 640, !dbg !14
+  %364 = or disjoint i32 %17, %363, !dbg !15
+  %365 = or disjoint i32 %19, 512, !dbg !14
+  %366 = or disjoint i32 %17, %365, !dbg !15
+  %367 = or disjoint i32 %19, 384, !dbg !14
+  %368 = or disjoint i32 %17, %367, !dbg !15
+  %369 = or disjoint i32 %19, 256, !dbg !14
+  %370 = or disjoint i32 %17, %369, !dbg !15
+  %371 = or disjoint i32 %19, 128, !dbg !14
+  %372 = or disjoint i32 %17, %371, !dbg !15
+  %373 = getelementptr bfloat, ptr addrspace(1) %5, i64 %168, !dbg !50
+  %374 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !51
+  %375 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %373, i64 %374, i1 %216) #6, !dbg !51
+  %376 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !51
+  %377 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %373, i64 %376, i1 %216) #6, !dbg !51
+  %378 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !51
+  %379 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %373, i64 %378, i1 %216) #6, !dbg !51
+  %380 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !51
+  %381 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %373, i64 %380, i1 %216) #6, !dbg !51
+  %382 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !51
+  %383 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %373, i64 %382, i1 %219) #6, !dbg !51
+  %384 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !51
+  %385 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %373, i64 %384, i1 %219) #6, !dbg !51
+  %386 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !51
+  %387 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %373, i64 %386, i1 %219) #6, !dbg !51
+  %388 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !51
+  %389 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %373, i64 %388, i1 %219) #6, !dbg !51
+  %390 = shl i32 %357, 7, !dbg !52
+  %391 = shl i32 %372, 7, !dbg !52
+  %392 = shl i32 %370, 7, !dbg !52
+  %393 = shl i32 %368, 7, !dbg !52
+  %394 = shl i32 %366, 7, !dbg !52
+  %395 = shl i32 %364, 7, !dbg !52
+  %396 = shl i32 %362, 7, !dbg !52
+  %397 = shl i32 %360, 7, !dbg !52
+  %398 = add i32 %390, %26, !dbg !53
+  %399 = add i32 %391, %26, !dbg !53
+  %400 = add i32 %392, %26, !dbg !53
+  %401 = add i32 %393, %26, !dbg !53
+  %402 = add i32 %394, %26, !dbg !53
+  %403 = add i32 %395, %26, !dbg !53
+  %404 = add i32 %396, %26, !dbg !53
+  %405 = add i32 %397, %26, !dbg !53
+  %406 = sext i32 %398 to i64, !dbg !54
+  %407 = getelementptr bfloat, ptr addrspace(1) %6, i64 %406, !dbg !54
+  %408 = sext i32 %399 to i64, !dbg !54
+  %409 = getelementptr bfloat, ptr addrspace(1) %6, i64 %408, !dbg !54
+  %410 = sext i32 %400 to i64, !dbg !54
+  %411 = getelementptr bfloat, ptr addrspace(1) %6, i64 %410, !dbg !54
+  %412 = sext i32 %401 to i64, !dbg !54
+  %413 = getelementptr bfloat, ptr addrspace(1) %6, i64 %412, !dbg !54
+  %414 = sext i32 %402 to i64, !dbg !54
+  %415 = getelementptr bfloat, ptr addrspace(1) %6, i64 %414, !dbg !54
+  %416 = sext i32 %403 to i64, !dbg !54
+  %417 = getelementptr bfloat, ptr addrspace(1) %6, i64 %416, !dbg !54
+  %418 = sext i32 %404 to i64, !dbg !54
+  %419 = getelementptr bfloat, ptr addrspace(1) %6, i64 %418, !dbg !54
+  %420 = sext i32 %405 to i64, !dbg !54
+  %421 = getelementptr bfloat, ptr addrspace(1) %6, i64 %420, !dbg !54
+  %422 = and i1 %27, %358, !dbg !55
+  %423 = fpext <2 x bfloat> %332 to <2 x float>, !dbg !56
+  %424 = fpext <2 x bfloat> %353 to <2 x float>, !dbg !57
+  %425 = insertelement <2 x float> poison, float %.0.i, i64 0, !dbg !58
+  %426 = insertelement <2 x float> %425, float %.0.i17, i64 1, !dbg !58
+  %427 = fmul <2 x float> %426, %424, !dbg !58
+  %428 = fpext <2 x bfloat> %356 to <2 x float>, !dbg !59
+  %429 = fmul <2 x float> %427, %428, !dbg !60
+  %430 = insertelement <2 x float> poison, float %.0.i29, i64 0, !dbg !61
+  %431 = insertelement <2 x float> %430, float %.0.i41, i64 1, !dbg !61
+  %432 = fmul <2 x float> %431, %423, !dbg !61
+  %433 = insertelement <2 x i16> poison, i16 %375, i64 0, !dbg !51
+  %434 = insertelement <2 x i16> %433, i16 %383, i64 1, !dbg !51
+  %435 = bitcast <2 x i16> %434 to <2 x bfloat>, !dbg !51
+  %436 = fpext <2 x bfloat> %435 to <2 x float>, !dbg !62
+  %437 = fmul <2 x float> %432, %436, !dbg !63
+  %438 = insertelement <2 x i1> poison, i1 %34, i64 0, !dbg !64
+  %439 = insertelement <2 x i1> %438, i1 %320, i64 1, !dbg !64
+  %440 = select <2 x i1> %439, <2 x float> %429, <2 x float> %437, !dbg !64
+  %441 = fptrunc <2 x float> %440 to <2 x bfloat>, !dbg !65
+  %442 = fpext <2 x bfloat> %329 to <2 x float>, !dbg !56
+  %443 = fpext <2 x bfloat> %347 to <2 x float>, !dbg !57
+  %444 = insertelement <2 x float> poison, float %.0.i8, i64 0, !dbg !58
+  %445 = insertelement <2 x float> %444, float %.0.i20, i64 1, !dbg !58
+  %446 = fmul <2 x float> %445, %443, !dbg !58
+  %447 = fpext <2 x bfloat> %350 to <2 x float>, !dbg !59
+  %448 = fmul <2 x float> %446, %447, !dbg !60
+  %449 = insertelement <2 x float> poison, float %.0.i32, i64 0, !dbg !61
+  %450 = insertelement <2 x float> %449, float %.0.i44, i64 1, !dbg !61
+  %451 = fmul <2 x float> %450, %442, !dbg !61
+  %452 = insertelement <2 x i16> poison, i16 %377, i64 0, !dbg !51
+  %453 = insertelement <2 x i16> %452, i16 %385, i64 1, !dbg !51
+  %454 = bitcast <2 x i16> %453 to <2 x bfloat>, !dbg !51
+  %455 = fpext <2 x bfloat> %454 to <2 x float>, !dbg !62
+  %456 = fmul <2 x float> %451, %455, !dbg !63
+  %457 = select <2 x i1> %439, <2 x float> %448, <2 x float> %456, !dbg !64
+  %458 = fptrunc <2 x float> %457 to <2 x bfloat>, !dbg !65
+  %459 = fpext <2 x bfloat> %326 to <2 x float>, !dbg !56
+  %460 = fpext <2 x bfloat> %341 to <2 x float>, !dbg !57
+  %461 = insertelement <2 x float> poison, float %.0.i11, i64 0, !dbg !58
+  %462 = insertelement <2 x float> %461, float %.0.i23, i64 1, !dbg !58
+  %463 = fmul <2 x float> %462, %460, !dbg !58
+  %464 = fpext <2 x bfloat> %344 to <2 x float>, !dbg !59
+  %465 = fmul <2 x float> %463, %464, !dbg !60
+  %466 = insertelement <2 x float> poison, float %.0.i35, i64 0, !dbg !61
+  %467 = insertelement <2 x float> %466, float %.0.i47, i64 1, !dbg !61
+  %468 = fmul <2 x float> %467, %459, !dbg !61
+  %469 = insertelement <2 x i16> poison, i16 %379, i64 0, !dbg !51
+  %470 = insertelement <2 x i16> %469, i16 %387, i64 1, !dbg !51
+  %471 = bitcast <2 x i16> %470 to <2 x bfloat>, !dbg !51
+  %472 = fpext <2 x bfloat> %471 to <2 x float>, !dbg !62
+  %473 = fmul <2 x float> %468, %472, !dbg !63
+  %474 = select <2 x i1> %439, <2 x float> %465, <2 x float> %473, !dbg !64
+  %475 = fptrunc <2 x float> %474 to <2 x bfloat>, !dbg !65
+  %476 = fpext <2 x bfloat> %323 to <2 x float>, !dbg !56
+  %477 = fpext <2 x bfloat> %335 to <2 x float>, !dbg !57
+  %478 = insertelement <2 x float> poison, float %.0.i14, i64 0, !dbg !58
+  %479 = insertelement <2 x float> %478, float %.0.i26, i64 1, !dbg !58
+  %480 = fmul <2 x float> %479, %477, !dbg !58
+  %481 = fpext <2 x bfloat> %338 to <2 x float>, !dbg !59
+  %482 = fmul <2 x float> %480, %481, !dbg !60
+  %483 = insertelement <2 x float> poison, float %.0.i38, i64 0, !dbg !61
+  %484 = insertelement <2 x float> %483, float %.0.i50, i64 1, !dbg !61
+  %485 = fmul <2 x float> %484, %476, !dbg !61
+  %486 = insertelement <2 x i16> poison, i16 %381, i64 0, !dbg !51
+  %487 = insertelement <2 x i16> %486, i16 %389, i64 1, !dbg !51
+  %488 = bitcast <2 x i16> %487 to <2 x bfloat>, !dbg !51
+  %489 = fpext <2 x bfloat> %488 to <2 x float>, !dbg !62
+  %490 = fmul <2 x float> %485, %489, !dbg !63
+  %491 = select <2 x i1> %439, <2 x float> %482, <2 x float> %490, !dbg !64
+  %492 = fptrunc <2 x float> %491 to <2 x bfloat>, !dbg !65
+  %493 = shl nuw nsw i32 %19, 4, !dbg !65
+  %494 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %493, !dbg !65
+  %495 = bitcast <2 x bfloat> %441 to i32, !dbg !65
+  %496 = bitcast <2 x bfloat> %458 to i32, !dbg !65
+  %497 = bitcast <2 x bfloat> %475 to i32, !dbg !65
+  %498 = bitcast <2 x bfloat> %492 to i32, !dbg !65
+  %499 = insertelement <4 x i32> poison, i32 %495, i64 0, !dbg !65
+  %500 = insertelement <4 x i32> %499, i32 %496, i64 1, !dbg !65
+  %501 = insertelement <4 x i32> %500, i32 %497, i64 2, !dbg !65
+  %502 = insertelement <4 x i32> %501, i32 %498, i64 3, !dbg !65
+  store <4 x i32> %502, ptr addrspace(3) %494, align 16, !dbg !65
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !65
+  %503 = shl nuw nsw i32 %18, 6, !dbg !65
+  %504 = and i32 %503, 1536, !dbg !65
+  %505 = shl nuw nsw i32 %18, 4, !dbg !65
+  %506 = and i32 %505, 112, !dbg !65
+  %507 = shl nuw nsw i32 %18, 2, !dbg !65
+  %508 = and i32 %507, 384, !dbg !65
+  %509 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %504, !dbg !65
+  %510 = getelementptr inbounds nuw i8, ptr addrspace(3) %509, i32 %506, !dbg !65
+  %511 = getelementptr inbounds nuw i8, ptr addrspace(3) %510, i32 %508, !dbg !65
+  %512 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) %511), !dbg !65
+  %513 = extractvalue { i32, i32, i32, i32 } %512, 0, !dbg !65
+  %extelt.offset = lshr i32 %513, 16, !dbg !65
+  %514 = trunc nuw i32 %extelt.offset to i16, !dbg !65
+  %515 = extractvalue { i32, i32, i32, i32 } %512, 1, !dbg !65
+  %extelt.offset2 = lshr i32 %515, 16, !dbg !65
+  %516 = trunc nuw i32 %extelt.offset2 to i16, !dbg !65
+  %517 = extractvalue { i32, i32, i32, i32 } %512, 2, !dbg !65
+  %extelt.offset4 = lshr i32 %517, 16, !dbg !65
+  %518 = trunc nuw i32 %extelt.offset4 to i16, !dbg !65
+  %519 = extractvalue { i32, i32, i32, i32 } %512, 3, !dbg !65
+  %extelt.offset6 = lshr i32 %519, 16, !dbg !65
+  %520 = trunc nuw i32 %extelt.offset6 to i16, !dbg !65
+  %.extract = trunc i32 %513 to i16, !dbg !65
+  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %.extract, ptr addrspace(1) %407, i1 %422) #6, !dbg !65
+  %.extract1 = trunc i32 %515 to i16, !dbg !65
+  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %.extract1, ptr addrspace(1) %409, i1 %422) #6, !dbg !65
+  %.extract3 = trunc i32 %517 to i16, !dbg !65
+  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %.extract3, ptr addrspace(1) %411, i1 %422) #6, !dbg !65
+  %.extract5 = trunc i32 %519 to i16, !dbg !65
+  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %.extract5, ptr addrspace(1) %413, i1 %422) #6, !dbg !65
+  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %514, ptr addrspace(1) %415, i1 %422) #6, !dbg !65
+  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %516, ptr addrspace(1) %417, i1 %422) #6, !dbg !65
+  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %518, ptr addrspace(1) %419, i1 %422) #6, !dbg !65
+  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %520, ptr addrspace(1) %421, i1 %422) #6, !dbg !65
+  ret void, !dbg !66
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #2
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
+
+; Function Attrs: nocallback nofree nounwind memory(argmem: read)
+declare { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) readonly captures(none)) #4
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #2
+
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #3 = { convergent nocallback nounwind }
+attributes #4 = { nocallback nofree nounwind memory(argmem: read) }
+attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!5 = distinct !DISubprogram(name: "triton_poi_fused__fused_rms_norm_cat_view_2", linkageName: "triton_poi_fused__fused_rms_norm_cat_view_2", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 21, column: 29, scope: !5)
+!9 = !DILocation(line: 21, column: 48, scope: !5)
+!10 = !DILocation(line: 21, column: 69, scope: !5)
+!11 = !DILocation(line: 21, column: 53, scope: !5)
+!12 = !DILocation(line: 21, column: 34, scope: !5)
+!13 = !DILocation(line: 21, column: 75, scope: !5)
+!14 = !DILocation(line: 22, column: 44, scope: !5)
+!15 = !DILocation(line: 22, column: 23, scope: !5)
+!16 = !DILocation(line: 24, column: 28, scope: !5)
+!17 = !DILocation(line: 26, column: 21, scope: !5)
+!18 = !DILocation(line: 27, column: 19, scope: !5)
+!19 = !DILocation(line: 29, column: 19, scope: !5)
+!20 = !DILocation(line: 35, column: 18, scope: !5)
+!21 = !DILocation(line: 36, column: 39, scope: !5)
+!22 = !DILocation(line: 36, column: 35, scope: !5)
+!23 = !DILocation(line: 36, column: 51, scope: !5)
+!24 = !DILocation(line: 36, column: 44, scope: !5)
+!25 = !DILocation(line: 36, column: 30, scope: !5)
+!26 = !DILocation(line: 36, column: 64, scope: !5)
+!27 = !DILocation(line: 36, column: 72, scope: !5)
+!28 = !DILocation(line: 36, column: 57, scope: !5)
+!29 = !DILocation(line: 38, column: 30, scope: !5)
+!30 = !DILocation(line: 38, column: 80, scope: !5)
+!31 = !DILocation(line: 40, column: 19, scope: !5)
+!32 = !DILocation(line: 42, column: 19, scope: !5)
+!33 = !DILocation(line: 43, column: 28, scope: !5)
+!34 = !DILocation(line: 45, column: 31, scope: !5)
+!35 = !DILocation(line: 45, column: 71, scope: !5)
+!36 = !DILocation(line: 54, column: 61, scope: !5)
+!37 = !DILocation(line: 54, column: 52, scope: !5)
+!38 = !DILocation(line: 54, column: 45, scope: !5)
+!39 = !DILocation(line: 54, column: 31, scope: !5)
+!40 = !DILocation(line: 54, column: 83, scope: !5)
+!41 = !DILocation(line: 54, column: 67, scope: !5)
+!42 = !DILocation(line: 56, column: 56, scope: !5)
+!43 = !DILocation(line: 56, column: 52, scope: !5)
+!44 = !DILocation(line: 56, column: 31, scope: !5)
+!45 = !DILocation(line: 56, column: 90, scope: !5)
+!46 = !DILocation(line: 58, column: 21, scope: !5)
+!47 = !DILocation(line: 60, column: 20, scope: !5)
+!48 = !DILocation(line: 61, column: 28, scope: !5)
+!49 = !DILocation(line: 23, column: 21, scope: !5)
+!50 = !DILocation(line: 63, column: 31, scope: !5)
+!51 = !DILocation(line: 63, column: 71, scope: !5)
+!52 = !DILocation(line: 70, column: 34, scope: !5)
+!53 = !DILocation(line: 70, column: 30, scope: !5)
+!54 = !DILocation(line: 70, column: 25, scope: !5)
+!55 = !DILocation(line: 70, column: 54, scope: !5)
+!56 = !DILocation(line: 54, column: 134, scope: !5)
+!57 = !DILocation(line: 36, column: 123, scope: !5)
+!58 = !DILocation(line: 44, column: 19, scope: !5)
+!59 = !DILocation(line: 45, column: 137, scope: !5)
+!60 = !DILocation(line: 47, column: 20, scope: !5)
+!61 = !DILocation(line: 62, column: 20, scope: !5)
+!62 = !DILocation(line: 63, column: 138, scope: !5)
+!63 = !DILocation(line: 65, column: 20, scope: !5)
+!64 = !DILocation(line: 0, scope: !5)
+!65 = !DILocation(line: 70, column: 46, scope: !5)
+!66 = !DILocation(line: 70, column: 4, scope: !5)
diff --git a/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ptx b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..3a9cade9cf109074e60b25f8870efa05ea116469
--- /dev/null
+++ b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ptx
@@ -0,0 +1,1027 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused__fused_rms_norm_cat_view_2 // -- Begin function triton_poi_fused__fused_rms_norm_cat_view_2
+.extern .shared .align 16 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
+                                        // @triton_poi_fused__fused_rms_norm_cat_view_2
+.visible .entry triton_poi_fused__fused_rms_norm_cat_view_2(
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_1,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_4,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_5,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_6,
+	.param .u32 triton_poi_fused__fused_rms_norm_cat_view_2_param_7,
+	.param .u32 triton_poi_fused__fused_rms_norm_cat_view_2_param_8,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_9,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_10
+)
+.reqntid 128
+{
+	.reg .pred 	%p<13>;
+	.reg .b16 	%rs<74>;
+	.reg .b32 	%r<253>;
+	.reg .b64 	%rd<75>;
+	.loc	1 18 0                          // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:18:0
+
+// %bb.0:                               // %__nv_rsqrtf.exit
+	ld.param.b64 	%rd67, [triton_poi_fused__fused_rms_norm_cat_view_2_param_0];
+	ld.param.b64 	%rd68, [triton_poi_fused__fused_rms_norm_cat_view_2_param_1];
+$L__tmp0:
+	.loc	1 21 29                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:29
+	mov.u32 	%r18, %ctaid.y;
+	ld.param.b64 	%rd69, [triton_poi_fused__fused_rms_norm_cat_view_2_param_2];
+	.loc	1 21 48                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:48
+	mov.u32 	%r19, %ctaid.z;
+	ld.param.b64 	%rd70, [triton_poi_fused__fused_rms_norm_cat_view_2_param_3];
+	.loc	1 21 69                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:69
+	mov.u32 	%r20, %nctaid.y;
+	ld.param.b64 	%rd71, [triton_poi_fused__fused_rms_norm_cat_view_2_param_4];
+	.loc	1 21 34                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:34
+	mad.lo.s32 	%r21, %r19, %r20, %r18;
+	ld.param.b64 	%rd72, [triton_poi_fused__fused_rms_norm_cat_view_2_param_5];
+	.loc	1 21 75                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:75
+	shl.b32 	%r22, %r21, 10;
+	ld.param.b64 	%rd73, [triton_poi_fused__fused_rms_norm_cat_view_2_param_6];
+	.loc	1 22 44                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:44
+	mov.u32 	%r23, %tid.x;
+	and.b32 	%r24, %r23, 127;
+	shl.b32 	%r25, %r24, 2;
+	.loc	1 22 23                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:23
+	or.b32 	%r26, %r22, %r25;
+	or.b32 	%r27, %r26, 1;
+	or.b32 	%r28, %r26, 2;
+	or.b32 	%r29, %r26, 3;
+	or.b32 	%r30, %r26, 512;
+	.loc	1 24 28                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:24:28
+	mov.u32 	%r31, %ctaid.x;
+	.loc	1 26 21                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:26:21
+	setp.lt.u32 	%p6, %r31, 128;
+	.loc	1 27 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:27:19
+	bfe.s32 	%r32, %r21, 21, 1;
+	.loc	1 29 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:29:19
+	shr.u32 	%r33, %r32, 27;
+	.loc	1 27 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:27:19
+	add.s32 	%r34, %r26, %r33;
+	shr.u32 	%r35, %r34, 5;
+	add.s32 	%r36, %r30, %r33;
+	shr.u32 	%r37, %r36, 5;
+	.loc	1 29 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:29:19
+	and.b32 	%r38, %r34, -32;
+	sub.s32 	%r39, %r26, %r38;
+	add.s32 	%r40, %r27, %r33;
+	and.b32 	%r41, %r40, 33554400;
+	sub.s32 	%r42, %r27, %r41;
+	add.s32 	%r43, %r28, %r33;
+	and.b32 	%r44, %r43, 33554400;
+	sub.s32 	%r45, %r28, %r44;
+	add.s32 	%r46, %r29, %r33;
+	and.b32 	%r47, %r46, 33554400;
+	sub.s32 	%r48, %r29, %r47;
+	.loc	1 35 18                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:35:18
+	setp.lt.s32 	%p7, %r26, 8192;
+	.loc	1 36 39                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:39
+	shl.b32 	%r49, %r39, 7;
+	shl.b32 	%r50, %r42, 7;
+	shl.b32 	%r51, %r45, 7;
+	shl.b32 	%r52, %r48, 7;
+	.loc	1 36 35                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:35
+	add.s32 	%r53, %r49, %r31;
+	add.s32 	%r54, %r50, %r31;
+	add.s32 	%r55, %r51, %r31;
+	add.s32 	%r56, %r52, %r31;
+	.loc	1 36 51                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:51
+	mul.lo.s32 	%r57, %r35, 12288;
+	mul.lo.s32 	%r58, %r37, 12288;
+	.loc	1 36 44                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:44
+	add.s32 	%r59, %r53, %r57;
+	add.s32 	%r60, %r54, %r57;
+	add.s32 	%r61, %r55, %r57;
+	add.s32 	%r62, %r56, %r57;
+	add.s32 	%r63, %r53, %r58;
+	add.s32 	%r64, %r54, %r58;
+	add.s32 	%r65, %r55, %r58;
+	add.s32 	%r66, %r56, %r58;
+	.loc	1 36 30                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:30
+	mad.wide.s32 	%rd1, %r59, 2, %rd67;
+	mad.wide.s32 	%rd3, %r60, 2, %rd67;
+	mad.wide.s32 	%rd5, %r61, 2, %rd67;
+	mad.wide.s32 	%rd7, %r62, 2, %rd67;
+	mad.wide.s32 	%rd9, %r63, 2, %rd67;
+	mad.wide.s32 	%rd11, %r64, 2, %rd67;
+	mad.wide.s32 	%rd13, %r65, 2, %rd67;
+	mad.wide.s32 	%rd15, %r66, 2, %rd67;
+	.loc	1 36 64                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:64
+	and.pred 	%p1, %p6, %p7;
+	.loc	1 36 72                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:72
+	setp.lt.s32 	%p8, %r26, 7680;
+	and.pred 	%p2, %p6, %p8;
+	.loc	1 36 57                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:57
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b16 	%rs2, 0;
+	// begin inline asm
+	mov.u16 %rs1, %rs2;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs1 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs3, %rs2;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs3 }, [ %rd3 + 0 ], %rd4;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd6, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs4, %rs2;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs4 }, [ %rd5 + 0 ], %rd6;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd8, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd8, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs5, %rs2;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs5 }, [ %rd7 + 0 ], %rd8;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd10, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd10, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs6, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs6 }, [ %rd9 + 0 ], %rd10;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd12, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd12, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs7, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs7 }, [ %rd11 + 0 ], %rd12;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd14, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd14, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs8, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs8 }, [ %rd13 + 0 ], %rd14;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd16, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd16, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs9, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd15 + 0 ], %rd16;
+	// end inline asm
+	.loc	1 38 30                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:38:30
+	mad.wide.s32 	%rd17, %r26, 4, %rd68;
+	add.s64 	%rd19, %rd17, 2048;
+	.loc	1 38 80                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:38:80
+	// begin inline asm
+	mov.u64 %rd18, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd18, 1.0;
+	// end inline asm
+	mov.b32 	%r5, 0;
+	// begin inline asm
+	mov.u32 %r1, %r5;
+	mov.u32 %r2, %r5;
+	mov.u32 %r3, %r5;
+	mov.u32 %r4, %r5;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd17 + 0 ], %rd18;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd20, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd20, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r6, %r5;
+	mov.u32 %r7, %r5;
+	mov.u32 %r8, %r5;
+	mov.u32 %r9, %r5;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd19 + 0 ], %rd20;
+	// end inline asm
+	mov.b32 	%r67, 0f43000000;
+	.loc	1 40 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:40:19
+	div.full.f32 	%r68, %r1, %r67;
+	div.full.f32 	%r69, %r2, %r67;
+	div.full.f32 	%r70, %r3, %r67;
+	div.full.f32 	%r71, %r4, %r67;
+	div.full.f32 	%r72, %r6, %r67;
+	div.full.f32 	%r73, %r7, %r67;
+	div.full.f32 	%r74, %r8, %r67;
+	div.full.f32 	%r75, %r9, %r67;
+	.loc	1 42 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:42:19
+	add.f32 	%r76, %r68, 0f358637BD;
+	add.f32 	%r77, %r69, 0f358637BD;
+	add.f32 	%r78, %r70, 0f358637BD;
+	add.f32 	%r79, %r71, 0f358637BD;
+	add.f32 	%r80, %r72, 0f358637BD;
+	add.f32 	%r81, %r73, 0f358637BD;
+	add.f32 	%r82, %r74, 0f358637BD;
+	add.f32 	%r83, %r75, 0f358637BD;
+	.loc	1 43 28                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:43:28
+	rsqrt.approx.ftz.f32 	%r84, %r76;
+	rsqrt.approx.ftz.f32 	%r85, %r77;
+	rsqrt.approx.ftz.f32 	%r86, %r78;
+	rsqrt.approx.ftz.f32 	%r87, %r79;
+	rsqrt.approx.ftz.f32 	%r88, %r80;
+	rsqrt.approx.ftz.f32 	%r89, %r81;
+	rsqrt.approx.ftz.f32 	%r90, %r82;
+	rsqrt.approx.ftz.f32 	%r91, %r83;
+	.loc	1 45 31                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:31
+	mul.wide.u32 	%rd74, %r31, 2;
+	add.s64 	%rd21, %rd69, %rd74;
+	.loc	1 45 71                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:71
+	// begin inline asm
+	mov.u64 %rd22, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd22, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs10, %rs2;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs10 }, [ %rd21 + 0 ], %rd22;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd23, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd23, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs11, %rs2;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd21 + 0 ], %rd23;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd24, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd24, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs12, %rs2;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd21 + 0 ], %rd24;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd25, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd25, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs13, %rs2;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd21 + 0 ], %rd25;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd26, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd26, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs14, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs14 }, [ %rd21 + 0 ], %rd26;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd27, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd27, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs15, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd21 + 0 ], %rd27;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd28, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd28, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs16, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs16 }, [ %rd21 + 0 ], %rd28;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd29, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd29, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs17, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd21 + 0 ], %rd29;
+	// end inline asm
+	.loc	1 54 61                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:61
+	and.b32 	%r92, %r36, -32;
+	.loc	1 54 52                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:52
+	add.s32 	%r93, %r57, -3145728;
+	add.s32 	%r94, %r58, -3145728;
+	.loc	1 54 45                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:45
+	add.s32 	%r95, %r53, %r93;
+	add.s32 	%r96, %r54, %r93;
+	add.s32 	%r97, %r55, %r93;
+	add.s32 	%r98, %r56, %r93;
+	add.s32 	%r99, %r53, %r94;
+	add.s32 	%r100, %r54, %r94;
+	add.s32 	%r101, %r55, %r94;
+	add.s32 	%r102, %r56, %r94;
+	.loc	1 54 31                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:31
+	mad.wide.s32 	%rd30, %r95, 2, %rd70;
+	mad.wide.s32 	%rd32, %r96, 2, %rd70;
+	mad.wide.s32 	%rd34, %r97, 2, %rd70;
+	mad.wide.s32 	%rd36, %r98, 2, %rd70;
+	mad.wide.s32 	%rd38, %r99, 2, %rd70;
+	mad.wide.s32 	%rd40, %r100, 2, %rd70;
+	mad.wide.s32 	%rd42, %r101, 2, %rd70;
+	mad.wide.s32 	%rd44, %r102, 2, %rd70;
+	.loc	1 54 83                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:83
+	add.s32 	%r103, %r22, -8192;
+	setp.lt.u32 	%p9, %r103, 65536;
+	and.pred 	%p3, %p6, %p9;
+	add.s32 	%r104, %r22, -7680;
+	setp.lt.u32 	%p10, %r104, 66048;
+	and.pred 	%p4, %p6, %p10;
+	.loc	1 54 67                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:67
+	// begin inline asm
+	mov.u64 %rd31, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd31, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs18, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs18 }, [ %rd30 + 0 ], %rd31;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd33, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd33, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs19, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd32 + 0 ], %rd33;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd35, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd35, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs20, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs20 }, [ %rd34 + 0 ], %rd35;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd37, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd37, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs21, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs21 }, [ %rd36 + 0 ], %rd37;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd39, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd39, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs22, %rs2;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs22 }, [ %rd38 + 0 ], %rd39;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd41, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd41, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs23, %rs2;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs23 }, [ %rd40 + 0 ], %rd41;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd43, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd43, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs24, %rs2;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs24 }, [ %rd42 + 0 ], %rd43;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd45, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd45, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs25, %rs2;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs25 }, [ %rd44 + 0 ], %rd45;
+	// end inline asm
+	.loc	1 56 56                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:56
+	add.s32 	%r105, %r92, %r39;
+	.loc	1 56 52                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:52
+	add.s32 	%r106, %r26, -8192;
+	add.s32 	%r107, %r105, -8192;
+	.loc	1 56 31                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:31
+	mad.wide.s32 	%rd46, %r106, 4, %rd71;
+	mad.wide.s32 	%rd48, %r107, 4, %rd71;
+	.loc	1 56 90                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:90
+	// begin inline asm
+	mov.u64 %rd47, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd47, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r10, %r5;
+	mov.u32 %r11, %r5;
+	mov.u32 %r12, %r5;
+	mov.u32 %r13, %r5;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd46 + 0 ], %rd47;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd49, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd49, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r14, %r5;
+	mov.u32 %r15, %r5;
+	mov.u32 %r16, %r5;
+	mov.u32 %r17, %r5;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd48 + 0 ], %rd49;
+	// end inline asm
+	.loc	1 58 21                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:58:21
+	div.full.f32 	%r108, %r10, %r67;
+	div.full.f32 	%r109, %r11, %r67;
+	div.full.f32 	%r110, %r12, %r67;
+	div.full.f32 	%r111, %r13, %r67;
+	div.full.f32 	%r112, %r14, %r67;
+	div.full.f32 	%r113, %r15, %r67;
+	div.full.f32 	%r114, %r16, %r67;
+	div.full.f32 	%r115, %r17, %r67;
+	.loc	1 60 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:60:20
+	add.f32 	%r116, %r108, 0f358637BD;
+	add.f32 	%r117, %r109, 0f358637BD;
+	add.f32 	%r118, %r110, 0f358637BD;
+	add.f32 	%r119, %r111, 0f358637BD;
+	add.f32 	%r120, %r112, 0f358637BD;
+	add.f32 	%r121, %r113, 0f358637BD;
+	add.f32 	%r122, %r114, 0f358637BD;
+	add.f32 	%r123, %r115, 0f358637BD;
+	.loc	1 61 28                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:61:28
+	rsqrt.approx.ftz.f32 	%r124, %r116;
+	rsqrt.approx.ftz.f32 	%r125, %r117;
+	rsqrt.approx.ftz.f32 	%r126, %r118;
+	rsqrt.approx.ftz.f32 	%r127, %r119;
+	rsqrt.approx.ftz.f32 	%r128, %r120;
+	rsqrt.approx.ftz.f32 	%r129, %r121;
+	rsqrt.approx.ftz.f32 	%r130, %r122;
+	rsqrt.approx.ftz.f32 	%r131, %r123;
+	.loc	1 35 18                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:35:18
+	setp.lt.s32 	%p11, %r30, 8192;
+	.loc	1 54 67                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:67
+	mov.b32 	%r132, {%rs21, %rs25};
+	mov.b32 	%r133, {%rs20, %rs24};
+	mov.b32 	%r134, {%rs19, %rs23};
+	mov.b32 	%r135, {%rs18, %rs22};
+	.loc	1 36 57                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:57
+	mov.b32 	%r136, {%rs5, %rs9};
+	.loc	1 45 71                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:71
+	mov.b32 	%r137, {%rs13, %rs17};
+	.loc	1 36 57                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:57
+	mov.b32 	%r138, {%rs4, %rs8};
+	.loc	1 45 71                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:71
+	mov.b32 	%r139, {%rs12, %rs16};
+	.loc	1 36 57                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:57
+	mov.b32 	%r140, {%rs3, %rs7};
+	.loc	1 45 71                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:71
+	mov.b32 	%r141, {%rs11, %rs15};
+	.loc	1 36 57                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:57
+	mov.b32 	%r142, {%rs1, %rs6};
+	.loc	1 45 71                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:71
+	mov.b32 	%r143, {%rs10, %rs14};
+	.loc	1 22 23                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:23
+	or.b32 	%r144, %r22, %r24;
+	.loc	1 23 21                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:23:21
+	setp.lt.s32 	%p12, %r144, 73728;
+	.loc	1 22 44                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:44
+	or.b32 	%r145, %r23, %r22;
+	.loc	1 22 23                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:23
+	shl.b32 	%r146, %r145, 7;
+	.loc	1 63 31                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:31
+	add.s64 	%rd50, %rd72, %rd74;
+	.loc	1 63 71                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:71
+	// begin inline asm
+	mov.u64 %rd51, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd51, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs26, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs26 }, [ %rd50 + 0 ], %rd51;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd52, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd52, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs27, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs27 }, [ %rd50 + 0 ], %rd52;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd53, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd53, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs28, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs28 }, [ %rd50 + 0 ], %rd53;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd54, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd54, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs29, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs29 }, [ %rd50 + 0 ], %rd54;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd55, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd55, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs30, %rs2;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs30 }, [ %rd50 + 0 ], %rd55;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd56, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd56, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs31, %rs2;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs31 }, [ %rd50 + 0 ], %rd56;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd57, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd57, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs32, %rs2;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs32 }, [ %rd50 + 0 ], %rd57;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd58, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd58, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs33, %rs2;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs33 }, [ %rd50 + 0 ], %rd58;
+	// end inline asm
+	.loc	1 22 23                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:23
+	shl.b32 	%r147, %r144, 7;
+	.loc	1 70 34                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:34
+	or.b32 	%r148, %r146, 114688;
+	.loc	1 70 30                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:30
+	add.s32 	%r149, %r147, %r31;
+	add.s32 	%r150, %r149, 16384;
+	add.s32 	%r151, %r149, 32768;
+	add.s32 	%r152, %r149, 49152;
+	add.s32 	%r153, %r149, 65536;
+	add.s32 	%r154, %r149, 81920;
+	add.s32 	%r155, %r149, 98304;
+	add.s32 	%r156, %r148, %r31;
+	.loc	1 70 25                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:25
+	mad.wide.s32 	%rd59, %r149, 2, %rd73;
+	mad.wide.s32 	%rd60, %r150, 2, %rd73;
+	mad.wide.s32 	%rd61, %r151, 2, %rd73;
+	mad.wide.s32 	%rd62, %r152, 2, %rd73;
+	mad.wide.s32 	%rd63, %r153, 2, %rd73;
+	mad.wide.s32 	%rd64, %r154, 2, %rd73;
+	mad.wide.s32 	%rd65, %r155, 2, %rd73;
+	mad.wide.s32 	%rd66, %r156, 2, %rd73;
+	.loc	1 70 54                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:54
+	and.pred 	%p5, %p6, %p12;
+	.loc	1 54 134                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134
+	mov.b32 	{%rs42, %rs43}, %r135;
+	cvt.f32.bf16 	%r157, %rs43;
+	cvt.f32.bf16 	%r158, %rs42;
+	.loc	1 36 123                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:123
+	mov.b32 	{%rs44, %rs45}, %r142;
+	cvt.f32.bf16 	%r159, %rs45;
+	cvt.f32.bf16 	%r160, %rs44;
+	.loc	1 44 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:44:19
+	mul.f32 	%r161, %r84, %r160;
+	mul.f32 	%r162, %r88, %r159;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs46, %rs47}, %r143;
+	cvt.f32.bf16 	%r163, %rs46;
+	cvt.f32.bf16 	%r164, %rs47;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r165, %r162, %r164;
+	mul.f32 	%r166, %r161, %r163;
+	.loc	1 62 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:62:20
+	mul.f32 	%r167, %r124, %r158;
+	mul.f32 	%r168, %r128, %r157;
+	.loc	1 63 71                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:71
+	mov.b32 	%r169, {%rs26, %rs30};
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs48, %rs49}, %r169;
+	cvt.f32.bf16 	%r170, %rs48;
+	cvt.f32.bf16 	%r171, %rs49;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r172, %r168, %r171;
+	mul.f32 	%r173, %r167, %r170;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r174, %r166, %r173, %p7;
+	selp.f32 	%r175, %r165, %r172, %p11;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r176, %r175, %r174;
+	.loc	1 54 134                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134
+	mov.b32 	{%rs50, %rs51}, %r134;
+	cvt.f32.bf16 	%r177, %rs51;
+	cvt.f32.bf16 	%r178, %rs50;
+	.loc	1 36 123                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:123
+	mov.b32 	{%rs52, %rs53}, %r140;
+	cvt.f32.bf16 	%r179, %rs53;
+	cvt.f32.bf16 	%r180, %rs52;
+	.loc	1 44 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:44:19
+	mul.f32 	%r181, %r85, %r180;
+	mul.f32 	%r182, %r89, %r179;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs54, %rs55}, %r141;
+	cvt.f32.bf16 	%r183, %rs54;
+	cvt.f32.bf16 	%r184, %rs55;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r185, %r182, %r184;
+	mul.f32 	%r186, %r181, %r183;
+	.loc	1 62 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:62:20
+	mul.f32 	%r187, %r125, %r178;
+	mul.f32 	%r188, %r129, %r177;
+	.loc	1 63 71                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:71
+	mov.b32 	%r189, {%rs27, %rs31};
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs56, %rs57}, %r189;
+	cvt.f32.bf16 	%r190, %rs56;
+	cvt.f32.bf16 	%r191, %rs57;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r192, %r188, %r191;
+	mul.f32 	%r193, %r187, %r190;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r194, %r186, %r193, %p7;
+	selp.f32 	%r195, %r185, %r192, %p11;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r196, %r195, %r194;
+	.loc	1 54 134                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134
+	mov.b32 	{%rs58, %rs59}, %r133;
+	cvt.f32.bf16 	%r197, %rs59;
+	cvt.f32.bf16 	%r198, %rs58;
+	.loc	1 36 123                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:123
+	mov.b32 	{%rs60, %rs61}, %r138;
+	cvt.f32.bf16 	%r199, %rs61;
+	cvt.f32.bf16 	%r200, %rs60;
+	.loc	1 44 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:44:19
+	mul.f32 	%r201, %r86, %r200;
+	mul.f32 	%r202, %r90, %r199;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs62, %rs63}, %r139;
+	cvt.f32.bf16 	%r203, %rs62;
+	cvt.f32.bf16 	%r204, %rs63;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r205, %r202, %r204;
+	mul.f32 	%r206, %r201, %r203;
+	.loc	1 62 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:62:20
+	mul.f32 	%r207, %r126, %r198;
+	mul.f32 	%r208, %r130, %r197;
+	.loc	1 63 71                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:71
+	mov.b32 	%r209, {%rs28, %rs32};
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs64, %rs65}, %r209;
+	cvt.f32.bf16 	%r210, %rs64;
+	cvt.f32.bf16 	%r211, %rs65;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r212, %r208, %r211;
+	mul.f32 	%r213, %r207, %r210;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r214, %r206, %r213, %p7;
+	selp.f32 	%r215, %r205, %r212, %p11;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r216, %r215, %r214;
+	.loc	1 54 134                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134
+	mov.b32 	{%rs66, %rs67}, %r132;
+	cvt.f32.bf16 	%r217, %rs67;
+	cvt.f32.bf16 	%r218, %rs66;
+	.loc	1 36 123                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:123
+	mov.b32 	{%rs68, %rs69}, %r136;
+	cvt.f32.bf16 	%r219, %rs69;
+	cvt.f32.bf16 	%r220, %rs68;
+	.loc	1 44 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:44:19
+	mul.f32 	%r221, %r87, %r220;
+	mul.f32 	%r222, %r91, %r219;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs70, %rs71}, %r137;
+	cvt.f32.bf16 	%r223, %rs70;
+	cvt.f32.bf16 	%r224, %rs71;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r225, %r222, %r224;
+	mul.f32 	%r226, %r221, %r223;
+	.loc	1 62 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:62:20
+	mul.f32 	%r227, %r127, %r218;
+	mul.f32 	%r228, %r131, %r217;
+	.loc	1 63 71                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:71
+	mov.b32 	%r229, {%rs29, %rs33};
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs72, %rs73}, %r229;
+	cvt.f32.bf16 	%r230, %rs72;
+	cvt.f32.bf16 	%r231, %rs73;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r232, %r228, %r231;
+	mul.f32 	%r233, %r227, %r230;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r234, %r226, %r233, %p7;
+	selp.f32 	%r235, %r225, %r232, %p11;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r236, %r235, %r234;
+	shl.b32 	%r237, %r24, 4;
+	mov.b32 	%r238, global_smem;
+	add.s32 	%r239, %r238, %r237;
+	st.shared.v4.b32 	[%r239], {%r176, %r196, %r216, %r236};
+	bar.sync 	0;
+	shl.b32 	%r240, %r23, 6;
+	and.b32 	%r241, %r240, 1536;
+	shl.b32 	%r242, %r23, 4;
+	and.b32 	%r243, %r242, 112;
+	shl.b32 	%r244, %r23, 2;
+	and.b32 	%r245, %r244, 384;
+	add.s32 	%r246, %r238, %r241;
+	add.s32 	%r247, %r246, %r243;
+	add.s32 	%r248, %r247, %r245;
+	ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r249, %r250, %r251, %r252}, [%r248];
+	mov.b32 	{_, %rs38}, %r249;
+	mov.b32 	{_, %rs39}, %r250;
+	mov.b32 	{_, %rs40}, %r251;
+	mov.b32 	{_, %rs41}, %r252;
+	cvt.u16.u32 	%rs34, %r249;
+	// begin inline asm
+	@%p5 st.global.b16 [ %rd59 + 0 ], { %rs34 };
+	// end inline asm
+	cvt.u16.u32 	%rs35, %r250;
+	// begin inline asm
+	@%p5 st.global.b16 [ %rd60 + 0 ], { %rs35 };
+	// end inline asm
+	cvt.u16.u32 	%rs36, %r251;
+	// begin inline asm
+	@%p5 st.global.b16 [ %rd61 + 0 ], { %rs36 };
+	// end inline asm
+	cvt.u16.u32 	%rs37, %r252;
+	// begin inline asm
+	@%p5 st.global.b16 [ %rd62 + 0 ], { %rs37 };
+	// end inline asm
+	// begin inline asm
+	@%p5 st.global.b16 [ %rd63 + 0 ], { %rs38 };
+	// end inline asm
+	// begin inline asm
+	@%p5 st.global.b16 [ %rd64 + 0 ], { %rs39 };
+	// end inline asm
+	// begin inline asm
+	@%p5 st.global.b16 [ %rd65 + 0 ], { %rs40 };
+	// end inline asm
+	// begin inline asm
+	@%p5 st.global.b16 [ %rd66 + 0 ], { %rs41 };
+	// end inline asm
+	.loc	1 70 4                          // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 50
+.b8 104
+.b8 105
+.b8 106
+.b8 51
+.b8 104
+.b8 109
+.b8 108
+.b8 111
+.b8 117
+.b8 109
+.b8 120
+.b8 100
+.b8 109
+.b8 104
+.b8 117
+.b8 101
+.b8 122
+.b8 115
+.b8 121
+.b8 104
+.b8 107
+.b8 109
+.b8 110
+.b8 113
+.b8 103
+.b8 110
+.b8 102
+.b8 97
+.b8 53
+.b8 105
+.b8 118
+.b8 114
+.b8 101
+.b8 50
+.b8 55
+.b8 117
+.b8 111
+.b8 115
+.b8 121
+.b8 109
+.b8 97
+.b8 109
+.b8 51
+.b8 100
+.b8 114
+.b8 55
+.b8 97
+.b8 53
+.b8 120
+.b8 98
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 50
+.b8 104
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.source b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.source
new file mode 100644
index 0000000000000000000000000000000000000000..1092c1a412ae5774676d09a311853fa35927b82c
--- /dev/null
+++ b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.source
@@ -0,0 +1,388 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0)
+#loc97 = loc("in_ptr0"(#loc))
+#loc98 = loc("in_ptr1"(#loc))
+#loc99 = loc("in_ptr2"(#loc))
+#loc100 = loc("in_ptr3"(#loc))
+#loc101 = loc("in_ptr4"(#loc))
+#loc102 = loc("in_ptr5"(#loc))
+#loc103 = loc("out_ptr0"(#loc))
+#loc104 = loc("ynumel"(#loc))
+#loc105 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %ynumel_0 = arith.constant 73728 : i32 loc(#loc106)
+    %xnumel_1 = arith.constant 128 : i32 loc(#loc107)
+    %yoffset = tt.get_program_id y : i32 loc(#loc108)
+    %yoffset_2 = tt.get_program_id z : i32 loc(#loc109)
+    %yoffset_3 = tt.get_num_programs y : i32 loc(#loc110)
+    %yoffset_4 = arith.muli %yoffset_2, %yoffset_3 : i32 loc(#loc111)
+    %yoffset_5 = arith.addi %yoffset, %yoffset_4 : i32 loc(#loc112)
+    %yoffset_6 = arith.constant 1024 : i32 loc(#loc113)
+    %yoffset_7 = arith.constant 1024 : i32 loc(#loc113)
+    %yoffset_8 = arith.muli %yoffset_5, %yoffset_7 : i32 loc(#loc113)
+    %yindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc114)
+    %yindex_9 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<1024xi32> -> tensor<1024x1xi32> loc(#loc115)
+    %yindex_10 = tt.splat %yoffset_8 : i32 -> tensor<1024x1xi32> loc(#loc116)
+    %yindex_11 = arith.addi %yindex_10, %yindex_9 : tensor<1024x1xi32> loc(#loc116)
+    %ymask = arith.constant dense<73728> : tensor<1024x1xi32> loc(#loc117)
+    %ymask_12 = arith.cmpi slt, %yindex_11, %ymask : tensor<1024x1xi32> loc(#loc117)
+    %xoffset = tt.get_program_id x : i32 loc(#loc118)
+    %xoffset_13 = arith.constant 1 : i32 loc(#loc119)
+    %xoffset_14 = arith.constant 1 : i32 loc(#loc119)
+    %xoffset_15 = arith.muli %xoffset, %xoffset_14 : i32 loc(#loc119)
+    %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc120)
+    %xindex_16 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc121)
+    %xindex_17 = tt.splat %xoffset_15 : i32 -> tensor<1x1xi32> loc(#loc122)
+    %xindex_18 = arith.addi %xindex_17, %xindex_16 : tensor<1x1xi32> loc(#loc122)
+    %xmask = arith.constant dense<128> : tensor<1x1xi32> loc(#loc123)
+    %xmask_19 = arith.cmpi slt, %xindex_18, %xmask : tensor<1x1xi32> loc(#loc123)
+    %y1 = arith.constant 32 : i32 loc(#loc124)
+    %y1_20 = arith.constant 32 : i32 loc(#loc124)
+    %y1_21 = arith.constant dense<32> : tensor<1024x1xi32> loc(#loc124)
+    %y1_22 = arith.divsi %yindex_11, %y1_21 : tensor<1024x1xi32> loc(#loc124)
+    %y0 = arith.constant 32 : i32 loc(#loc125)
+    %y0_23 = arith.constant 32 : i32 loc(#loc125)
+    %y0_24 = arith.constant dense<32> : tensor<1024x1xi32> loc(#loc125)
+    %y0_25 = arith.remsi %yindex_11, %y0_24 : tensor<1024x1xi32> loc(#loc125)
+    %tmp1 = arith.constant 0 : i64 loc(#loc126)
+    %tmp1_26 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc126)
+    %tmp2 = arith.extsi %y1_22 : tensor<1024x1xi32> to tensor<1024x1xi64> loc(#loc127)
+    %tmp2_27 = arith.constant dense<0> : tensor<1024x1xi64> loc(#loc127)
+    %tmp2_28 = arith.cmpi sge, %tmp2, %tmp2_27 : tensor<1024x1xi64> loc(#loc127)
+    %tmp3 = arith.constant 256 : i64 loc(#loc128)
+    %tmp3_29 = arith.constant dense<256> : tensor<1x1xi64> loc(#loc128)
+    %tmp4 = arith.extsi %y1_22 : tensor<1024x1xi32> to tensor<1024x1xi64> loc(#loc129)
+    %tmp4_30 = arith.constant dense<256> : tensor<1024x1xi64> loc(#loc129)
+    %tmp4_31 = arith.cmpi slt, %tmp4, %tmp4_30 : tensor<1024x1xi64> loc(#loc129)
+    %tmp5 = arith.constant 128 : i32 loc(#loc130)
+    %tmp5_32 = arith.constant 128 : i32 loc(#loc130)
+    %tmp5_33 = arith.constant dense<128> : tensor<1024x1xi32> loc(#loc130)
+    %tmp5_34 = arith.muli %tmp5_33, %y0_25 : tensor<1024x1xi32> loc(#loc130)
+    %tmp5_35 = tt.broadcast %xindex_18 : tensor<1x1xi32> -> tensor<1024x1xi32> loc(#loc131)
+    %tmp5_36 = arith.addi %tmp5_35, %tmp5_34 : tensor<1024x1xi32> loc(#loc131)
+    %tmp5_37 = arith.constant 12288 : i32 loc(#loc132)
+    %tmp5_38 = arith.constant 12288 : i32 loc(#loc132)
+    %tmp5_39 = arith.constant dense<12288> : tensor<1024x1xi32> loc(#loc132)
+    %tmp5_40 = arith.muli %tmp5_39, %y1_22 : tensor<1024x1xi32> loc(#loc132)
+    %tmp5_41 = arith.addi %tmp5_36, %tmp5_40 : tensor<1024x1xi32> loc(#loc133)
+    %tmp5_42 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x1x!tt.ptr<bf16>> loc(#loc134)
+    %tmp5_43 = tt.addptr %tmp5_42, %tmp5_41 : tensor<1024x1x!tt.ptr<bf16>>, tensor<1024x1xi32> loc(#loc134)
+    %tmp5_44 = tt.broadcast %xmask_19 : tensor<1x1xi1> -> tensor<1024x1xi1> loc(#loc135)
+    %tmp5_45 = arith.andi %tmp4_31, %tmp5_44 : tensor<1024x1xi1> loc(#loc135)
+    %tmp5_46 = arith.andi %tmp5_45, %ymask_12 : tensor<1024x1xi1> loc(#loc136)
+    %tmp5_47 = arith.constant 0.000000e+00 : f32 loc(#loc137)
+    %tmp5_48 = arith.constant dense<0.000000e+00> : tensor<1024x1xf32> loc(#loc137)
+    %tmp5_49 = arith.truncf %tmp5_48 : tensor<1024x1xf32> to tensor<1024x1xbf16> loc(#loc137)
+    %tmp5_50 = tt.load %tmp5_43, %tmp5_46, %tmp5_49 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr<bf16>> loc(#loc137)
+    %tmp5_51 = arith.extf %tmp5_50 : tensor<1024x1xbf16> to tensor<1024x1xf32> loc(#loc138)
+    %tmp7 = arith.constant 32 : i32 loc(#loc139)
+    %tmp7_52 = arith.constant 32 : i32 loc(#loc139)
+    %tmp7_53 = arith.constant dense<32> : tensor<1024x1xi32> loc(#loc139)
+    %tmp7_54 = arith.muli %tmp7_53, %y1_22 : tensor<1024x1xi32> loc(#loc139)
+    %tmp7_55 = arith.addi %y0_25, %tmp7_54 : tensor<1024x1xi32> loc(#loc140)
+    %tmp7_56 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<1024x1x!tt.ptr<f32>> loc(#loc141)
+    %tmp7_57 = tt.addptr %tmp7_56, %tmp7_55 : tensor<1024x1x!tt.ptr<f32>>, tensor<1024x1xi32> loc(#loc141)
+    %tmp7_58 = tt.broadcast %xmask_19 : tensor<1x1xi1> -> tensor<1024x1xi1> loc(#loc142)
+    %tmp7_59 = arith.andi %tmp4_31, %tmp7_58 : tensor<1024x1xi1> loc(#loc142)
+    %tmp7_60 = arith.andi %tmp7_59, %ymask_12 : tensor<1024x1xi1> loc(#loc143)
+    %tmp7_61 = arith.constant 0.000000e+00 : f32 loc(#loc144)
+    %tmp7_62 = arith.constant dense<0.000000e+00> : tensor<1024x1xf32> loc(#loc144)
+    %tmp7_63 = tt.load %tmp7_57, %tmp7_60, %tmp7_62 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr<f32>> loc(#loc144)
+    %tmp8 = arith.constant 1.280000e+02 : f32 loc(#loc145)
+    %tmp9 = arith.constant dense<1.280000e+02> : tensor<1024x1xf32> loc(#loc146)
+    %tmp9_64 = arith.divf %tmp7_63, %tmp9 : tensor<1024x1xf32> loc(#loc146)
+    %tmp10 = arith.constant 9.99999997E-7 : f32 loc(#loc147)
+    %tmp11 = arith.constant dense<9.99999997E-7> : tensor<1024x1xf32> loc(#loc148)
+    %tmp11_65 = arith.addf %tmp9_64, %tmp11 : tensor<1024x1xf32> loc(#loc148)
+    %tmp12 = tt.extern_elementwise %tmp11_65 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1024x1xf32>) -> tensor<1024x1xf32> loc(#loc149)
+    %tmp13 = arith.mulf %tmp5_51, %tmp12 : tensor<1024x1xf32> loc(#loc150)
+    %tmp14 = tt.broadcast %xindex_18 : tensor<1x1xi32> -> tensor<1024x1xi32> loc(#loc151)
+    %tmp14_66 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1024x1x!tt.ptr<bf16>> loc(#loc152)
+    %tmp14_67 = tt.addptr %tmp14_66, %tmp14 : tensor<1024x1x!tt.ptr<bf16>>, tensor<1024x1xi32> loc(#loc152)
+    %tmp14_68 = tt.broadcast %xmask_19 : tensor<1x1xi1> -> tensor<1024x1xi1> loc(#loc153)
+    %tmp14_69 = arith.andi %tmp4_31, %tmp14_68 : tensor<1024x1xi1> loc(#loc153)
+    %tmp14_70 = arith.andi %tmp14_69, %ymask_12 : tensor<1024x1xi1> loc(#loc154)
+    %tmp14_71 = arith.constant 0.000000e+00 : f32 loc(#loc155)
+    %tmp14_72 = arith.constant dense<0.000000e+00> : tensor<1024x1xf32> loc(#loc155)
+    %tmp14_73 = arith.truncf %tmp14_72 : tensor<1024x1xf32> to tensor<1024x1xbf16> loc(#loc155)
+    %tmp14_74 = tt.load %tmp14_67, %tmp14_70, %tmp14_73 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr<bf16>> loc(#loc155)
+    %tmp14_75 = arith.extf %tmp14_74 : tensor<1024x1xbf16> to tensor<1024x1xf32> loc(#loc156)
+    %tmp16 = arith.mulf %tmp13, %tmp14_75 : tensor<1024x1xf32> loc(#loc157)
+    %tmp18 = arith.constant 0.000000e+00 : f32 loc(#loc158)
+    %tmp18_76 = arith.constant dense<0.000000e+00> : tensor<1024x1xf32> loc(#loc158)
+    %tmp19 = arith.select %tmp4_31, %tmp16, %tmp18_76 : tensor<1024x1xi1>, tensor<1024x1xf32> loc(#loc159)
+    %tmp20 = arith.extsi %y1_22 : tensor<1024x1xi32> to tensor<1024x1xi64> loc(#loc160)
+    %tmp20_77 = arith.constant dense<256> : tensor<1024x1xi64> loc(#loc160)
+    %tmp20_78 = arith.cmpi sge, %tmp20, %tmp20_77 : tensor<1024x1xi64> loc(#loc160)
+    %tmp21 = arith.constant 2304 : i64 loc(#loc161)
+    %tmp21_79 = arith.constant dense<2304> : tensor<1x1xi64> loc(#loc161)
+    %tmp22 = arith.extsi %y1_22 : tensor<1024x1xi32> to tensor<1024x1xi64> loc(#loc162)
+    %tmp22_80 = arith.constant dense<2304> : tensor<1024x1xi64> loc(#loc162)
+    %tmp22_81 = arith.cmpi slt, %tmp22, %tmp22_80 : tensor<1024x1xi64> loc(#loc162)
+    %tmp23 = arith.constant 128 : i32 loc(#loc163)
+    %tmp23_82 = arith.constant 128 : i32 loc(#loc163)
+    %tmp23_83 = arith.constant dense<128> : tensor<1024x1xi32> loc(#loc163)
+    %tmp23_84 = arith.muli %tmp23_83, %y0_25 : tensor<1024x1xi32> loc(#loc163)
+    %tmp23_85 = tt.broadcast %xindex_18 : tensor<1x1xi32> -> tensor<1024x1xi32> loc(#loc164)
+    %tmp23_86 = arith.addi %tmp23_85, %tmp23_84 : tensor<1024x1xi32> loc(#loc164)
+    %tmp23_87 = arith.constant -256 : i32 loc(#loc165)
+    %tmp23_88 = arith.constant -256 : i32 loc(#loc165)
+    %tmp23_89 = arith.constant dense<-256> : tensor<1024x1xi32> loc(#loc165)
+    %tmp23_90 = arith.addi %tmp23_89, %y1_22 : tensor<1024x1xi32> loc(#loc165)
+    %tmp23_91 = arith.constant 12288 : i32 loc(#loc166)
+    %tmp23_92 = arith.constant 12288 : i32 loc(#loc166)
+    %tmp23_93 = arith.constant dense<12288> : tensor<1024x1xi32> loc(#loc166)
+    %tmp23_94 = arith.muli %tmp23_93, %tmp23_90 : tensor<1024x1xi32> loc(#loc166)
+    %tmp23_95 = arith.addi %tmp23_86, %tmp23_94 : tensor<1024x1xi32> loc(#loc167)
+    %tmp23_96 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<1024x1x!tt.ptr<bf16>> loc(#loc168)
+    %tmp23_97 = tt.addptr %tmp23_96, %tmp23_95 : tensor<1024x1x!tt.ptr<bf16>>, tensor<1024x1xi32> loc(#loc168)
+    %tmp23_98 = tt.broadcast %xmask_19 : tensor<1x1xi1> -> tensor<1024x1xi1> loc(#loc169)
+    %tmp23_99 = arith.andi %tmp20_78, %tmp23_98 : tensor<1024x1xi1> loc(#loc169)
+    %tmp23_100 = arith.andi %tmp23_99, %ymask_12 : tensor<1024x1xi1> loc(#loc170)
+    %tmp23_101 = arith.constant 0.000000e+00 : f32 loc(#loc171)
+    %tmp23_102 = arith.constant dense<0.000000e+00> : tensor<1024x1xf32> loc(#loc171)
+    %tmp23_103 = arith.truncf %tmp23_102 : tensor<1024x1xf32> to tensor<1024x1xbf16> loc(#loc171)
+    %tmp23_104 = tt.load %tmp23_97, %tmp23_100, %tmp23_103 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr<bf16>> loc(#loc171)
+    %tmp23_105 = arith.extf %tmp23_104 : tensor<1024x1xbf16> to tensor<1024x1xf32> loc(#loc172)
+    %tmp25 = arith.constant -256 : i32 loc(#loc173)
+    %tmp25_106 = arith.constant -256 : i32 loc(#loc173)
+    %tmp25_107 = arith.constant dense<-256> : tensor<1024x1xi32> loc(#loc173)
+    %tmp25_108 = arith.addi %tmp25_107, %y1_22 : tensor<1024x1xi32> loc(#loc173)
+    %tmp25_109 = arith.constant 32 : i32 loc(#loc174)
+    %tmp25_110 = arith.constant 32 : i32 loc(#loc174)
+    %tmp25_111 = arith.constant dense<32> : tensor<1024x1xi32> loc(#loc174)
+    %tmp25_112 = arith.muli %tmp25_111, %tmp25_108 : tensor<1024x1xi32> loc(#loc174)
+    %tmp25_113 = arith.addi %y0_25, %tmp25_112 : tensor<1024x1xi32> loc(#loc175)
+    %tmp25_114 = tt.splat %in_ptr4 : !tt.ptr<f32> -> tensor<1024x1x!tt.ptr<f32>> loc(#loc176)
+    %tmp25_115 = tt.addptr %tmp25_114, %tmp25_113 : tensor<1024x1x!tt.ptr<f32>>, tensor<1024x1xi32> loc(#loc176)
+    %tmp25_116 = tt.broadcast %xmask_19 : tensor<1x1xi1> -> tensor<1024x1xi1> loc(#loc177)
+    %tmp25_117 = arith.andi %tmp20_78, %tmp25_116 : tensor<1024x1xi1> loc(#loc177)
+    %tmp25_118 = arith.andi %tmp25_117, %ymask_12 : tensor<1024x1xi1> loc(#loc178)
+    %tmp25_119 = arith.constant 0.000000e+00 : f32 loc(#loc179)
+    %tmp25_120 = arith.constant dense<0.000000e+00> : tensor<1024x1xf32> loc(#loc179)
+    %tmp25_121 = tt.load %tmp25_115, %tmp25_118, %tmp25_120 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr<f32>> loc(#loc179)
+    %tmp26 = arith.constant 1.280000e+02 : f32 loc(#loc180)
+    %tmp27 = arith.constant dense<1.280000e+02> : tensor<1024x1xf32> loc(#loc181)
+    %tmp27_122 = arith.divf %tmp25_121, %tmp27 : tensor<1024x1xf32> loc(#loc181)
+    %tmp28 = arith.constant 9.99999997E-7 : f32 loc(#loc182)
+    %tmp29 = arith.constant dense<9.99999997E-7> : tensor<1024x1xf32> loc(#loc183)
+    %tmp29_123 = arith.addf %tmp27_122, %tmp29 : tensor<1024x1xf32> loc(#loc183)
+    %tmp30 = tt.extern_elementwise %tmp29_123 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1024x1xf32>) -> tensor<1024x1xf32> loc(#loc184)
+    %tmp31 = arith.mulf %tmp23_105, %tmp30 : tensor<1024x1xf32> loc(#loc185)
+    %tmp32 = tt.broadcast %xindex_18 : tensor<1x1xi32> -> tensor<1024x1xi32> loc(#loc186)
+    %tmp32_124 = tt.splat %in_ptr5 : !tt.ptr<bf16> -> tensor<1024x1x!tt.ptr<bf16>> loc(#loc187)
+    %tmp32_125 = tt.addptr %tmp32_124, %tmp32 : tensor<1024x1x!tt.ptr<bf16>>, tensor<1024x1xi32> loc(#loc187)
+    %tmp32_126 = tt.broadcast %xmask_19 : tensor<1x1xi1> -> tensor<1024x1xi1> loc(#loc188)
+    %tmp32_127 = arith.andi %tmp20_78, %tmp32_126 : tensor<1024x1xi1> loc(#loc188)
+    %tmp32_128 = arith.andi %tmp32_127, %ymask_12 : tensor<1024x1xi1> loc(#loc189)
+    %tmp32_129 = arith.constant 0.000000e+00 : f32 loc(#loc190)
+    %tmp32_130 = arith.constant dense<0.000000e+00> : tensor<1024x1xf32> loc(#loc190)
+    %tmp32_131 = arith.truncf %tmp32_130 : tensor<1024x1xf32> to tensor<1024x1xbf16> loc(#loc190)
+    %tmp32_132 = tt.load %tmp32_125, %tmp32_128, %tmp32_131 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr<bf16>> loc(#loc190)
+    %tmp32_133 = arith.extf %tmp32_132 : tensor<1024x1xbf16> to tensor<1024x1xf32> loc(#loc191)
+    %tmp34 = arith.mulf %tmp31, %tmp32_133 : tensor<1024x1xf32> loc(#loc192)
+    %tmp36 = arith.constant 0.000000e+00 : f32 loc(#loc193)
+    %tmp36_134 = arith.constant dense<0.000000e+00> : tensor<1024x1xf32> loc(#loc193)
+    %tmp37 = arith.select %tmp20_78, %tmp34, %tmp36_134 : tensor<1024x1xi1>, tensor<1024x1xf32> loc(#loc194)
+    %tmp38 = arith.select %tmp4_31, %tmp19, %tmp37 : tensor<1024x1xi1>, tensor<1024x1xf32> loc(#loc195)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc91)
+    %c128_i32_135 = arith.constant 128 : i32 loc(#loc91)
+    %cst = arith.constant dense<128> : tensor<1024x1xi32> loc(#loc91)
+    %0 = arith.muli %cst, %yindex_11 : tensor<1024x1xi32> loc(#loc91)
+    %1 = tt.broadcast %xindex_18 : tensor<1x1xi32> -> tensor<1024x1xi32> loc(#loc92)
+    %2 = arith.addi %1, %0 : tensor<1024x1xi32> loc(#loc92)
+    %3 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x1x!tt.ptr<bf16>> loc(#loc93)
+    %4 = tt.addptr %3, %2 : tensor<1024x1x!tt.ptr<bf16>>, tensor<1024x1xi32> loc(#loc93)
+    %5 = tt.broadcast %xmask_19 : tensor<1x1xi1> -> tensor<1024x1xi1> loc(#loc94)
+    %6 = arith.andi %5, %ymask_12 : tensor<1024x1xi1> loc(#loc94)
+    %7 = arith.truncf %tmp38 : tensor<1024x1xf32> to tensor<1024x1xbf16> loc(#loc95)
+    tt.store %4, %7, %6 : tensor<1024x1x!tt.ptr<bf16>> loc(#loc95)
+    tt.return loc(#loc96)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":20:13)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:36)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:36)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":32:30)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":33:19)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":34:32)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:87)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:95)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":39:11)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":41:12)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:51)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:78)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:86)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":49:38)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":52:34)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":53:19)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:40)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:36)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:65)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:98)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:106)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":57:12)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":59:12)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:51)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:79)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:87)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":67:38)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4)
+#loc106 = loc("ynumel"(#loc1))
+#loc107 = loc("xnumel"(#loc2))
+#loc108 = loc("yoffset"(#loc3))
+#loc109 = loc("yoffset"(#loc4))
+#loc110 = loc("yoffset"(#loc5))
+#loc111 = loc("yoffset"(#loc6))
+#loc112 = loc("yoffset"(#loc7))
+#loc113 = loc("yoffset"(#loc8))
+#loc114 = loc("yindex"(#loc9))
+#loc115 = loc("yindex"(#loc10))
+#loc116 = loc("yindex"(#loc11))
+#loc117 = loc("ymask"(#loc12))
+#loc118 = loc("xoffset"(#loc13))
+#loc119 = loc("xoffset"(#loc14))
+#loc120 = loc("xindex"(#loc15))
+#loc121 = loc("xindex"(#loc16))
+#loc122 = loc("xindex"(#loc17))
+#loc123 = loc("xmask"(#loc18))
+#loc124 = loc("y1"(#loc19))
+#loc125 = loc("y0"(#loc20))
+#loc126 = loc("tmp1"(#loc21))
+#loc127 = loc("tmp2"(#loc22))
+#loc128 = loc("tmp3"(#loc23))
+#loc129 = loc("tmp4"(#loc24))
+#loc130 = loc("tmp5"(#loc25))
+#loc131 = loc("tmp5"(#loc26))
+#loc132 = loc("tmp5"(#loc27))
+#loc133 = loc("tmp5"(#loc28))
+#loc134 = loc("tmp5"(#loc29))
+#loc135 = loc("tmp5"(#loc30))
+#loc136 = loc("tmp5"(#loc31))
+#loc137 = loc("tmp5"(#loc32))
+#loc138 = loc("tmp5"(#loc33))
+#loc139 = loc("tmp7"(#loc34))
+#loc140 = loc("tmp7"(#loc35))
+#loc141 = loc("tmp7"(#loc36))
+#loc142 = loc("tmp7"(#loc37))
+#loc143 = loc("tmp7"(#loc38))
+#loc144 = loc("tmp7"(#loc39))
+#loc145 = loc("tmp8"(#loc40))
+#loc146 = loc("tmp9"(#loc41))
+#loc147 = loc("tmp10"(#loc42))
+#loc148 = loc("tmp11"(#loc43))
+#loc149 = loc("tmp12"(#loc44))
+#loc150 = loc("tmp13"(#loc45))
+#loc151 = loc("tmp14"(#loc46))
+#loc152 = loc("tmp14"(#loc47))
+#loc153 = loc("tmp14"(#loc48))
+#loc154 = loc("tmp14"(#loc49))
+#loc155 = loc("tmp14"(#loc50))
+#loc156 = loc("tmp14"(#loc51))
+#loc157 = loc("tmp16"(#loc52))
+#loc158 = loc("tmp18"(#loc53))
+#loc159 = loc("tmp19"(#loc54))
+#loc160 = loc("tmp20"(#loc55))
+#loc161 = loc("tmp21"(#loc56))
+#loc162 = loc("tmp22"(#loc57))
+#loc163 = loc("tmp23"(#loc58))
+#loc164 = loc("tmp23"(#loc59))
+#loc165 = loc("tmp23"(#loc60))
+#loc166 = loc("tmp23"(#loc61))
+#loc167 = loc("tmp23"(#loc62))
+#loc168 = loc("tmp23"(#loc63))
+#loc169 = loc("tmp23"(#loc64))
+#loc170 = loc("tmp23"(#loc65))
+#loc171 = loc("tmp23"(#loc66))
+#loc172 = loc("tmp23"(#loc67))
+#loc173 = loc("tmp25"(#loc68))
+#loc174 = loc("tmp25"(#loc69))
+#loc175 = loc("tmp25"(#loc70))
+#loc176 = loc("tmp25"(#loc71))
+#loc177 = loc("tmp25"(#loc72))
+#loc178 = loc("tmp25"(#loc73))
+#loc179 = loc("tmp25"(#loc74))
+#loc180 = loc("tmp26"(#loc75))
+#loc181 = loc("tmp27"(#loc76))
+#loc182 = loc("tmp28"(#loc77))
+#loc183 = loc("tmp29"(#loc78))
+#loc184 = loc("tmp30"(#loc79))
+#loc185 = loc("tmp31"(#loc80))
+#loc186 = loc("tmp32"(#loc81))
+#loc187 = loc("tmp32"(#loc82))
+#loc188 = loc("tmp32"(#loc83))
+#loc189 = loc("tmp32"(#loc84))
+#loc190 = loc("tmp32"(#loc85))
+#loc191 = loc("tmp32"(#loc86))
+#loc192 = loc("tmp34"(#loc87))
+#loc193 = loc("tmp36"(#loc88))
+#loc194 = loc("tmp37"(#loc89))
+#loc195 = loc("tmp38"(#loc90))
diff --git a/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..8088717d2b119b05218586033863c1a6e15d3d4a
--- /dev/null
+++ b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir
@@ -0,0 +1,245 @@
+#blocked = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0)
+#loc68 = loc("in_ptr0"(#loc))
+#loc69 = loc("in_ptr1"(#loc))
+#loc70 = loc("in_ptr2"(#loc))
+#loc71 = loc("in_ptr3"(#loc))
+#loc72 = loc("in_ptr4"(#loc))
+#loc73 = loc("in_ptr5"(#loc))
+#loc74 = loc("out_ptr0"(#loc))
+#loc75 = loc("ynumel"(#loc))
+#loc76 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<-256> : tensor<1024x1xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<12288> : tensor<1024x1xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<128> : tensor<1024x1xi32, #blocked1> loc(#loc1)
+    %cst_2 = arith.constant dense<128> : tensor<1024x1xi32, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<256> : tensor<1024x1xi64, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<32> : tensor<1024x1xi32, #blocked> loc(#loc1)
+    %cst_5 = arith.constant dense<73728> : tensor<1024x1xi32, #blocked1> loc(#loc1)
+    %cst_6 = arith.constant dense<73728> : tensor<1024x1xi32, #blocked> loc(#loc1)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
+    %cst_7 = arith.constant dense<0.000000e+00> : tensor<1024x1xbf16, #blocked> loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %cst_8 = arith.constant dense<9.99999997E-7> : tensor<1024x1xf32, #blocked> loc(#loc1)
+    %cst_9 = arith.constant dense<1.280000e+02> : tensor<1024x1xf32, #blocked> loc(#loc1)
+    %cst_10 = arith.constant dense<0.000000e+00> : tensor<1024x1xf32, #blocked> loc(#loc1)
+    %yoffset = tt.get_program_id y : i32 loc(#loc77)
+    %yoffset_11 = tt.get_program_id z : i32 loc(#loc78)
+    %yoffset_12 = tt.get_num_programs y : i32 loc(#loc79)
+    %yoffset_13 = arith.muli %yoffset_11, %yoffset_12 : i32 loc(#loc80)
+    %yoffset_14 = arith.addi %yoffset, %yoffset_13 : i32 loc(#loc81)
+    %yoffset_15 = arith.muli %yoffset_14, %c1024_i32 : i32 loc(#loc82)
+    %yindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc83)
+    %yindex_16 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc83)
+    %yindex_17 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<1024xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1024x1xi32, #blocked> loc(#loc83)
+    %yindex_18 = tt.expand_dims %yindex_16 {axis = 1 : i32} : tensor<1024xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1024x1xi32, #blocked1> loc(#loc83)
+    %yindex_19 = tt.splat %yoffset_15 : i32 -> tensor<1024x1xi32, #blocked> loc(#loc84)
+    %yindex_20 = tt.splat %yoffset_15 : i32 -> tensor<1024x1xi32, #blocked1> loc(#loc84)
+    %yindex_21 = arith.addi %yindex_19, %yindex_17 : tensor<1024x1xi32, #blocked> loc(#loc84)
+    %yindex_22 = arith.addi %yindex_20, %yindex_18 : tensor<1024x1xi32, #blocked1> loc(#loc84)
+    %ymask = arith.cmpi slt, %yindex_21, %cst_6 : tensor<1024x1xi32, #blocked> loc(#loc85)
+    %ymask_23 = arith.cmpi slt, %yindex_22, %cst_5 : tensor<1024x1xi32, #blocked1> loc(#loc85)
+    %xoffset = tt.get_program_id x : i32 loc(#loc86)
+    %xmask = arith.cmpi slt, %xoffset, %c128_i32 : i32 loc(#loc87)
+    %y1 = arith.divsi %yindex_21, %cst_4 : tensor<1024x1xi32, #blocked> loc(#loc88)
+    %y0 = arith.remsi %yindex_21, %cst_4 : tensor<1024x1xi32, #blocked> loc(#loc89)
+    %tmp4 = arith.extsi %y1 : tensor<1024x1xi32, #blocked> to tensor<1024x1xi64, #blocked> loc(#loc90)
+    %tmp4_24 = arith.cmpi slt, %tmp4, %cst_3 : tensor<1024x1xi64, #blocked> loc(#loc90)
+    %tmp5 = arith.muli %y0, %cst_2 : tensor<1024x1xi32, #blocked> loc(#loc91)
+    %tmp5_25 = tt.splat %xoffset : i32 -> tensor<1024x1xi32, #blocked> loc(#loc137)
+    %tmp5_26 = tt.splat %xoffset : i32 -> tensor<1024x1xi32, #blocked1> loc(#loc137)
+    %tmp5_27 = arith.addi %tmp5_25, %tmp5 : tensor<1024x1xi32, #blocked> loc(#loc92)
+    %tmp5_28 = arith.muli %y1, %cst_0 : tensor<1024x1xi32, #blocked> loc(#loc94)
+    %tmp5_29 = arith.addi %tmp5_27, %tmp5_28 : tensor<1024x1xi32, #blocked> loc(#loc95)
+    %tmp5_30 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x1x!tt.ptr<bf16>, #blocked> loc(#loc96)
+    %tmp5_31 = tt.addptr %tmp5_30, %tmp5_29 : tensor<1024x1x!tt.ptr<bf16>, #blocked>, tensor<1024x1xi32, #blocked> loc(#loc96)
+    %tmp5_32 = tt.splat %xmask : i1 -> tensor<1024x1xi1, #blocked> loc(#loc138)
+    %tmp5_33 = tt.splat %xmask : i1 -> tensor<1024x1xi1, #blocked1> loc(#loc138)
+    %tmp5_34 = arith.andi %tmp4_24, %tmp5_32 : tensor<1024x1xi1, #blocked> loc(#loc97)
+    %tmp5_35 = arith.andi %tmp5_34, %ymask : tensor<1024x1xi1, #blocked> loc(#loc98)
+    %tmp5_36 = tt.load %tmp5_31, %tmp5_35, %cst_7 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr<bf16>, #blocked> loc(#loc99)
+    %tmp5_37 = arith.extf %tmp5_36 : tensor<1024x1xbf16, #blocked> to tensor<1024x1xf32, #blocked> loc(#loc100)
+    %tmp7 = arith.muli %y1, %cst_4 : tensor<1024x1xi32, #blocked> loc(#loc101)
+    %tmp7_38 = arith.addi %y0, %tmp7 : tensor<1024x1xi32, #blocked> loc(#loc102)
+    %tmp7_39 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<1024x1x!tt.ptr<f32>, #blocked> loc(#loc103)
+    %tmp7_40 = tt.addptr %tmp7_39, %tmp7_38 : tensor<1024x1x!tt.ptr<f32>, #blocked>, tensor<1024x1xi32, #blocked> loc(#loc103)
+    %tmp7_41 = tt.load %tmp7_40, %tmp5_35, %cst_10 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr<f32>, #blocked> loc(#loc104)
+    %tmp9 = arith.divf %tmp7_41, %cst_9 : tensor<1024x1xf32, #blocked> loc(#loc105)
+    %tmp11 = arith.addf %tmp9, %cst_8 : tensor<1024x1xf32, #blocked> loc(#loc106)
+    %tmp12 = tt.extern_elementwise %tmp11 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1024x1xf32, #blocked>) -> tensor<1024x1xf32, #blocked> loc(#loc107)
+    %tmp13 = arith.mulf %tmp5_37, %tmp12 : tensor<1024x1xf32, #blocked> loc(#loc108)
+    %tmp14 = tt.addptr %in_ptr2, %xoffset : !tt.ptr<bf16>, i32 loc(#loc109)
+    %tmp14_42 = tt.splat %tmp14 : !tt.ptr<bf16> -> tensor<1024x1x!tt.ptr<bf16>, #blocked> loc(#loc110)
+    %tmp14_43 = tt.load %tmp14_42, %tmp5_35, %cst_7 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr<bf16>, #blocked> loc(#loc110)
+    %tmp14_44 = arith.extf %tmp14_43 : tensor<1024x1xbf16, #blocked> to tensor<1024x1xf32, #blocked> loc(#loc111)
+    %tmp16 = arith.mulf %tmp13, %tmp14_44 : tensor<1024x1xf32, #blocked> loc(#loc112)
+    %tmp20 = arith.cmpi sge, %tmp4, %cst_3 : tensor<1024x1xi64, #blocked> loc(#loc113)
+    %tmp23 = arith.addi %y1, %cst : tensor<1024x1xi32, #blocked> loc(#loc114)
+    %tmp23_45 = arith.muli %tmp23, %cst_0 : tensor<1024x1xi32, #blocked> loc(#loc115)
+    %tmp23_46 = arith.addi %tmp5_27, %tmp23_45 : tensor<1024x1xi32, #blocked> loc(#loc116)
+    %tmp23_47 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<1024x1x!tt.ptr<bf16>, #blocked> loc(#loc117)
+    %tmp23_48 = tt.addptr %tmp23_47, %tmp23_46 : tensor<1024x1x!tt.ptr<bf16>, #blocked>, tensor<1024x1xi32, #blocked> loc(#loc117)
+    %tmp23_49 = arith.andi %tmp20, %tmp5_32 : tensor<1024x1xi1, #blocked> loc(#loc118)
+    %tmp23_50 = arith.andi %tmp23_49, %ymask : tensor<1024x1xi1, #blocked> loc(#loc119)
+    %tmp23_51 = tt.load %tmp23_48, %tmp23_50, %cst_7 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr<bf16>, #blocked> loc(#loc120)
+    %tmp23_52 = arith.extf %tmp23_51 : tensor<1024x1xbf16, #blocked> to tensor<1024x1xf32, #blocked> loc(#loc121)
+    %tmp25 = arith.muli %tmp23, %cst_4 : tensor<1024x1xi32, #blocked> loc(#loc122)
+    %tmp25_53 = arith.addi %y0, %tmp25 : tensor<1024x1xi32, #blocked> loc(#loc123)
+    %tmp25_54 = tt.splat %in_ptr4 : !tt.ptr<f32> -> tensor<1024x1x!tt.ptr<f32>, #blocked> loc(#loc124)
+    %tmp25_55 = tt.addptr %tmp25_54, %tmp25_53 : tensor<1024x1x!tt.ptr<f32>, #blocked>, tensor<1024x1xi32, #blocked> loc(#loc124)
+    %tmp25_56 = tt.load %tmp25_55, %tmp23_50, %cst_10 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr<f32>, #blocked> loc(#loc125)
+    %tmp27 = arith.divf %tmp25_56, %cst_9 : tensor<1024x1xf32, #blocked> loc(#loc126)
+    %tmp29 = arith.addf %tmp27, %cst_8 : tensor<1024x1xf32, #blocked> loc(#loc127)
+    %tmp30 = tt.extern_elementwise %tmp29 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1024x1xf32, #blocked>) -> tensor<1024x1xf32, #blocked> loc(#loc128)
+    %tmp31 = arith.mulf %tmp23_52, %tmp30 : tensor<1024x1xf32, #blocked> loc(#loc129)
+    %tmp32 = tt.addptr %in_ptr5, %xoffset : !tt.ptr<bf16>, i32 loc(#loc130)
+    %tmp32_57 = tt.splat %tmp32 : !tt.ptr<bf16> -> tensor<1024x1x!tt.ptr<bf16>, #blocked> loc(#loc131)
+    %tmp32_58 = tt.load %tmp32_57, %tmp23_50, %cst_7 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr<bf16>, #blocked> loc(#loc131)
+    %tmp32_59 = arith.extf %tmp32_58 : tensor<1024x1xbf16, #blocked> to tensor<1024x1xf32, #blocked> loc(#loc132)
+    %tmp34 = arith.mulf %tmp31, %tmp32_59 : tensor<1024x1xf32, #blocked> loc(#loc133)
+    %tmp37 = arith.select %tmp20, %tmp34, %cst_10 : tensor<1024x1xi1, #blocked>, tensor<1024x1xf32, #blocked> loc(#loc134)
+    %tmp38 = arith.select %tmp4_24, %tmp16, %tmp37 : tensor<1024x1xi1, #blocked>, tensor<1024x1xf32, #blocked> loc(#loc139)
+    %0 = arith.muli %yindex_22, %cst_1 : tensor<1024x1xi32, #blocked1> loc(#loc62)
+    %1 = arith.addi %tmp5_26, %0 : tensor<1024x1xi32, #blocked1> loc(#loc63)
+    %2 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x1x!tt.ptr<bf16>, #blocked1> loc(#loc64)
+    %3 = tt.addptr %2, %1 : tensor<1024x1x!tt.ptr<bf16>, #blocked1>, tensor<1024x1xi32, #blocked1> loc(#loc64)
+    %4 = arith.andi %tmp5_33, %ymask_23 : tensor<1024x1xi1, #blocked1> loc(#loc65)
+    %5 = arith.truncf %tmp38 : tensor<1024x1xf32, #blocked> to tensor<1024x1xbf16, #blocked> loc(#loc66)
+    %6 = ttg.convert_layout %5 : tensor<1024x1xbf16, #blocked> -> tensor<1024x1xbf16, #blocked1> loc(#loc66)
+    tt.store %3, %6, %4 : tensor<1024x1x!tt.ptr<bf16>, #blocked1> loc(#loc66)
+    tt.return loc(#loc67)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4)
+#loc77 = loc("yoffset"(#loc2))
+#loc78 = loc("yoffset"(#loc3))
+#loc79 = loc("yoffset"(#loc4))
+#loc80 = loc("yoffset"(#loc5))
+#loc81 = loc("yoffset"(#loc6))
+#loc82 = loc("yoffset"(#loc7))
+#loc83 = loc("yindex"(#loc8))
+#loc84 = loc("yindex"(#loc9))
+#loc85 = loc("ymask"(#loc10))
+#loc86 = loc("xoffset"(#loc11))
+#loc87 = loc("xmask"(#loc12))
+#loc88 = loc("y1"(#loc13))
+#loc89 = loc("y0"(#loc14))
+#loc90 = loc("tmp4"(#loc15))
+#loc91 = loc("tmp5"(#loc16))
+#loc92 = loc("tmp5"(#loc17))
+#loc93 = loc("xindex"(#loc18))
+#loc94 = loc("tmp5"(#loc19))
+#loc95 = loc("tmp5"(#loc20))
+#loc96 = loc("tmp5"(#loc21))
+#loc97 = loc("tmp5"(#loc22))
+#loc98 = loc("tmp5"(#loc23))
+#loc99 = loc("tmp5"(#loc24))
+#loc100 = loc("tmp5"(#loc25))
+#loc101 = loc("tmp7"(#loc26))
+#loc102 = loc("tmp7"(#loc27))
+#loc103 = loc("tmp7"(#loc28))
+#loc104 = loc("tmp7"(#loc29))
+#loc105 = loc("tmp9"(#loc30))
+#loc106 = loc("tmp11"(#loc31))
+#loc107 = loc("tmp12"(#loc32))
+#loc108 = loc("tmp13"(#loc33))
+#loc109 = loc("tmp14"(#loc34))
+#loc110 = loc("tmp14"(#loc35))
+#loc111 = loc("tmp14"(#loc36))
+#loc112 = loc("tmp16"(#loc37))
+#loc113 = loc("tmp20"(#loc38))
+#loc114 = loc("tmp23"(#loc39))
+#loc115 = loc("tmp23"(#loc40))
+#loc116 = loc("tmp23"(#loc41))
+#loc117 = loc("tmp23"(#loc42))
+#loc118 = loc("tmp23"(#loc43))
+#loc119 = loc("tmp23"(#loc44))
+#loc120 = loc("tmp23"(#loc45))
+#loc121 = loc("tmp23"(#loc46))
+#loc122 = loc("tmp25"(#loc47))
+#loc123 = loc("tmp25"(#loc48))
+#loc124 = loc("tmp25"(#loc49))
+#loc125 = loc("tmp25"(#loc50))
+#loc126 = loc("tmp27"(#loc51))
+#loc127 = loc("tmp29"(#loc52))
+#loc128 = loc("tmp30"(#loc53))
+#loc129 = loc("tmp31"(#loc54))
+#loc130 = loc("tmp32"(#loc55))
+#loc131 = loc("tmp32"(#loc56))
+#loc132 = loc("tmp32"(#loc57))
+#loc133 = loc("tmp34"(#loc58))
+#loc134 = loc("tmp37"(#loc59))
+#loc135 = loc("tmp38"(#loc60))
+#loc136 = loc("tmp19"(#loc61))
+#loc137 = loc(fused[#loc92, #loc93])
+#loc138 = loc(fused[#loc97, #loc87])
+#loc139 = loc(fused[#loc135, #loc136])
diff --git a/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttir b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..a436cfae703c216844493123f1bec2ef26eec285
--- /dev/null
+++ b/triton/B332B73IRJVPV7KEXQCJ4B6VIXVZKZ6STDLWEXIOLGVNUOTPYFEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttir
@@ -0,0 +1,235 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0)
+#loc69 = loc("in_ptr0"(#loc))
+#loc70 = loc("in_ptr1"(#loc))
+#loc71 = loc("in_ptr2"(#loc))
+#loc72 = loc("in_ptr3"(#loc))
+#loc73 = loc("in_ptr4"(#loc))
+#loc74 = loc("in_ptr5"(#loc))
+#loc75 = loc("out_ptr0"(#loc))
+#loc76 = loc("ynumel"(#loc))
+#loc77 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xmask = arith.constant 128 : i32 loc(#loc78)
+    %cst = arith.constant dense<0.000000e+00> : tensor<1024x1xbf16> loc(#loc2)
+    %cst_0 = arith.constant dense<-256> : tensor<1024x1xi32> loc(#loc2)
+    %cst_1 = arith.constant dense<9.99999997E-7> : tensor<1024x1xf32> loc(#loc2)
+    %cst_2 = arith.constant dense<1.280000e+02> : tensor<1024x1xf32> loc(#loc2)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<1024x1xf32> loc(#loc2)
+    %cst_4 = arith.constant dense<12288> : tensor<1024x1xi32> loc(#loc2)
+    %cst_5 = arith.constant dense<128> : tensor<1024x1xi32> loc(#loc2)
+    %cst_6 = arith.constant dense<256> : tensor<1024x1xi64> loc(#loc2)
+    %cst_7 = arith.constant dense<32> : tensor<1024x1xi32> loc(#loc2)
+    %ymask = arith.constant dense<73728> : tensor<1024x1xi32> loc(#loc79)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc2)
+    %yoffset = tt.get_program_id y : i32 loc(#loc80)
+    %yoffset_8 = tt.get_program_id z : i32 loc(#loc81)
+    %yoffset_9 = tt.get_num_programs y : i32 loc(#loc82)
+    %yoffset_10 = arith.muli %yoffset_8, %yoffset_9 : i32 loc(#loc83)
+    %yoffset_11 = arith.addi %yoffset, %yoffset_10 : i32 loc(#loc84)
+    %yoffset_12 = arith.muli %yoffset_11, %c1024_i32 : i32 loc(#loc85)
+    %yindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc86)
+    %yindex_13 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<1024xi32> -> tensor<1024x1xi32> loc(#loc87)
+    %yindex_14 = tt.splat %yoffset_12 : i32 -> tensor<1024x1xi32> loc(#loc88)
+    %yindex_15 = arith.addi %yindex_14, %yindex_13 : tensor<1024x1xi32> loc(#loc88)
+    %ymask_16 = arith.cmpi slt, %yindex_15, %ymask : tensor<1024x1xi32> loc(#loc79)
+    %xoffset = tt.get_program_id x : i32 loc(#loc89)
+    %xmask_17 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc78)
+    %y1 = arith.divsi %yindex_15, %cst_7 : tensor<1024x1xi32> loc(#loc90)
+    %y0 = arith.remsi %yindex_15, %cst_7 : tensor<1024x1xi32> loc(#loc91)
+    %tmp4 = arith.extsi %y1 : tensor<1024x1xi32> to tensor<1024x1xi64> loc(#loc92)
+    %tmp4_18 = arith.cmpi slt, %tmp4, %cst_6 : tensor<1024x1xi64> loc(#loc92)
+    %tmp5 = arith.muli %y0, %cst_5 : tensor<1024x1xi32> loc(#loc93)
+    %tmp5_19 = tt.splat %xoffset : i32 -> tensor<1024x1xi32> loc(#loc139)
+    %tmp5_20 = arith.addi %tmp5_19, %tmp5 : tensor<1024x1xi32> loc(#loc94)
+    %tmp5_21 = arith.muli %y1, %cst_4 : tensor<1024x1xi32> loc(#loc96)
+    %tmp5_22 = arith.addi %tmp5_20, %tmp5_21 : tensor<1024x1xi32> loc(#loc97)
+    %tmp5_23 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x1x!tt.ptr<bf16>> loc(#loc98)
+    %tmp5_24 = tt.addptr %tmp5_23, %tmp5_22 : tensor<1024x1x!tt.ptr<bf16>>, tensor<1024x1xi32> loc(#loc98)
+    %tmp5_25 = tt.splat %xmask_17 : i1 -> tensor<1024x1xi1> loc(#loc140)
+    %tmp5_26 = arith.andi %tmp4_18, %tmp5_25 : tensor<1024x1xi1> loc(#loc99)
+    %tmp5_27 = arith.andi %tmp5_26, %ymask_16 : tensor<1024x1xi1> loc(#loc100)
+    %tmp5_28 = tt.load %tmp5_24, %tmp5_27, %cst evictionPolicy = evict_last : tensor<1024x1x!tt.ptr<bf16>> loc(#loc101)
+    %tmp5_29 = arith.extf %tmp5_28 : tensor<1024x1xbf16> to tensor<1024x1xf32> loc(#loc102)
+    %tmp7 = arith.muli %y1, %cst_7 : tensor<1024x1xi32> loc(#loc103)
+    %tmp7_30 = arith.addi %y0, %tmp7 : tensor<1024x1xi32> loc(#loc104)
+    %tmp7_31 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<1024x1x!tt.ptr<f32>> loc(#loc105)
+    %tmp7_32 = tt.addptr %tmp7_31, %tmp7_30 : tensor<1024x1x!tt.ptr<f32>>, tensor<1024x1xi32> loc(#loc105)
+    %tmp7_33 = tt.load %tmp7_32, %tmp5_27, %cst_3 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr<f32>> loc(#loc106)
+    %tmp9 = arith.divf %tmp7_33, %cst_2 : tensor<1024x1xf32> loc(#loc107)
+    %tmp11 = arith.addf %tmp9, %cst_1 : tensor<1024x1xf32> loc(#loc108)
+    %tmp12 = tt.extern_elementwise %tmp11 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1024x1xf32>) -> tensor<1024x1xf32> loc(#loc109)
+    %tmp13 = arith.mulf %tmp5_29, %tmp12 : tensor<1024x1xf32> loc(#loc110)
+    %tmp14 = tt.addptr %in_ptr2, %xoffset : !tt.ptr<bf16>, i32 loc(#loc111)
+    %tmp14_34 = tt.splat %tmp14 : !tt.ptr<bf16> -> tensor<1024x1x!tt.ptr<bf16>> loc(#loc111)
+    %tmp14_35 = tt.load %tmp14_34, %tmp5_27, %cst evictionPolicy = evict_last : tensor<1024x1x!tt.ptr<bf16>> loc(#loc112)
+    %tmp14_36 = arith.extf %tmp14_35 : tensor<1024x1xbf16> to tensor<1024x1xf32> loc(#loc113)
+    %tmp16 = arith.mulf %tmp13, %tmp14_36 : tensor<1024x1xf32> loc(#loc114)
+    %tmp20 = arith.cmpi sge, %tmp4, %cst_6 : tensor<1024x1xi64> loc(#loc115)
+    %tmp23 = arith.addi %y1, %cst_0 : tensor<1024x1xi32> loc(#loc116)
+    %tmp23_37 = arith.muli %tmp23, %cst_4 : tensor<1024x1xi32> loc(#loc117)
+    %tmp23_38 = arith.addi %tmp5_20, %tmp23_37 : tensor<1024x1xi32> loc(#loc118)
+    %tmp23_39 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<1024x1x!tt.ptr<bf16>> loc(#loc119)
+    %tmp23_40 = tt.addptr %tmp23_39, %tmp23_38 : tensor<1024x1x!tt.ptr<bf16>>, tensor<1024x1xi32> loc(#loc119)
+    %tmp23_41 = arith.andi %tmp20, %tmp5_25 : tensor<1024x1xi1> loc(#loc120)
+    %tmp23_42 = arith.andi %tmp23_41, %ymask_16 : tensor<1024x1xi1> loc(#loc121)
+    %tmp23_43 = tt.load %tmp23_40, %tmp23_42, %cst evictionPolicy = evict_last : tensor<1024x1x!tt.ptr<bf16>> loc(#loc122)
+    %tmp23_44 = arith.extf %tmp23_43 : tensor<1024x1xbf16> to tensor<1024x1xf32> loc(#loc123)
+    %tmp25 = arith.muli %tmp23, %cst_7 : tensor<1024x1xi32> loc(#loc124)
+    %tmp25_45 = arith.addi %y0, %tmp25 : tensor<1024x1xi32> loc(#loc125)
+    %tmp25_46 = tt.splat %in_ptr4 : !tt.ptr<f32> -> tensor<1024x1x!tt.ptr<f32>> loc(#loc126)
+    %tmp25_47 = tt.addptr %tmp25_46, %tmp25_45 : tensor<1024x1x!tt.ptr<f32>>, tensor<1024x1xi32> loc(#loc126)
+    %tmp25_48 = tt.load %tmp25_47, %tmp23_42, %cst_3 evictionPolicy = evict_last : tensor<1024x1x!tt.ptr<f32>> loc(#loc127)
+    %tmp27 = arith.divf %tmp25_48, %cst_2 : tensor<1024x1xf32> loc(#loc128)
+    %tmp29 = arith.addf %tmp27, %cst_1 : tensor<1024x1xf32> loc(#loc129)
+    %tmp30 = tt.extern_elementwise %tmp29 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1024x1xf32>) -> tensor<1024x1xf32> loc(#loc130)
+    %tmp31 = arith.mulf %tmp23_44, %tmp30 : tensor<1024x1xf32> loc(#loc131)
+    %tmp32 = tt.addptr %in_ptr5, %xoffset : !tt.ptr<bf16>, i32 loc(#loc132)
+    %tmp32_49 = tt.splat %tmp32 : !tt.ptr<bf16> -> tensor<1024x1x!tt.ptr<bf16>> loc(#loc132)
+    %tmp32_50 = tt.load %tmp32_49, %tmp23_42, %cst evictionPolicy = evict_last : tensor<1024x1x!tt.ptr<bf16>> loc(#loc133)
+    %tmp32_51 = arith.extf %tmp32_50 : tensor<1024x1xbf16> to tensor<1024x1xf32> loc(#loc134)
+    %tmp34 = arith.mulf %tmp31, %tmp32_51 : tensor<1024x1xf32> loc(#loc135)
+    %tmp37 = arith.select %tmp20, %tmp34, %cst_3 : tensor<1024x1xi1>, tensor<1024x1xf32> loc(#loc136)
+    %tmp38 = arith.select %tmp4_18, %tmp16, %tmp37 : tensor<1024x1xi1>, tensor<1024x1xf32> loc(#loc141)
+    %0 = arith.muli %yindex_15, %cst_5 : tensor<1024x1xi32> loc(#loc63)
+    %1 = arith.addi %tmp5_19, %0 : tensor<1024x1xi32> loc(#loc64)
+    %2 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x1x!tt.ptr<bf16>> loc(#loc65)
+    %3 = tt.addptr %2, %1 : tensor<1024x1x!tt.ptr<bf16>>, tensor<1024x1xi32> loc(#loc65)
+    %4 = arith.andi %tmp5_25, %ymask_16 : tensor<1024x1xi1> loc(#loc66)
+    %5 = arith.truncf %tmp38 : tensor<1024x1xf32> to tensor<1024x1xbf16> loc(#loc67)
+    tt.store %3, %5, %4 : tensor<1024x1x!tt.ptr<bf16>> loc(#loc67)
+    tt.return loc(#loc68)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21)
+#loc2 = loc(unknown)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:36)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4)
+#loc78 = loc("xmask"(#loc1))
+#loc79 = loc("ymask"(#loc3))
+#loc80 = loc("yoffset"(#loc4))
+#loc81 = loc("yoffset"(#loc5))
+#loc82 = loc("yoffset"(#loc6))
+#loc83 = loc("yoffset"(#loc7))
+#loc84 = loc("yoffset"(#loc8))
+#loc85 = loc("yoffset"(#loc9))
+#loc86 = loc("yindex"(#loc10))
+#loc87 = loc("yindex"(#loc11))
+#loc88 = loc("yindex"(#loc12))
+#loc89 = loc("xoffset"(#loc13))
+#loc90 = loc("y1"(#loc14))
+#loc91 = loc("y0"(#loc15))
+#loc92 = loc("tmp4"(#loc16))
+#loc93 = loc("tmp5"(#loc17))
+#loc94 = loc("tmp5"(#loc18))
+#loc95 = loc("xindex"(#loc19))
+#loc96 = loc("tmp5"(#loc20))
+#loc97 = loc("tmp5"(#loc21))
+#loc98 = loc("tmp5"(#loc22))
+#loc99 = loc("tmp5"(#loc23))
+#loc100 = loc("tmp5"(#loc24))
+#loc101 = loc("tmp5"(#loc25))
+#loc102 = loc("tmp5"(#loc26))
+#loc103 = loc("tmp7"(#loc27))
+#loc104 = loc("tmp7"(#loc28))
+#loc105 = loc("tmp7"(#loc29))
+#loc106 = loc("tmp7"(#loc30))
+#loc107 = loc("tmp9"(#loc31))
+#loc108 = loc("tmp11"(#loc32))
+#loc109 = loc("tmp12"(#loc33))
+#loc110 = loc("tmp13"(#loc34))
+#loc111 = loc("tmp14"(#loc35))
+#loc112 = loc("tmp14"(#loc36))
+#loc113 = loc("tmp14"(#loc37))
+#loc114 = loc("tmp16"(#loc38))
+#loc115 = loc("tmp20"(#loc39))
+#loc116 = loc("tmp23"(#loc40))
+#loc117 = loc("tmp23"(#loc41))
+#loc118 = loc("tmp23"(#loc42))
+#loc119 = loc("tmp23"(#loc43))
+#loc120 = loc("tmp23"(#loc44))
+#loc121 = loc("tmp23"(#loc45))
+#loc122 = loc("tmp23"(#loc46))
+#loc123 = loc("tmp23"(#loc47))
+#loc124 = loc("tmp25"(#loc48))
+#loc125 = loc("tmp25"(#loc49))
+#loc126 = loc("tmp25"(#loc50))
+#loc127 = loc("tmp25"(#loc51))
+#loc128 = loc("tmp27"(#loc52))
+#loc129 = loc("tmp29"(#loc53))
+#loc130 = loc("tmp30"(#loc54))
+#loc131 = loc("tmp31"(#loc55))
+#loc132 = loc("tmp32"(#loc56))
+#loc133 = loc("tmp32"(#loc57))
+#loc134 = loc("tmp32"(#loc58))
+#loc135 = loc("tmp34"(#loc59))
+#loc136 = loc("tmp37"(#loc60))
+#loc137 = loc("tmp38"(#loc61))
+#loc138 = loc("tmp19"(#loc62))
+#loc139 = loc(fused[#loc94, #loc95])
+#loc140 = loc(fused[#loc99, #loc78])
+#loc141 = loc(fused[#loc137, #loc138])
diff --git a/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..84d8ce34d74e23c71524ab7c2b0da86972d0eef7
--- /dev/null
+++ b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json"}}
\ No newline at end of file
diff --git a/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..b1639b04b5f54f687154d6ac1d2ee4b09d1c59e3
Binary files /dev/null and b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin differ
diff --git a/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..c2dbcb990bca7cd4df3f246e84d379e2f2d85d47
--- /dev/null
+++ b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
@@ -0,0 +1 @@
+{"hash": "096ece262b42d5675d174abdb1346c7821319ff6915792a636b52f871e5e652f", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 2048, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0"}
\ No newline at end of file
diff --git a/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..84b98ec516fe8bb215072712ba024e3414b134a1
--- /dev/null
+++ b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir
@@ -0,0 +1,779 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 {
+  %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8
+  %13 = shl i32 %12, 6, !dbg !9
+  %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %15 = and i32 %14, 126, !dbg !10
+  %16 = lshr exact i32 %15, 1, !dbg !10
+  %17 = or disjoint i32 %16, %13, !dbg !11
+  %18 = shl nuw nsw i32 %14, 2, !dbg !12
+  %19 = and i32 %18, 4, !dbg !12
+  %20 = sdiv i32 %17, 32, !dbg !13
+  %21 = shl i32 %17, 7
+  %22 = shl i32 %20, 15
+  %23 = add i32 %22, %21
+  %24 = add i32 %23, 4096
+  %25 = zext nneg i32 %19 to i64, !dbg !14
+  br label %26, !dbg !14
+
+26:                                               ; preds = %11, %26
+  %indvars.iv = phi i64 [ 0, %11 ], [ %indvars.iv.next, %26 ]
+  %27 = phi <8 x float> [ zeroinitializer, %11 ], [ %59, %26 ]
+  %28 = trunc nuw nsw i64 %indvars.iv to i32, !dbg !15
+  %29 = or disjoint i32 %19, %28, !dbg !15
+  %30 = add i32 %24, %29, !dbg !15
+  %31 = sext i32 %30 to i64, !dbg !16
+  %32 = getelementptr bfloat, ptr addrspace(1) %2, i64 %31, !dbg !16
+  %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17
+  %34 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %32, i64 %33, i1 true) #6, !dbg !17
+  %35 = extractvalue { i32, i32 } %34, 0, !dbg !17
+  %36 = bitcast i32 %35 to <2 x bfloat>, !dbg !17
+  %37 = extractvalue { i32, i32 } %34, 1, !dbg !17
+  %38 = bitcast i32 %37 to <2 x bfloat>, !dbg !17
+  %39 = add i32 %23, %29, !dbg !18
+  %40 = sext i32 %39 to i64, !dbg !19
+  %41 = getelementptr bfloat, ptr addrspace(1) %2, i64 %40, !dbg !19
+  %42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !20
+  %43 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %41, i64 %42, i1 true) #6, !dbg !20
+  %44 = extractvalue { i32, i32 } %43, 0, !dbg !20
+  %45 = bitcast i32 %44 to <2 x bfloat>, !dbg !20
+  %46 = extractvalue { i32, i32 } %43, 1, !dbg !20
+  %47 = bitcast i32 %46 to <2 x bfloat>, !dbg !20
+  %48 = shufflevector <2 x bfloat> %45, <2 x bfloat> %47, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !21
+  %49 = shufflevector <2 x bfloat> %36, <2 x bfloat> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !21
+  %50 = shufflevector <8 x bfloat> %48, <8 x bfloat> %49, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 poison, i32 poison, i32 poison>, !dbg !21
+  %51 = shufflevector <2 x bfloat> %36, <2 x bfloat> poison, <8 x i32> <i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !21
+  %52 = shufflevector <8 x bfloat> %50, <8 x bfloat> %51, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 9, i32 poison, i32 poison>, !dbg !21
+  %53 = shufflevector <2 x bfloat> %38, <2 x bfloat> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !21
+  %54 = shufflevector <8 x bfloat> %52, <8 x bfloat> %53, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 poison>, !dbg !21
+  %55 = shufflevector <2 x bfloat> %38, <2 x bfloat> poison, <8 x i32> <i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !21
+  %56 = shufflevector <8 x bfloat> %54, <8 x bfloat> %55, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 9>, !dbg !21
+  %57 = fpext <8 x bfloat> %56 to <8 x float>, !dbg !21
+  %58 = fmul <8 x float> %57, %57, !dbg !22
+  %59 = fadd <8 x float> %27, %58, !dbg !23
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8, !dbg !14
+  %60 = icmp samesign ult i64 %indvars.iv, 120, !dbg !14
+  br i1 %60, label %26, label %61, !dbg !14
+
+61:                                               ; preds = %26
+  %62 = and i32 %14, 63, !dbg !10
+  %63 = or disjoint i32 %13, %62, !dbg !11
+  %64 = and i32 %14, 64, !dbg !12
+  %65 = sdiv i32 %63, 32, !dbg !13
+  %shift = shufflevector <8 x float> %59, <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>, !dbg !24
+  %foldExtExtBinop = fadd <8 x float> %59, %shift, !dbg !24
+  %shift98 = shufflevector <8 x float> %59, <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 6, i32 poison, i32 poison, i32 poison>, !dbg !24
+  %foldExtExtBinop99 = fadd <8 x float> %shift98, %foldExtExtBinop, !dbg !24
+  %shift101 = shufflevector <8 x float> %59, <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison, i32 poison, i32 poison>, !dbg !24
+  %foldExtExtBinop102 = fadd <8 x float> %shift101, %foldExtExtBinop99, !dbg !24
+  %66 = extractelement <8 x float> %foldExtExtBinop102, i64 4, !dbg !24
+  %67 = bitcast float %66 to i32, !dbg !27
+  %68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 1, i32 31), !dbg !27
+  %69 = bitcast i32 %68 to float, !dbg !27
+  %70 = fadd float %66, %69, !dbg !24
+  %shift104 = shufflevector <8 x float> %59, <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !30
+  %foldExtExtBinop105 = fadd <8 x float> %59, %shift104, !dbg !30
+  %shift107 = shufflevector <8 x float> %59, <8 x float> poison, <8 x i32> <i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !30
+  %foldExtExtBinop108 = fadd <8 x float> %shift107, %foldExtExtBinop105, !dbg !30
+  %shift110 = shufflevector <8 x float> %59, <8 x float> poison, <8 x i32> <i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !30
+  %foldExtExtBinop111 = fadd <8 x float> %shift110, %foldExtExtBinop108, !dbg !30
+  %71 = extractelement <8 x float> %foldExtExtBinop111, i64 0, !dbg !30
+  %72 = bitcast float %71 to i32, !dbg !31
+  %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 1, i32 31), !dbg !31
+  %74 = bitcast i32 %73 to float, !dbg !31
+  %75 = fadd float %71, %74, !dbg !30
+  %76 = shl i32 %20, 7, !dbg !33
+  %77 = tail call float @llvm.nvvm.div.full(float %75, float 1.280000e+02), !dbg !34
+  %78 = fadd float %77, 0x3EB0C6F7A0000000, !dbg !35
+  %79 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36
+  %.not.i = icmp eq i32 %79, 0, !dbg !36
+  br i1 %.not.i, label %82, label %80, !dbg !36
+
+80:                                               ; preds = %61
+  %81 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %78), !dbg !36
+  br label %__nv_rsqrtf.exit, !dbg !36
+
+82:                                               ; preds = %61
+  %83 = tail call float @llvm.nvvm.rsqrt.approx.f(float %78), !dbg !36
+  br label %__nv_rsqrtf.exit, !dbg !36
+
+__nv_rsqrtf.exit:                                 ; preds = %80, %82
+  %.0.i = phi float [ %81, %80 ], [ %83, %82 ], !dbg !36
+  %84 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36
+  %85 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36
+  %86 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36
+  %.not.i28 = icmp eq i32 %86, 0, !dbg !36
+  br i1 %.not.i28, label %89, label %87, !dbg !36
+
+87:                                               ; preds = %__nv_rsqrtf.exit
+  %88 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %78), !dbg !36
+  br label %__nv_rsqrtf.exit30, !dbg !36
+
+89:                                               ; preds = %__nv_rsqrtf.exit
+  %90 = tail call float @llvm.nvvm.rsqrt.approx.f(float %78), !dbg !36
+  br label %__nv_rsqrtf.exit30, !dbg !36
+
+__nv_rsqrtf.exit30:                               ; preds = %87, %89
+  %.0.i29 = phi float [ %88, %87 ], [ %90, %89 ], !dbg !36
+  %91 = shl nuw nsw i32 %15, 1, !dbg !37
+  %92 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %91, !dbg !37
+  store float %.0.i, ptr addrspace(3) %92, align 4, !dbg !37
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37
+  %93 = shl nuw nsw i32 %62, 2, !dbg !37
+  %94 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %93, !dbg !37
+  %95 = load float, ptr addrspace(3) %94, align 4, !dbg !37
+  %96 = tail call float @llvm.nvvm.div.full(float %70, float 1.280000e+02), !dbg !38
+  %97 = fadd float %96, 0x3EB0C6F7A0000000, !dbg !39
+  %98 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40
+  %.not.i31 = icmp eq i32 %98, 0, !dbg !40
+  br i1 %.not.i31, label %101, label %99, !dbg !40
+
+99:                                               ; preds = %__nv_rsqrtf.exit30
+  %100 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %97), !dbg !40
+  br label %__nv_rsqrtf.exit33, !dbg !40
+
+101:                                              ; preds = %__nv_rsqrtf.exit30
+  %102 = tail call float @llvm.nvvm.rsqrt.approx.f(float %97), !dbg !40
+  br label %__nv_rsqrtf.exit33, !dbg !40
+
+__nv_rsqrtf.exit33:                               ; preds = %99, %101
+  %.0.i32 = phi float [ %100, %99 ], [ %102, %101 ], !dbg !40
+  %103 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40
+  %104 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40
+  %105 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40
+  %.not.i40 = icmp eq i32 %105, 0, !dbg !40
+  br i1 %.not.i40, label %108, label %106, !dbg !40
+
+106:                                              ; preds = %__nv_rsqrtf.exit33
+  %107 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %97), !dbg !40
+  br label %__nv_rsqrtf.exit42, !dbg !40
+
+108:                                              ; preds = %__nv_rsqrtf.exit33
+  %109 = tail call float @llvm.nvvm.rsqrt.approx.f(float %97), !dbg !40
+  br label %__nv_rsqrtf.exit42, !dbg !40
+
+__nv_rsqrtf.exit42:                               ; preds = %106, %108
+  %.0.i41 = phi float [ %107, %106 ], [ %109, %108 ], !dbg !40
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !41
+  store float %.0.i32, ptr addrspace(3) %92, align 4, !dbg !41
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !41
+  %110 = load float, ptr addrspace(3) %94, align 4, !dbg !41
+  %111 = shl i32 %17, 7, !dbg !42
+  %112 = shl nuw nsw i32 %14, 3
+  %113 = and i32 %112, 120
+  %114 = and i32 %18, 384
+  %115 = and i32 %14, 16
+  %116 = icmp eq i32 %115, 0
+  %117 = select i1 %116, i32 0, i32 1032
+  %118 = or disjoint i32 %113, %114
+  %119 = xor i32 %118, %117
+  %120 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %119
+  %121 = getelementptr inbounds nuw i8, ptr addrspace(3) %120, i32 512
+  %122 = shl nuw nsw i32 %14, 4
+  %123 = and i32 %122, 112
+  %124 = and i32 %112, 896
+  %125 = and i32 %14, 8
+  %126 = icmp eq i32 %125, 0
+  %127 = select i1 %126, i32 0, i32 1032
+  %128 = or disjoint i32 %123, %124
+  %129 = or disjoint i32 %128, %127
+  %130 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %129
+  %131 = xor i32 %129, 8
+  %132 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %131
+  %133 = icmp eq i32 %64, 0
+  %134 = shl i32 %63, 7
+  %135 = shl i32 %65, 15
+  %136 = add i32 %135, %134
+  %137 = icmp ne i32 %64, 0
+  %138 = add i32 %136, 4097
+  %139 = add i32 %136, 4099
+  %140 = add i32 %136, 4101
+  %141 = add i32 %136, 4103
+  %142 = add i32 %136, 4096
+  %143 = add i32 %136, 4098
+  %144 = add i32 %136, 4100
+  %145 = add i32 %136, 4102
+  %146 = select i1 %116, i32 0, i32 516
+  %147 = or disjoint i32 %114, %146
+  %148 = or disjoint i32 %147, %113
+  %149 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %148
+  %150 = xor i32 %148, 4
+  %151 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %150
+  %152 = and i32 %18, 124
+  %153 = and i32 %14, 32
+  %154 = icmp eq i32 %153, 0
+  %155 = select i1 %154, i32 0, i32 516
+  %156 = shl nuw nsw i32 %64, 1
+  %157 = xor i32 %155, %152
+  %158 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %156
+  %159 = getelementptr inbounds nuw i8, ptr addrspace(3) %158, i32 %157
+  %160 = getelementptr inbounds nuw i8, ptr addrspace(3) %159, i32 256
+  %161 = sext i32 %145 to i64, !dbg !43
+  %162 = sext i32 %144 to i64, !dbg !43
+  %163 = sext i32 %143 to i64, !dbg !43
+  %164 = sext i32 %142 to i64, !dbg !43
+  %165 = sext i32 %141 to i64, !dbg !43
+  %166 = sext i32 %140 to i64, !dbg !43
+  %167 = sext i32 %139 to i64, !dbg !43
+  %168 = sext i32 %138 to i64, !dbg !43
+  %169 = sext i32 %136 to i64, !dbg !43
+  %170 = sext i32 %76 to i64, !dbg !43
+  %171 = sext i32 %111 to i64, !dbg !43
+  %invariant.gep = getelementptr bfloat, ptr addrspace(1) %2, i64 %169, !dbg !43
+  %invariant.gep60 = getelementptr bfloat, ptr addrspace(1) %2, i64 %169, !dbg !43
+  %invariant.gep62 = getelementptr bfloat, ptr addrspace(1) %2, i64 %169, !dbg !43
+  %invariant.gep64 = getelementptr bfloat, ptr addrspace(1) %2, i64 %169, !dbg !43
+  %invariant.gep66 = getelementptr bfloat, ptr addrspace(1) %2, i64 %169, !dbg !43
+  %invariant.gep68 = getelementptr bfloat, ptr addrspace(1) %2, i64 %169, !dbg !43
+  %invariant.gep70 = getelementptr bfloat, ptr addrspace(1) %2, i64 %169, !dbg !43
+  %invariant.gep72 = getelementptr bfloat, ptr addrspace(1) %2, i64 %169, !dbg !43
+  %invariant.gep74 = getelementptr bfloat, ptr addrspace(1) %2, i64 %168, !dbg !43
+  %invariant.gep76 = getelementptr bfloat, ptr addrspace(1) %2, i64 %167, !dbg !43
+  %invariant.gep78 = getelementptr bfloat, ptr addrspace(1) %2, i64 %166, !dbg !43
+  %invariant.gep80 = getelementptr bfloat, ptr addrspace(1) %2, i64 %165, !dbg !43
+  %invariant.gep82 = getelementptr bfloat, ptr addrspace(1) %2, i64 %164, !dbg !43
+  %invariant.gep84 = getelementptr bfloat, ptr addrspace(1) %2, i64 %163, !dbg !43
+  %invariant.gep86 = getelementptr bfloat, ptr addrspace(1) %2, i64 %162, !dbg !43
+  %invariant.gep88 = getelementptr bfloat, ptr addrspace(1) %2, i64 %161, !dbg !43
+  %172 = insertelement <2 x i1> poison, i1 %133, i64 0, !dbg !44
+  %173 = shufflevector <2 x i1> %172, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !44
+  %174 = insertelement <2 x float> poison, float %95, i64 0, !dbg !37
+  %175 = shufflevector <2 x float> %174, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !37
+  %176 = insertelement <2 x float> poison, float %110, i64 0, !dbg !41
+  %177 = shufflevector <2 x float> %176, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !41
+  br label %178, !dbg !43
+
+178:                                              ; preds = %__nv_rsqrtf.exit42, %178
+  %indvars.iv51 = phi i64 [ 0, %__nv_rsqrtf.exit42 ], [ %indvars.iv.next52, %178 ]
+  %179 = or disjoint i64 %indvars.iv51, %25, !dbg !45
+  %180 = or disjoint i64 %indvars.iv51, 2, !dbg !46
+  %181 = or disjoint i64 %indvars.iv51, 4, !dbg !46
+  %182 = or disjoint i64 %indvars.iv51, 6, !dbg !46
+  %183 = trunc nuw nsw i64 %179 to i32, !dbg !47
+  %184 = add i32 %23, %183, !dbg !47
+  %185 = sext i32 %184 to i64, !dbg !48
+  %186 = getelementptr bfloat, ptr addrspace(1) %2, i64 %185, !dbg !48
+  %187 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !49
+  %188 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %186, i64 %187, i1 true) #6, !dbg !49
+  %189 = extractvalue { i32, i32 } %188, 0, !dbg !49
+  %190 = bitcast i32 %189 to <2 x bfloat>, !dbg !49
+  %191 = extractvalue { i32, i32 } %188, 1, !dbg !49
+  %192 = bitcast i32 %191 to <2 x bfloat>, !dbg !49
+  %193 = extractelement <2 x bfloat> %190, i64 0, !dbg !49
+  %194 = extractelement <2 x bfloat> %190, i64 1, !dbg !49
+  %195 = extractelement <2 x bfloat> %192, i64 0, !dbg !49
+  %196 = extractelement <2 x bfloat> %192, i64 1, !dbg !49
+  %197 = fpext bfloat %193 to float, !dbg !50
+  %198 = fpext bfloat %194 to float, !dbg !50
+  %199 = fpext bfloat %195 to float, !dbg !50
+  %200 = fpext bfloat %196 to float, !dbg !50
+  %201 = getelementptr bfloat, ptr addrspace(1) %3, i64 %179, !dbg !51
+  %202 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !52
+  %203 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %201, i64 %202, i1 true) #6, !dbg !52
+  %204 = extractvalue { i32, i32 } %203, 0, !dbg !52
+  %205 = bitcast i32 %204 to <2 x bfloat>, !dbg !52
+  %206 = extractvalue { i32, i32 } %203, 1, !dbg !52
+  %207 = bitcast i32 %206 to <2 x bfloat>, !dbg !52
+  %208 = extractelement <2 x bfloat> %205, i64 0, !dbg !52
+  %209 = extractelement <2 x bfloat> %205, i64 1, !dbg !52
+  %210 = extractelement <2 x bfloat> %207, i64 0, !dbg !52
+  %211 = extractelement <2 x bfloat> %207, i64 1, !dbg !52
+  %212 = fpext bfloat %208 to float, !dbg !53
+  %213 = fpext bfloat %209 to float, !dbg !53
+  %214 = fpext bfloat %210 to float, !dbg !53
+  %215 = fpext bfloat %211 to float, !dbg !53
+  %216 = add nuw nsw i64 %179, %170, !dbg !54
+  %217 = getelementptr float, ptr addrspace(1) %4, i64 %216, !dbg !55
+  %218 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !56
+  %219 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %217, i64 %218, i1 true) #6, !dbg !56
+  %220 = extractvalue { i32, i32, i32, i32 } %219, 0, !dbg !56
+  %221 = extractvalue { i32, i32, i32, i32 } %219, 1, !dbg !56
+  %222 = extractvalue { i32, i32, i32, i32 } %219, 2, !dbg !56
+  %223 = extractvalue { i32, i32, i32, i32 } %219, 3, !dbg !56
+  %224 = bitcast i32 %220 to float, !dbg !56
+  %225 = bitcast i32 %221 to float, !dbg !56
+  %226 = bitcast i32 %222 to float, !dbg !56
+  %227 = bitcast i32 %223 to float, !dbg !56
+  %228 = getelementptr float, ptr addrspace(1) %5, i64 %216, !dbg !57
+  %229 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58
+  %230 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %228, i64 %229, i1 true) #6, !dbg !58
+  %231 = extractvalue { i32, i32, i32, i32 } %230, 0, !dbg !58
+  %232 = extractvalue { i32, i32, i32, i32 } %230, 1, !dbg !58
+  %233 = extractvalue { i32, i32, i32, i32 } %230, 2, !dbg !58
+  %234 = extractvalue { i32, i32, i32, i32 } %230, 3, !dbg !58
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !58
+  %235 = insertelement <2 x i32> poison, i32 %231, i64 0, !dbg !58
+  %236 = insertelement <2 x i32> %235, i32 %233, i64 1, !dbg !58
+  store <2 x i32> %236, ptr addrspace(3) %120, align 8, !dbg !58
+  %237 = insertelement <2 x i32> poison, i32 %232, i64 0, !dbg !58
+  %238 = insertelement <2 x i32> %237, i32 %234, i64 1, !dbg !58
+  store <2 x i32> %238, ptr addrspace(3) %121, align 8, !dbg !58
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !58
+  %239 = add i32 %24, %183, !dbg !59
+  %240 = sext i32 %239 to i64, !dbg !60
+  %241 = getelementptr bfloat, ptr addrspace(1) %2, i64 %240, !dbg !60
+  %242 = getelementptr bfloat, ptr addrspace(1) %6, i64 %179, !dbg !61
+  %243 = or disjoint i64 %indvars.iv51, 1, !dbg !62
+  %244 = or disjoint i64 %indvars.iv51, 3, !dbg !62
+  %245 = or disjoint i64 %indvars.iv51, 5, !dbg !62
+  %246 = or disjoint i64 %indvars.iv51, 7, !dbg !62
+  %gep = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %243, !dbg !63
+  %gep61 = getelementptr bfloat, ptr addrspace(1) %invariant.gep60, i64 %244, !dbg !63
+  %gep63 = getelementptr bfloat, ptr addrspace(1) %invariant.gep62, i64 %245, !dbg !63
+  %gep65 = getelementptr bfloat, ptr addrspace(1) %invariant.gep64, i64 %246, !dbg !63
+  %247 = getelementptr bfloat, ptr addrspace(1) %3, i64 %243, !dbg !64
+  %248 = getelementptr bfloat, ptr addrspace(1) %3, i64 %244, !dbg !64
+  %249 = getelementptr bfloat, ptr addrspace(1) %3, i64 %245, !dbg !64
+  %250 = getelementptr bfloat, ptr addrspace(1) %3, i64 %246, !dbg !64
+  %gep67 = getelementptr bfloat, ptr addrspace(1) %invariant.gep66, i64 %indvars.iv51, !dbg !65
+  %gep69 = getelementptr bfloat, ptr addrspace(1) %invariant.gep68, i64 %180, !dbg !65
+  %gep71 = getelementptr bfloat, ptr addrspace(1) %invariant.gep70, i64 %181, !dbg !65
+  %gep73 = getelementptr bfloat, ptr addrspace(1) %invariant.gep72, i64 %182, !dbg !65
+  %251 = getelementptr bfloat, ptr addrspace(1) %3, i64 %indvars.iv51, !dbg !66
+  %252 = getelementptr bfloat, ptr addrspace(1) %3, i64 %180, !dbg !66
+  %253 = getelementptr bfloat, ptr addrspace(1) %3, i64 %181, !dbg !66
+  %254 = getelementptr bfloat, ptr addrspace(1) %3, i64 %182, !dbg !66
+  %255 = fmul float %.0.i29, %197, !dbg !67
+  %256 = fmul float %.0.i29, %198, !dbg !67
+  %257 = fmul float %.0.i29, %199, !dbg !67
+  %258 = fmul float %.0.i29, %200, !dbg !67
+  %259 = fmul float %255, %212, !dbg !68
+  %260 = fmul float %256, %213, !dbg !68
+  %261 = fmul float %257, %214, !dbg !68
+  %262 = fmul float %258, %215, !dbg !68
+  %263 = fmul float %259, %224, !dbg !69
+  %264 = fmul float %260, %225, !dbg !69
+  %265 = fmul float %261, %226, !dbg !69
+  %266 = fmul float %262, %227, !dbg !69
+  %267 = insertelement <2 x float> poison, float %263, i64 0, !dbg !69
+  %268 = insertelement <2 x float> %267, float %265, i64 1, !dbg !69
+  %269 = insertelement <2 x float> poison, float %264, i64 0, !dbg !69
+  %270 = insertelement <2 x float> %269, float %266, i64 1, !dbg !69
+  %gep75 = getelementptr bfloat, ptr addrspace(1) %invariant.gep74, i64 %indvars.iv51, !dbg !70
+  %gep77 = getelementptr bfloat, ptr addrspace(1) %invariant.gep76, i64 %indvars.iv51, !dbg !70
+  %gep79 = getelementptr bfloat, ptr addrspace(1) %invariant.gep78, i64 %indvars.iv51, !dbg !70
+  %gep81 = getelementptr bfloat, ptr addrspace(1) %invariant.gep80, i64 %indvars.iv51, !dbg !70
+  %271 = getelementptr bfloat, ptr addrspace(1) %6, i64 %243, !dbg !71
+  %272 = getelementptr bfloat, ptr addrspace(1) %6, i64 %244, !dbg !71
+  %273 = getelementptr bfloat, ptr addrspace(1) %6, i64 %245, !dbg !71
+  %274 = getelementptr bfloat, ptr addrspace(1) %6, i64 %246, !dbg !71
+  %gep83 = getelementptr bfloat, ptr addrspace(1) %invariant.gep82, i64 %indvars.iv51, !dbg !72
+  %gep85 = getelementptr bfloat, ptr addrspace(1) %invariant.gep84, i64 %indvars.iv51, !dbg !72
+  %gep87 = getelementptr bfloat, ptr addrspace(1) %invariant.gep86, i64 %indvars.iv51, !dbg !72
+  %gep89 = getelementptr bfloat, ptr addrspace(1) %invariant.gep88, i64 %indvars.iv51, !dbg !72
+  %275 = getelementptr bfloat, ptr addrspace(1) %6, i64 %indvars.iv51, !dbg !73
+  %276 = getelementptr bfloat, ptr addrspace(1) %6, i64 %180, !dbg !73
+  %277 = getelementptr bfloat, ptr addrspace(1) %6, i64 %181, !dbg !73
+  %278 = getelementptr bfloat, ptr addrspace(1) %6, i64 %182, !dbg !73
+  %279 = add nuw nsw i64 %179, %171, !dbg !74
+  %280 = getelementptr bfloat, ptr addrspace(1) %0, i64 %279, !dbg !75
+  %281 = load <2 x float>, ptr addrspace(3) %130, align 8, !dbg !58
+  %282 = load <2 x float>, ptr addrspace(3) %132, align 8, !dbg !58
+  %283 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !76
+  %284 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %241, i64 %283, i1 true) #6, !dbg !76
+  %285 = extractvalue { i32, i32 } %284, 0, !dbg !76
+  %286 = bitcast i32 %285 to <2 x bfloat>, !dbg !76
+  %287 = extractvalue { i32, i32 } %284, 1, !dbg !76
+  %288 = bitcast i32 %287 to <2 x bfloat>, !dbg !76
+  %289 = extractelement <2 x bfloat> %286, i64 0, !dbg !76
+  %290 = extractelement <2 x bfloat> %286, i64 1, !dbg !76
+  %291 = extractelement <2 x bfloat> %288, i64 0, !dbg !76
+  %292 = extractelement <2 x bfloat> %288, i64 1, !dbg !76
+  %293 = fpext bfloat %289 to float, !dbg !77
+  %294 = fpext bfloat %290 to float, !dbg !77
+  %295 = fpext bfloat %291 to float, !dbg !77
+  %296 = fpext bfloat %292 to float, !dbg !77
+  %297 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !78
+  %298 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %242, i64 %297, i1 true) #6, !dbg !78
+  %299 = extractvalue { i32, i32 } %298, 0, !dbg !78
+  %300 = bitcast i32 %299 to <2 x bfloat>, !dbg !78
+  %301 = extractvalue { i32, i32 } %298, 1, !dbg !78
+  %302 = bitcast i32 %301 to <2 x bfloat>, !dbg !78
+  %303 = extractelement <2 x bfloat> %300, i64 0, !dbg !78
+  %304 = extractelement <2 x bfloat> %300, i64 1, !dbg !78
+  %305 = extractelement <2 x bfloat> %302, i64 0, !dbg !78
+  %306 = extractelement <2 x bfloat> %302, i64 1, !dbg !78
+  %307 = fpext bfloat %303 to float, !dbg !79
+  %308 = fpext bfloat %304 to float, !dbg !79
+  %309 = fpext bfloat %305 to float, !dbg !79
+  %310 = fpext bfloat %306 to float, !dbg !79
+  %311 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !80
+  %312 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep, i64 %311, i1 %133) #6, !dbg !80
+  %313 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !80
+  %314 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep61, i64 %313, i1 %133) #6, !dbg !80
+  %315 = insertelement <2 x i16> poison, i16 %312, i64 0, !dbg !80
+  %316 = insertelement <2 x i16> %315, i16 %314, i64 1, !dbg !80
+  %317 = bitcast <2 x i16> %316 to <2 x bfloat>, !dbg !80
+  %318 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !80
+  %319 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep63, i64 %318, i1 %133) #6, !dbg !80
+  %320 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !80
+  %321 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep65, i64 %320, i1 %133) #6, !dbg !80
+  %322 = insertelement <2 x i16> poison, i16 %319, i64 0, !dbg !80
+  %323 = insertelement <2 x i16> %322, i16 %321, i64 1, !dbg !80
+  %324 = bitcast <2 x i16> %323 to <2 x bfloat>, !dbg !80
+  %325 = fpext <2 x bfloat> %317 to <2 x float>, !dbg !81
+  %326 = fpext <2 x bfloat> %324 to <2 x float>, !dbg !81
+  %327 = fmul <2 x float> %175, %325, !dbg !37
+  %328 = fmul <2 x float> %175, %326, !dbg !37
+  %329 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !82
+  %330 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %247, i64 %329, i1 %133) #6, !dbg !82
+  %331 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !82
+  %332 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %248, i64 %331, i1 %133) #6, !dbg !82
+  %333 = insertelement <2 x i16> poison, i16 %330, i64 0, !dbg !82
+  %334 = insertelement <2 x i16> %333, i16 %332, i64 1, !dbg !82
+  %335 = bitcast <2 x i16> %334 to <2 x bfloat>, !dbg !82
+  %336 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !82
+  %337 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %249, i64 %336, i1 %133) #6, !dbg !82
+  %338 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !82
+  %339 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %250, i64 %338, i1 %133) #6, !dbg !82
+  %340 = insertelement <2 x i16> poison, i16 %337, i64 0, !dbg !82
+  %341 = insertelement <2 x i16> %340, i16 %339, i64 1, !dbg !82
+  %342 = bitcast <2 x i16> %341 to <2 x bfloat>, !dbg !82
+  %343 = fpext <2 x bfloat> %335 to <2 x float>, !dbg !83
+  %344 = fpext <2 x bfloat> %342 to <2 x float>, !dbg !83
+  %345 = fmul <2 x float> %327, %343, !dbg !84
+  %346 = fmul <2 x float> %328, %344, !dbg !84
+  %347 = fsub <2 x float> zeroinitializer, %345, !dbg !85
+  %348 = fsub <2 x float> zeroinitializer, %346, !dbg !85
+  %349 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !86
+  %350 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep67, i64 %349, i1 %137) #6, !dbg !86
+  %351 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !86
+  %352 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep69, i64 %351, i1 %137) #6, !dbg !86
+  %353 = insertelement <2 x i16> poison, i16 %350, i64 0, !dbg !86
+  %354 = insertelement <2 x i16> %353, i16 %352, i64 1, !dbg !86
+  %355 = bitcast <2 x i16> %354 to <2 x bfloat>, !dbg !86
+  %356 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !86
+  %357 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep71, i64 %356, i1 %137) #6, !dbg !86
+  %358 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !86
+  %359 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep73, i64 %358, i1 %137) #6, !dbg !86
+  %360 = insertelement <2 x i16> poison, i16 %357, i64 0, !dbg !86
+  %361 = insertelement <2 x i16> %360, i16 %359, i64 1, !dbg !86
+  %362 = bitcast <2 x i16> %361 to <2 x bfloat>, !dbg !86
+  %363 = fpext <2 x bfloat> %355 to <2 x float>, !dbg !87
+  %364 = fpext <2 x bfloat> %362 to <2 x float>, !dbg !87
+  %365 = fmul <2 x float> %175, %363, !dbg !88
+  %366 = fmul <2 x float> %175, %364, !dbg !88
+  %367 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !89
+  %368 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %251, i64 %367, i1 %137) #6, !dbg !89
+  %369 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !89
+  %370 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %252, i64 %369, i1 %137) #6, !dbg !89
+  %371 = insertelement <2 x i16> poison, i16 %368, i64 0, !dbg !89
+  %372 = insertelement <2 x i16> %371, i16 %370, i64 1, !dbg !89
+  %373 = bitcast <2 x i16> %372 to <2 x bfloat>, !dbg !89
+  %374 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !89
+  %375 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %253, i64 %374, i1 %137) #6, !dbg !89
+  %376 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !89
+  %377 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %254, i64 %376, i1 %137) #6, !dbg !89
+  %378 = insertelement <2 x i16> poison, i16 %375, i64 0, !dbg !89
+  %379 = insertelement <2 x i16> %378, i16 %377, i64 1, !dbg !89
+  %380 = bitcast <2 x i16> %379 to <2 x bfloat>, !dbg !89
+  %381 = fpext <2 x bfloat> %373 to <2 x float>, !dbg !90
+  %382 = fpext <2 x bfloat> %380 to <2 x float>, !dbg !90
+  %383 = fmul <2 x float> %365, %381, !dbg !91
+  %384 = fmul <2 x float> %366, %382, !dbg !91
+  %385 = select <2 x i1> %173, <2 x float> %347, <2 x float> %383, !dbg !44
+  %386 = select <2 x i1> %173, <2 x float> %348, <2 x float> %384, !dbg !44
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !69
+  store <2 x float> %268, ptr addrspace(3) %120, align 8, !dbg !69
+  store <2 x float> %270, ptr addrspace(3) %121, align 8, !dbg !69
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !69
+  %387 = load <2 x float>, ptr addrspace(3) %130, align 8, !dbg !69
+  %388 = load <2 x float>, ptr addrspace(3) %132, align 8, !dbg !69
+  %389 = fmul <2 x float> %281, %385, !dbg !92
+  %390 = fmul <2 x float> %282, %386, !dbg !92
+  %391 = fadd <2 x float> %389, %387, !dbg !93
+  %392 = fadd <2 x float> %390, %388, !dbg !93
+  %393 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !94
+  %394 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep75, i64 %393, i1 %133) #6, !dbg !94
+  %395 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !94
+  %396 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep77, i64 %395, i1 %133) #6, !dbg !94
+  %397 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !94
+  %398 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep79, i64 %397, i1 %133) #6, !dbg !94
+  %399 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !94
+  %400 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep81, i64 %399, i1 %133) #6, !dbg !94
+  %401 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !95
+  %402 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %271, i64 %401, i1 %133) #6, !dbg !95
+  %403 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !95
+  %404 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %272, i64 %403, i1 %133) #6, !dbg !95
+  %405 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !95
+  %406 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %273, i64 %405, i1 %133) #6, !dbg !95
+  %407 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !95
+  %408 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %274, i64 %407, i1 %133) #6, !dbg !95
+  %409 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96
+  %410 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep83, i64 %409, i1 %137) #6, !dbg !96
+  %411 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96
+  %412 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep85, i64 %411, i1 %137) #6, !dbg !96
+  %413 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96
+  %414 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep87, i64 %413, i1 %137) #6, !dbg !96
+  %415 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96
+  %416 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep89, i64 %415, i1 %137) #6, !dbg !96
+  %417 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !97
+  %418 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %275, i64 %417, i1 %137) #6, !dbg !97
+  %419 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !97
+  %420 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %276, i64 %419, i1 %137) #6, !dbg !97
+  %421 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !97
+  %422 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %277, i64 %421, i1 %137) #6, !dbg !97
+  %423 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !97
+  %424 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %278, i64 %423, i1 %137) #6, !dbg !97
+  %425 = fmul float %.0.i41, %293, !dbg !98
+  %426 = fmul float %.0.i41, %294, !dbg !98
+  %427 = fmul float %.0.i41, %295, !dbg !98
+  %428 = fmul float %.0.i41, %296, !dbg !98
+  %429 = fmul float %425, %307, !dbg !99
+  %430 = fmul float %426, %308, !dbg !99
+  %431 = fmul float %427, %309, !dbg !99
+  %432 = fmul float %428, %310, !dbg !99
+  %433 = fmul float %429, %224, !dbg !100
+  %434 = fmul float %430, %225, !dbg !100
+  %435 = fmul float %431, %226, !dbg !100
+  %436 = fmul float %432, %227, !dbg !100
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !100
+  %437 = insertelement <2 x float> poison, float %433, i64 0, !dbg !100
+  %438 = insertelement <2 x float> %437, float %435, i64 1, !dbg !100
+  store <2 x float> %438, ptr addrspace(3) %120, align 8, !dbg !100
+  %439 = insertelement <2 x float> poison, float %434, i64 0, !dbg !100
+  %440 = insertelement <2 x float> %439, float %436, i64 1, !dbg !100
+  store <2 x float> %440, ptr addrspace(3) %121, align 8, !dbg !100
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !100
+  %441 = fptrunc <2 x float> %391 to <2 x bfloat>, !dbg !101
+  %442 = fptrunc <2 x float> %392 to <2 x bfloat>, !dbg !101
+  %443 = getelementptr bfloat, ptr addrspace(1) %1, i64 %279, !dbg !102
+  %444 = insertelement <2 x i16> poison, i16 %394, i64 0, !dbg !94
+  %445 = insertelement <2 x i16> %444, i16 %396, i64 1, !dbg !94
+  %446 = bitcast <2 x i16> %445 to <2 x bfloat>, !dbg !94
+  %447 = fpext <2 x bfloat> %446 to <2 x float>, !dbg !103
+  %448 = fmul <2 x float> %177, %447, !dbg !41
+  %449 = insertelement <2 x i16> poison, i16 %402, i64 0, !dbg !95
+  %450 = insertelement <2 x i16> %449, i16 %404, i64 1, !dbg !95
+  %451 = bitcast <2 x i16> %450 to <2 x bfloat>, !dbg !95
+  %452 = fpext <2 x bfloat> %451 to <2 x float>, !dbg !104
+  %453 = fmul <2 x float> %448, %452, !dbg !105
+  %454 = fsub <2 x float> zeroinitializer, %453, !dbg !106
+  %455 = insertelement <2 x i16> poison, i16 %410, i64 0, !dbg !96
+  %456 = insertelement <2 x i16> %455, i16 %412, i64 1, !dbg !96
+  %457 = bitcast <2 x i16> %456 to <2 x bfloat>, !dbg !96
+  %458 = fpext <2 x bfloat> %457 to <2 x float>, !dbg !107
+  %459 = fmul <2 x float> %177, %458, !dbg !108
+  %460 = insertelement <2 x i16> poison, i16 %418, i64 0, !dbg !97
+  %461 = insertelement <2 x i16> %460, i16 %420, i64 1, !dbg !97
+  %462 = bitcast <2 x i16> %461 to <2 x bfloat>, !dbg !97
+  %463 = fpext <2 x bfloat> %462 to <2 x float>, !dbg !109
+  %464 = fmul <2 x float> %459, %463, !dbg !110
+  %465 = select <2 x i1> %173, <2 x float> %454, <2 x float> %464, !dbg !44
+  %466 = load <2 x float>, ptr addrspace(3) %130, align 8, !dbg !100
+  %467 = fmul <2 x float> %281, %465, !dbg !111
+  %468 = fadd <2 x float> %467, %466, !dbg !112
+  %469 = fptrunc <2 x float> %468 to <2 x bfloat>, !dbg !113
+  %470 = insertelement <2 x i16> poison, i16 %398, i64 0, !dbg !94
+  %471 = insertelement <2 x i16> %470, i16 %400, i64 1, !dbg !94
+  %472 = bitcast <2 x i16> %471 to <2 x bfloat>, !dbg !94
+  %473 = fpext <2 x bfloat> %472 to <2 x float>, !dbg !103
+  %474 = fmul <2 x float> %177, %473, !dbg !41
+  %475 = insertelement <2 x i16> poison, i16 %406, i64 0, !dbg !95
+  %476 = insertelement <2 x i16> %475, i16 %408, i64 1, !dbg !95
+  %477 = bitcast <2 x i16> %476 to <2 x bfloat>, !dbg !95
+  %478 = fpext <2 x bfloat> %477 to <2 x float>, !dbg !104
+  %479 = fmul <2 x float> %474, %478, !dbg !105
+  %480 = fsub <2 x float> zeroinitializer, %479, !dbg !106
+  %481 = insertelement <2 x i16> poison, i16 %414, i64 0, !dbg !96
+  %482 = insertelement <2 x i16> %481, i16 %416, i64 1, !dbg !96
+  %483 = bitcast <2 x i16> %482 to <2 x bfloat>, !dbg !96
+  %484 = fpext <2 x bfloat> %483 to <2 x float>, !dbg !107
+  %485 = fmul <2 x float> %177, %484, !dbg !108
+  %486 = insertelement <2 x i16> poison, i16 %422, i64 0, !dbg !97
+  %487 = insertelement <2 x i16> %486, i16 %424, i64 1, !dbg !97
+  %488 = bitcast <2 x i16> %487 to <2 x bfloat>, !dbg !97
+  %489 = fpext <2 x bfloat> %488 to <2 x float>, !dbg !109
+  %490 = fmul <2 x float> %485, %489, !dbg !110
+  %491 = select <2 x i1> %173, <2 x float> %480, <2 x float> %490, !dbg !44
+  %492 = load <2 x float>, ptr addrspace(3) %132, align 8, !dbg !100
+  %493 = fmul <2 x float> %282, %491, !dbg !111
+  %494 = fadd <2 x float> %493, %492, !dbg !112
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !101
+  store <2 x bfloat> %441, ptr addrspace(3) %149, align 4, !dbg !101
+  store <2 x bfloat> %442, ptr addrspace(3) %151, align 4, !dbg !101
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !101
+  %495 = load <2 x i16>, ptr addrspace(3) %159, align 4, !dbg !101
+  %496 = load <2 x i16>, ptr addrspace(3) %160, align 4, !dbg !101
+  %.uncasted = shufflevector <2 x i16> %495, <2 x i16> %496, <2 x i32> <i32 0, i32 2>, !dbg !101
+  %497 = bitcast <2 x i16> %.uncasted to i32, !dbg !101
+  %498 = shufflevector <2 x i16> %495, <2 x i16> %496, <2 x i32> <i32 1, i32 3>, !dbg !101
+  %499 = bitcast <2 x i16> %498 to i32, !dbg !101
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %497, i32 %499, ptr addrspace(1) %280, i1 true) #6, !dbg !101
+  %500 = fptrunc <2 x float> %494 to <2 x bfloat>, !dbg !113
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !113
+  store <2 x bfloat> %469, ptr addrspace(3) %149, align 4, !dbg !113
+  store <2 x bfloat> %500, ptr addrspace(3) %151, align 4, !dbg !113
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !113
+  %501 = load <2 x i16>, ptr addrspace(3) %159, align 4, !dbg !113
+  %502 = load <2 x i16>, ptr addrspace(3) %160, align 4, !dbg !113
+  %.uncasted21 = shufflevector <2 x i16> %501, <2 x i16> %502, <2 x i32> <i32 0, i32 2>, !dbg !113
+  %503 = bitcast <2 x i16> %.uncasted21 to i32, !dbg !113
+  %504 = shufflevector <2 x i16> %501, <2 x i16> %502, <2 x i32> <i32 1, i32 3>, !dbg !113
+  %505 = bitcast <2 x i16> %504 to i32, !dbg !113
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %503, i32 %505, ptr addrspace(1) %443, i1 true) #6, !dbg !113
+  %indvars.iv.next52 = add nuw nsw i64 %indvars.iv51, 8, !dbg !43
+  %506 = icmp samesign ult i64 %indvars.iv51, 120, !dbg !43
+  br i1 %506, label %178, label %507, !dbg !43
+
+507:                                              ; preds = %178
+  ret void, !dbg !114
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #3
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #3
+
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #4 = { convergent nocallback nounwind }
+attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!5 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", linkageName: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 23, column: 28, scope: !5)
+!9 = !DILocation(line: 23, column: 33, scope: !5)
+!10 = !DILocation(line: 24, column: 44, scope: !5)
+!11 = !DILocation(line: 24, column: 23, scope: !5)
+!12 = !DILocation(line: 26, column: 37, scope: !5)
+!13 = !DILocation(line: 29, column: 19, scope: !5)
+!14 = !DILocation(line: 33, column: 43, scope: !5)
+!15 = !DILocation(line: 39, column: 57, scope: !5)
+!16 = !DILocation(line: 39, column: 34, scope: !5)
+!17 = !DILocation(line: 39, column: 68, scope: !5)
+!18 = !DILocation(line: 40, column: 50, scope: !5)
+!19 = !DILocation(line: 40, column: 34, scope: !5)
+!20 = !DILocation(line: 40, column: 61, scope: !5)
+!21 = !DILocation(line: 40, column: 114, scope: !5)
+!22 = !DILocation(line: 47, column: 22, scope: !5)
+!23 = !DILocation(line: 49, column: 25, scope: !5)
+!24 = !DILocation(line: 263, column: 15, scope: !25, inlinedAt: !27)
+!25 = distinct !DILexicalBlockFile(scope: !5, file: !26, discriminator: 0)
+!26 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!27 = !DILocation(line: 293, column: 36, scope: !25, inlinedAt: !28)
+!28 = !DILocation(line: 51, column: 25, scope: !29)
+!29 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0)
+!30 = !DILocation(line: 263, column: 15, scope: !25, inlinedAt: !31)
+!31 = !DILocation(line: 293, column: 36, scope: !25, inlinedAt: !32)
+!32 = !DILocation(line: 52, column: 27, scope: !29)
+!33 = !DILocation(line: 63, column: 46, scope: !5)
+!34 = !DILocation(line: 75, column: 25, scope: !5)
+!35 = !DILocation(line: 77, column: 24, scope: !5)
+!36 = !DILocation(line: 78, column: 32, scope: !5)
+!37 = !DILocation(line: 79, column: 24, scope: !5)
+!38 = !DILocation(line: 123, column: 24, scope: !5)
+!39 = !DILocation(line: 124, column: 24, scope: !5)
+!40 = !DILocation(line: 125, column: 32, scope: !5)
+!41 = !DILocation(line: 126, column: 24, scope: !5)
+!42 = !DILocation(line: 161, column: 43, scope: !5)
+!43 = !DILocation(line: 53, column: 43, scope: !5)
+!44 = !DILocation(line: 0, scope: !5)
+!45 = !DILocation(line: 54, column: 31, scope: !5)
+!46 = !DILocation(line: 59, column: 27, scope: !5)
+!47 = !DILocation(line: 61, column: 51, scope: !5)
+!48 = !DILocation(line: 61, column: 35, scope: !5)
+!49 = !DILocation(line: 61, column: 62, scope: !5)
+!50 = !DILocation(line: 61, column: 115, scope: !5)
+!51 = !DILocation(line: 62, column: 35, scope: !5)
+!52 = !DILocation(line: 62, column: 42, scope: !5)
+!53 = !DILocation(line: 62, column: 95, scope: !5)
+!54 = !DILocation(line: 63, column: 42, scope: !5)
+!55 = !DILocation(line: 63, column: 35, scope: !5)
+!56 = !DILocation(line: 63, column: 51, scope: !5)
+!57 = !DILocation(line: 64, column: 35, scope: !5)
+!58 = !DILocation(line: 64, column: 51, scope: !5)
+!59 = !DILocation(line: 65, column: 58, scope: !5)
+!60 = !DILocation(line: 65, column: 35, scope: !5)
+!61 = !DILocation(line: 66, column: 36, scope: !5)
+!62 = !DILocation(line: 72, column: 39, scope: !5)
+!63 = !DILocation(line: 72, column: 35, scope: !5)
+!64 = !DILocation(line: 80, column: 35, scope: !5)
+!65 = !DILocation(line: 90, column: 35, scope: !5)
+!66 = !DILocation(line: 98, column: 35, scope: !5)
+!67 = !DILocation(line: 111, column: 24, scope: !5)
+!68 = !DILocation(line: 113, column: 24, scope: !5)
+!69 = !DILocation(line: 116, column: 24, scope: !5)
+!70 = !DILocation(line: 121, column: 35, scope: !5)
+!71 = !DILocation(line: 127, column: 35, scope: !5)
+!72 = !DILocation(line: 134, column: 35, scope: !5)
+!73 = !DILocation(line: 140, column: 35, scope: !5)
+!74 = !DILocation(line: 161, column: 39, scope: !5)
+!75 = !DILocation(line: 161, column: 32, scope: !5)
+!76 = !DILocation(line: 65, column: 69, scope: !5)
+!77 = !DILocation(line: 65, column: 123, scope: !5)
+!78 = !DILocation(line: 66, column: 43, scope: !5)
+!79 = !DILocation(line: 66, column: 96, scope: !5)
+!80 = !DILocation(line: 72, column: 68, scope: !5)
+!81 = !DILocation(line: 72, column: 129, scope: !5)
+!82 = !DILocation(line: 80, column: 85, scope: !5)
+!83 = !DILocation(line: 80, column: 146, scope: !5)
+!84 = !DILocation(line: 82, column: 24, scope: !5)
+!85 = !DILocation(line: 84, column: 17, scope: !5)
+!86 = !DILocation(line: 90, column: 64, scope: !5)
+!87 = !DILocation(line: 90, column: 125, scope: !5)
+!88 = !DILocation(line: 97, column: 24, scope: !5)
+!89 = !DILocation(line: 98, column: 81, scope: !5)
+!90 = !DILocation(line: 98, column: 142, scope: !5)
+!91 = !DILocation(line: 100, column: 24, scope: !5)
+!92 = !DILocation(line: 118, column: 24, scope: !5)
+!93 = !DILocation(line: 119, column: 24, scope: !5)
+!94 = !DILocation(line: 121, column: 71, scope: !5)
+!95 = !DILocation(line: 127, column: 85, scope: !5)
+!96 = !DILocation(line: 134, column: 71, scope: !5)
+!97 = !DILocation(line: 140, column: 81, scope: !5)
+!98 = !DILocation(line: 151, column: 25, scope: !5)
+!99 = !DILocation(line: 153, column: 26, scope: !5)
+!100 = !DILocation(line: 156, column: 26, scope: !5)
+!101 = !DILocation(line: 161, column: 55, scope: !5)
+!102 = !DILocation(line: 162, column: 32, scope: !5)
+!103 = !DILocation(line: 121, column: 132, scope: !5)
+!104 = !DILocation(line: 127, column: 146, scope: !5)
+!105 = !DILocation(line: 129, column: 24, scope: !5)
+!106 = !DILocation(line: 131, column: 17, scope: !5)
+!107 = !DILocation(line: 134, column: 132, scope: !5)
+!108 = !DILocation(line: 139, column: 24, scope: !5)
+!109 = !DILocation(line: 140, column: 142, scope: !5)
+!110 = !DILocation(line: 142, column: 24, scope: !5)
+!111 = !DILocation(line: 158, column: 26, scope: !5)
+!112 = !DILocation(line: 159, column: 26, scope: !5)
+!113 = !DILocation(line: 162, column: 56, scope: !5)
+!114 = !DILocation(line: 53, column: 4, scope: !5)
diff --git a/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..2219885f7430914c0f039f99c636750b678a787d
--- /dev/null
+++ b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx
@@ -0,0 +1,1369 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 // -- Begin function triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0
+.extern .shared .align 16 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
+                                        // @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0
+.visible .entry triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6,
+	.param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_7,
+	.param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_8,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_9,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_10
+)
+.reqntid 128
+{
+	.reg .pred 	%p<7>;
+	.reg .b16 	%rs<98>;
+	.reg .b32 	%r<297>;
+	.reg .b64 	%rd<107>;
+	.loc	1 18 0                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd14, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6];
+	ld.param.b64 	%rd13, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5];
+	ld.param.b64 	%rd12, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4];
+	ld.param.b64 	%rd11, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3];
+	ld.param.b64 	%rd10, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2];
+	ld.param.b64 	%rd9, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1];
+	ld.param.b64 	%rd8, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0];
+$L__tmp0:
+	.loc	1 23 28                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:28
+	mov.u32 	%r19, %ctaid.x;
+	.loc	1 23 33                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:33
+	shl.b32 	%r1, %r19, 6;
+	.loc	1 24 44                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44
+	mov.u32 	%r2, %tid.x;
+	and.b32 	%r3, %r2, 126;
+	bfe.u32 	%r20, %r2, 1, 6;
+	.loc	1 24 23                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23
+	or.b32 	%r21, %r20, %r1;
+	.loc	1 26 37                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37
+	shl.b32 	%r4, %r2, 2;
+	and.b32 	%r22, %r4, 4;
+	.loc	1 29 19                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19
+	bfe.s32 	%r23, %r19, 25, 1;
+	shr.u32 	%r24, %r23, 27;
+	add.s32 	%r25, %r21, %r24;
+	shr.s32 	%r5, %r25, 5;
+	shl.b32 	%r26, %r5, 15;
+	.loc	1 33 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:33:43
+	shl.b32 	%r6, %r19, 13;
+	add.s32 	%r27, %r26, %r6;
+	shl.b32 	%r7, %r20, 7;
+	or.b32 	%r28, %r27, %r7;
+	or.b32 	%r29, %r28, %r22;
+	cvt.u64.u32 	%rd1, %r29;
+	mov.b32 	%r289, 0f00000000;
+	mov.b64 	%rd102, -8;
+	mov.b32 	%r290, %r289;
+	mov.b32 	%r291, %r289;
+	mov.b32 	%r292, %r289;
+	mov.b32 	%r293, %r289;
+	mov.b32 	%r294, %r289;
+	mov.b32 	%r295, %r289;
+	mov.b32 	%r296, %r289;
+$L__BB0_1:                              // =>This Inner Loop Header: Depth=1
+	.loc	1 39 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34
+	add.s64 	%rd19, %rd1, %rd102;
+	cvt.u32.u64 	%r35, %rd19;
+	add.s32 	%r36, %r35, 4104;
+	mad.wide.s32 	%rd16, %r36, 2, %rd10;
+	.loc	1 39 68                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68
+	// begin inline asm
+	mov.u64 %rd15, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd15, 1.0;
+	// end inline asm
+	mov.b32 	%r32, 0;
+	mov.pred 	%p1, -1;
+	// begin inline asm
+	mov.u32 %r30, %r32;
+	mov.u32 %r31, %r32;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r30, %r31 }, [ %rd16 + 0 ], %rd15;
+	// end inline asm
+	add.s32 	%r37, %r35, 8;
+	.loc	1 40 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34
+	mad.wide.s32 	%rd18, %r37, 2, %rd10;
+	.loc	1 40 61                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61
+	// begin inline asm
+	mov.u64 %rd17, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r33, %r32;
+	mov.u32 %r34, %r32;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r33, %r34 }, [ %rd18 + 0 ], %rd17;
+	// end inline asm
+	.loc	1 40 114                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114
+	mov.b32 	{%rs1, %rs2}, %r33;
+	cvt.f32.bf16 	%r38, %rs1;
+	cvt.f32.bf16 	%r39, %rs2;
+	mov.b32 	{%rs3, %rs4}, %r34;
+	cvt.f32.bf16 	%r40, %rs3;
+	cvt.f32.bf16 	%r41, %rs4;
+	mov.b32 	{%rs5, %rs6}, %r30;
+	cvt.f32.bf16 	%r42, %rs5;
+	cvt.f32.bf16 	%r43, %rs6;
+	mov.b32 	{%rs7, %rs8}, %r31;
+	cvt.f32.bf16 	%r44, %rs7;
+	cvt.f32.bf16 	%r45, %rs8;
+	.loc	1 49 25                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:49:25
+	fma.rn.f32 	%r296, %r45, %r45, %r296;
+	fma.rn.f32 	%r295, %r44, %r44, %r295;
+	fma.rn.f32 	%r294, %r43, %r43, %r294;
+	fma.rn.f32 	%r293, %r42, %r42, %r293;
+	fma.rn.f32 	%r292, %r41, %r41, %r292;
+	fma.rn.f32 	%r291, %r40, %r40, %r291;
+	fma.rn.f32 	%r290, %r39, %r39, %r290;
+	fma.rn.f32 	%r289, %r38, %r38, %r289;
+	.loc	1 33 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:33:43
+	add.s64 	%rd102, %rd102, 8;
+	setp.lt.u64 	%p2, %rd102, 120;
+	@%p2 bra 	$L__BB0_1;
+// %bb.2:                               // %__nv_rsqrtf.exit
+	.loc	1 24 44                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44
+	and.b32 	%r46, %r2, 63;
+	.loc	1 24 23                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23
+	or.b32 	%r47, %r1, %r46;
+	.loc	1 26 37                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37
+	and.b32 	%r8, %r2, 64;
+	.loc	1 29 19                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19
+	shr.s32 	%r48, %r1, 31;
+	shr.u32 	%r49, %r48, 27;
+	add.s32 	%r50, %r47, %r49;
+$L__tmp1:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r51, %r293, %r294;
+	add.f32 	%r52, %r295, %r51;
+	add.f32 	%r53, %r296, %r52;
+$L__tmp2:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r54, %r53, 1, 31, -1;
+$L__tmp3:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r55, %r53, %r54;
+$L__tmp4:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r56, %r289, %r290;
+	add.f32 	%r57, %r291, %r56;
+	add.f32 	%r58, %r292, %r57;
+$L__tmp5:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r59, %r58, 1, 31, -1;
+$L__tmp6:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r60, %r58, %r59;
+$L__tmp7:
+	.loc	1 63 46                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:46
+	shl.b32 	%r61, %r5, 7;
+	mov.b32 	%r62, 0f43000000;
+	.loc	1 75 25                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:75:25
+	div.full.f32 	%r63, %r60, %r62;
+	.loc	1 77 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:77:24
+	add.f32 	%r64, %r63, 0f358637BD;
+	.loc	1 78 32                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:78:32
+	rsqrt.approx.ftz.f32 	%r9, %r64;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	shl.b32 	%r65, %r3, 1;
+	mov.b32 	%r66, global_smem;
+	add.s32 	%r67, %r66, %r65;
+	st.shared.b32 	[%r67], %r9;
+	bar.sync 	0;
+	shl.b32 	%r68, %r46, 2;
+	add.s32 	%r69, %r66, %r68;
+	ld.shared.b32 	%r17, [%r69];
+	.loc	1 123 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:123:24
+	div.full.f32 	%r70, %r55, %r62;
+	.loc	1 124 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:124:24
+	add.f32 	%r71, %r70, 0f358637BD;
+	.loc	1 125 32                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:125:32
+	rsqrt.approx.ftz.f32 	%r10, %r71;
+	.loc	1 126 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24
+	bar.sync 	0;
+	st.shared.b32 	[%r67], %r10;
+	bar.sync 	0;
+	ld.shared.b32 	%r18, [%r69];
+	shl.b32 	%r72, %r2, 3;
+	and.b32 	%r73, %r72, 120;
+	and.b32 	%r74, %r4, 384;
+	bfe.s32 	%r75, %r2, 4, 1;
+	and.b32 	%r76, %r75, 1032;
+	or.b32 	%r77, %r73, %r74;
+	xor.b32 	%r78, %r77, %r76;
+	add.s32 	%r11, %r66, %r78;
+	shl.b32 	%r79, %r2, 4;
+	and.b32 	%r80, %r79, 112;
+	and.b32 	%r81, %r72, 896;
+	bfe.s32 	%r82, %r2, 3, 1;
+	and.b32 	%r83, %r82, 1032;
+	or.b32 	%r84, %r80, %r81;
+	or.b32 	%r85, %r84, %r83;
+	add.s32 	%r12, %r66, %r85;
+	xor.b32 	%r86, %r85, 8;
+	add.s32 	%r13, %r66, %r86;
+	setp.eq.b32 	%p4, %r8, 0;
+	shl.b32 	%r87, %r50, 10;
+	and.b32 	%r88, %r87, -32768;
+	and.b32 	%r89, %r75, 516;
+	or.b32 	%r90, %r74, %r89;
+	or.b32 	%r91, %r90, %r73;
+	add.s32 	%r14, %r66, %r91;
+	xor.b32 	%r92, %r91, 4;
+	add.s32 	%r15, %r66, %r92;
+	and.b32 	%r93, %r4, 124;
+	bfe.s32 	%r94, %r2, 5, 1;
+	and.b32 	%r95, %r94, 516;
+	shl.b32 	%r96, %r8, 1;
+	xor.b32 	%r97, %r95, %r93;
+	add.s32 	%r98, %r66, %r96;
+	add.s32 	%r16, %r98, %r97;
+	.loc	1 53 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43
+	add.s32 	%r99, %r88, %r6;
+	shl.b32 	%r100, %r46, 7;
+	add.s32 	%r101, %r99, %r100;
+	mad.wide.s32 	%rd2, %r101, 2, %rd10;
+	add.s32 	%r102, %r101, 4096;
+	mad.wide.s32 	%rd3, %r102, 2, %rd10;
+	and.b32 	%r103, %r2, 1;
+	mul.wide.u32 	%rd20, %r103, 8;
+	add.s32 	%r104, %r6, %r7;
+	mad.wide.s32 	%rd21, %r104, 2, %rd20;
+	add.s64 	%rd4, %rd9, %rd21;
+	add.s64 	%rd5, %rd8, %rd21;
+	add.s64 	%rd6, %rd14, %rd20;
+	mul.wide.u32 	%rd22, %r103, 16;
+	mul.wide.s32 	%rd23, %r61, 4;
+	or.b64 	%rd24, %rd22, %rd23;
+	add.s64 	%rd104, %rd13, %rd24;
+	add.s64 	%rd103, %rd12, %rd24;
+	add.s64 	%rd7, %rd11, %rd20;
+	mov.b64 	%rd106, 0;
+	mov.b64 	%rd105, -8;
+$L__BB0_3:                              // =>This Inner Loop Header: Depth=1
+	.loc	1 0 43                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0:43
+	setp.ne.b32 	%p5, %r8, 0;
+	.loc	1 61 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:35
+	add.s64 	%rd101, %rd1, %rd105;
+	cvt.u32.u64 	%r126, %rd101;
+	add.s32 	%r127, %r126, 8;
+	mad.wide.s32 	%rd26, %r127, 2, %rd10;
+	.loc	1 61 62                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:62
+	// begin inline asm
+	mov.u64 %rd25, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd25, 1.0;
+	// end inline asm
+	mov.b32 	%r107, 0;
+	mov.pred 	%p3, -1;
+	// begin inline asm
+	mov.u32 %r105, %r107;
+	mov.u32 %r106, %r107;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r105, %r106 }, [ %rd26 + 0 ], %rd25;
+	// end inline asm
+	mov.b32 	{%rs42, %rs43}, %r105;
+	mov.b32 	{%rs44, %rs45}, %r106;
+	.loc	1 61 115                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:115
+	cvt.f32.bf16 	%r128, %rs42;
+	cvt.f32.bf16 	%r129, %rs43;
+	cvt.f32.bf16 	%r130, %rs44;
+	cvt.f32.bf16 	%r131, %rs45;
+	.loc	1 62 42                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:42
+	add.s64 	%rd28, %rd7, %rd106;
+	// begin inline asm
+	mov.u64 %rd27, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd27, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r108, %r107;
+	mov.u32 %r109, %r107;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r108, %r109 }, [ %rd28 + 0 ], %rd27;
+	// end inline asm
+	mov.b32 	{%rs46, %rs47}, %r108;
+	mov.b32 	{%rs48, %rs49}, %r109;
+	.loc	1 62 95                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:95
+	cvt.f32.bf16 	%r132, %rs46;
+	cvt.f32.bf16 	%r133, %rs47;
+	cvt.f32.bf16 	%r134, %rs48;
+	cvt.f32.bf16 	%r135, %rs49;
+	.loc	1 63 51                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:51
+	// begin inline asm
+	mov.u64 %rd29, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd29, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r110, %r107;
+	mov.u32 %r111, %r107;
+	mov.u32 %r112, %r107;
+	mov.u32 %r113, %r107;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r110, %r111, %r112, %r113 }, [ %rd103 + 0 ], %rd29;
+	// end inline asm
+	.loc	1 64 51                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:51
+	// begin inline asm
+	mov.u64 %rd30, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd30, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r114, %r107;
+	mov.u32 %r115, %r107;
+	mov.u32 %r116, %r107;
+	mov.u32 %r117, %r107;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r114, %r115, %r116, %r117 }, [ %rd104 + 0 ], %rd30;
+	// end inline asm
+	bar.sync 	0;
+	st.shared.v2.b32 	[%r11], {%r114, %r116};
+	st.shared.v2.b32 	[%r11+512], {%r115, %r117};
+	bar.sync 	0;
+	add.s32 	%r136, %r126, 4104;
+	.loc	1 65 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:35
+	mad.wide.s32 	%rd32, %r136, 2, %rd10;
+	.loc	1 72 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35
+	add.s64 	%rd34, %rd6, %rd106;
+	add.s64 	%rd52, %rd2, %rd106;
+	add.s64 	%rd36, %rd52, 2;
+	add.s64 	%rd38, %rd52, 6;
+	add.s64 	%rd40, %rd52, 10;
+	.loc	1 80 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:35
+	add.s64 	%rd42, %rd52, 14;
+	add.s64 	%rd60, %rd11, %rd106;
+	add.s64 	%rd44, %rd60, 2;
+	add.s64 	%rd46, %rd60, 6;
+	add.s64 	%rd48, %rd60, 10;
+	.loc	1 90 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:35
+	add.s64 	%rd50, %rd60, 14;
+	add.s64 	%rd54, %rd52, 4;
+	add.s64 	%rd56, %rd52, 8;
+	.loc	1 98 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:35
+	add.s64 	%rd58, %rd52, 12;
+	add.s64 	%rd62, %rd60, 4;
+	add.s64 	%rd64, %rd60, 8;
+	.loc	1 111 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:111:24
+	add.s64 	%rd66, %rd60, 12;
+	mul.f32 	%r137, %r9, %r128;
+	mul.f32 	%r138, %r9, %r129;
+	mul.f32 	%r139, %r9, %r130;
+	mul.f32 	%r140, %r9, %r131;
+	.loc	1 113 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:113:24
+	mul.f32 	%r141, %r137, %r132;
+	mul.f32 	%r142, %r138, %r133;
+	mul.f32 	%r143, %r139, %r134;
+	mul.f32 	%r144, %r140, %r135;
+	.loc	1 116 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:116:24
+	mul.f32 	%r145, %r141, %r110;
+	mul.f32 	%r146, %r142, %r111;
+	mul.f32 	%r147, %r143, %r112;
+	mul.f32 	%r148, %r144, %r113;
+	.loc	1 121 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:35
+	add.s64 	%rd84, %rd3, %rd106;
+	add.s64 	%rd68, %rd84, 2;
+	add.s64 	%rd70, %rd84, 6;
+	add.s64 	%rd72, %rd84, 10;
+	.loc	1 127 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:35
+	add.s64 	%rd74, %rd84, 14;
+	add.s64 	%rd92, %rd14, %rd106;
+	add.s64 	%rd76, %rd92, 2;
+	add.s64 	%rd78, %rd92, 6;
+	add.s64 	%rd80, %rd92, 10;
+	.loc	1 134 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:35
+	add.s64 	%rd82, %rd92, 14;
+	add.s64 	%rd86, %rd84, 4;
+	add.s64 	%rd88, %rd84, 8;
+	.loc	1 140 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:35
+	add.s64 	%rd90, %rd84, 12;
+	add.s64 	%rd94, %rd92, 4;
+	add.s64 	%rd96, %rd92, 8;
+	.loc	1 161 39                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:39
+	add.s64 	%rd98, %rd92, 12;
+	.loc	1 64 51                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:51
+	add.s64 	%rd99, %rd5, %rd106;
+	ld.shared.v2.b32 	{%r149, %r150}, [%r12];
+	ld.shared.v2.b32 	{%r151, %r152}, [%r13];
+	.loc	1 65 69                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69
+	// begin inline asm
+	mov.u64 %rd31, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd31, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r118, %r107;
+	mov.u32 %r119, %r107;
+	@%p3 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r118, %r119 }, [ %rd32 + 0 ], %rd31;
+	// end inline asm
+	mov.b32 	{%rs50, %rs51}, %r118;
+	mov.b32 	{%rs52, %rs53}, %r119;
+	.loc	1 65 123                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:123
+	cvt.f32.bf16 	%r153, %rs50;
+	cvt.f32.bf16 	%r154, %rs51;
+	cvt.f32.bf16 	%r155, %rs52;
+	cvt.f32.bf16 	%r156, %rs53;
+	.loc	1 66 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:43
+	// begin inline asm
+	mov.u64 %rd33, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd33, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r120, %r107;
+	mov.u32 %r121, %r107;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r120, %r121 }, [ %rd34 + 0 ], %rd33;
+	// end inline asm
+	mov.b32 	{%rs54, %rs55}, %r120;
+	mov.b32 	{%rs56, %rs57}, %r121;
+	.loc	1 66 96                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:96
+	cvt.f32.bf16 	%r157, %rs54;
+	cvt.f32.bf16 	%r158, %rs55;
+	cvt.f32.bf16 	%r159, %rs56;
+	cvt.f32.bf16 	%r160, %rs57;
+	.loc	1 72 68                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:68
+	// begin inline asm
+	mov.u64 %rd35, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd35, 1.0;
+	// end inline asm
+	mov.b16 	%rs10, 0;
+	// begin inline asm
+	mov.u16 %rs9, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd36 + 0 ], %rd35;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd37, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd37, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs11, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd38 + 0 ], %rd37;
+	// end inline asm
+	mov.b32 	%r161, {%rs9, %rs11};
+	// begin inline asm
+	mov.u64 %rd39, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd39, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs12, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd40 + 0 ], %rd39;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd41, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd41, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs13, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd42 + 0 ], %rd41;
+	// end inline asm
+	mov.b32 	%r162, {%rs12, %rs13};
+	.loc	1 72 129                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129
+	mov.b32 	{%rs58, %rs59}, %r161;
+	cvt.f32.bf16 	%r163, %rs58;
+	cvt.f32.bf16 	%r164, %rs59;
+	mov.b32 	{%rs60, %rs61}, %r162;
+	cvt.f32.bf16 	%r165, %rs60;
+	cvt.f32.bf16 	%r166, %rs61;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	mul.f32 	%r167, %r17, %r164;
+	mul.f32 	%r168, %r17, %r163;
+	mul.f32 	%r169, %r17, %r166;
+	mul.f32 	%r170, %r17, %r165;
+	.loc	1 80 85                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:85
+	// begin inline asm
+	mov.u64 %rd43, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd43, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs14, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs14 }, [ %rd44 + 0 ], %rd43;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd45, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd45, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs15, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd46 + 0 ], %rd45;
+	// end inline asm
+	mov.b32 	%r171, {%rs14, %rs15};
+	// begin inline asm
+	mov.u64 %rd47, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd47, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs16, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs16 }, [ %rd48 + 0 ], %rd47;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd49, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd49, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs17, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd50 + 0 ], %rd49;
+	// end inline asm
+	mov.b32 	%r172, {%rs16, %rs17};
+	.loc	1 80 146                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146
+	mov.b32 	{%rs62, %rs63}, %r171;
+	cvt.f32.bf16 	%r173, %rs63;
+	cvt.f32.bf16 	%r174, %rs62;
+	mov.b32 	{%rs64, %rs65}, %r172;
+	cvt.f32.bf16 	%r175, %rs65;
+	cvt.f32.bf16 	%r176, %rs64;
+	.loc	1 84 17                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17
+	neg.f32 	%r177, %r168;
+	fma.rn.f32 	%r178, %r177, %r174, 0f00000000;
+	neg.f32 	%r179, %r167;
+	fma.rn.f32 	%r180, %r179, %r173, 0f00000000;
+	neg.f32 	%r181, %r170;
+	fma.rn.f32 	%r182, %r181, %r176, 0f00000000;
+	neg.f32 	%r183, %r169;
+	fma.rn.f32 	%r184, %r183, %r175, 0f00000000;
+	.loc	1 90 64                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:64
+	// begin inline asm
+	mov.u64 %rd51, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd51, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs18, %rs10;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs18 }, [ %rd52 + 0 ], %rd51;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd53, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd53, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs19, %rs10;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd54 + 0 ], %rd53;
+	// end inline asm
+	mov.b32 	%r185, {%rs18, %rs19};
+	// begin inline asm
+	mov.u64 %rd55, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd55, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs20, %rs10;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs20 }, [ %rd56 + 0 ], %rd55;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd57, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd57, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs21, %rs10;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs21 }, [ %rd58 + 0 ], %rd57;
+	// end inline asm
+	mov.b32 	%r186, {%rs20, %rs21};
+	.loc	1 90 125                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125
+	mov.b32 	{%rs66, %rs67}, %r185;
+	cvt.f32.bf16 	%r187, %rs66;
+	cvt.f32.bf16 	%r188, %rs67;
+	mov.b32 	{%rs68, %rs69}, %r186;
+	cvt.f32.bf16 	%r189, %rs68;
+	cvt.f32.bf16 	%r190, %rs69;
+	.loc	1 97 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24
+	mul.f32 	%r191, %r17, %r188;
+	mul.f32 	%r192, %r17, %r187;
+	mul.f32 	%r193, %r17, %r190;
+	mul.f32 	%r194, %r17, %r189;
+	.loc	1 98 81                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:81
+	// begin inline asm
+	mov.u64 %rd59, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd59, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs22, %rs10;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs22 }, [ %rd60 + 0 ], %rd59;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd61, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd61, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs23, %rs10;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs23 }, [ %rd62 + 0 ], %rd61;
+	// end inline asm
+	mov.b32 	%r195, {%rs22, %rs23};
+	// begin inline asm
+	mov.u64 %rd63, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd63, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs24, %rs10;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs24 }, [ %rd64 + 0 ], %rd63;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd65, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd65, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs25, %rs10;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs25 }, [ %rd66 + 0 ], %rd65;
+	// end inline asm
+	mov.b32 	%r196, {%rs24, %rs25};
+	.loc	1 98 142                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142
+	mov.b32 	{%rs70, %rs71}, %r195;
+	cvt.f32.bf16 	%r197, %rs71;
+	cvt.f32.bf16 	%r198, %rs70;
+	mov.b32 	{%rs72, %rs73}, %r196;
+	cvt.f32.bf16 	%r199, %rs73;
+	cvt.f32.bf16 	%r200, %rs72;
+	.loc	1 100 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24
+	mul.f32 	%r201, %r192, %r198;
+	mul.f32 	%r202, %r191, %r197;
+	mul.f32 	%r203, %r194, %r200;
+	mul.f32 	%r204, %r193, %r199;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r205, %r180, %r202, %p4;
+	selp.f32 	%r206, %r178, %r201, %p4;
+	selp.f32 	%r207, %r184, %r204, %p4;
+	selp.f32 	%r208, %r182, %r203, %p4;
+	.loc	1 116 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:116:24
+	bar.sync 	0;
+	st.shared.v2.b32 	[%r11], {%r145, %r147};
+	st.shared.v2.b32 	[%r11+512], {%r146, %r148};
+	bar.sync 	0;
+	ld.shared.v2.b32 	{%r209, %r210}, [%r12];
+	ld.shared.v2.b32 	{%r211, %r212}, [%r13];
+	.loc	1 119 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24
+	fma.rn.f32 	%r213, %r149, %r206, %r209;
+	fma.rn.f32 	%r214, %r150, %r205, %r210;
+	fma.rn.f32 	%r215, %r151, %r208, %r211;
+	fma.rn.f32 	%r216, %r152, %r207, %r212;
+	.loc	1 121 71                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:71
+	// begin inline asm
+	mov.u64 %rd67, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd67, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs26, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs26 }, [ %rd68 + 0 ], %rd67;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd69, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd69, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs27, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs27 }, [ %rd70 + 0 ], %rd69;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd71, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd71, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs28, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs28 }, [ %rd72 + 0 ], %rd71;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd73, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd73, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs29, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs29 }, [ %rd74 + 0 ], %rd73;
+	// end inline asm
+	.loc	1 127 85                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:85
+	// begin inline asm
+	mov.u64 %rd75, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd75, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs30, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs30 }, [ %rd76 + 0 ], %rd75;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd77, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd77, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs31, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs31 }, [ %rd78 + 0 ], %rd77;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd79, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd79, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs32, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs32 }, [ %rd80 + 0 ], %rd79;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd81, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd81, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs33, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs33 }, [ %rd82 + 0 ], %rd81;
+	// end inline asm
+	.loc	1 134 71                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:71
+	// begin inline asm
+	mov.u64 %rd83, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd83, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs34, %rs10;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs34 }, [ %rd84 + 0 ], %rd83;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd85, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd85, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs35, %rs10;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs35 }, [ %rd86 + 0 ], %rd85;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd87, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd87, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs36, %rs10;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs36 }, [ %rd88 + 0 ], %rd87;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd89, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd89, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs37, %rs10;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs37 }, [ %rd90 + 0 ], %rd89;
+	// end inline asm
+	.loc	1 140 81                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:81
+	// begin inline asm
+	mov.u64 %rd91, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd91, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs38, %rs10;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs38 }, [ %rd92 + 0 ], %rd91;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd93, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd93, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs39, %rs10;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs39 }, [ %rd94 + 0 ], %rd93;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd95, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd95, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs40, %rs10;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs40 }, [ %rd96 + 0 ], %rd95;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd97, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd97, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs41, %rs10;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs41 }, [ %rd98 + 0 ], %rd97;
+	// end inline asm
+	.loc	1 151 25                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:151:25
+	mul.f32 	%r217, %r10, %r153;
+	mul.f32 	%r218, %r10, %r154;
+	mul.f32 	%r219, %r10, %r155;
+	mul.f32 	%r220, %r10, %r156;
+	.loc	1 153 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:153:26
+	mul.f32 	%r221, %r217, %r157;
+	mul.f32 	%r222, %r218, %r158;
+	mul.f32 	%r223, %r219, %r159;
+	mul.f32 	%r224, %r220, %r160;
+	.loc	1 156 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26
+	mul.f32 	%r225, %r221, %r110;
+	mul.f32 	%r226, %r222, %r111;
+	mul.f32 	%r227, %r223, %r112;
+	mul.f32 	%r228, %r224, %r113;
+	bar.sync 	0;
+	st.shared.v2.b32 	[%r11], {%r225, %r227};
+	st.shared.v2.b32 	[%r11+512], {%r226, %r228};
+	bar.sync 	0;
+	.loc	1 161 55                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:55
+	cvt.rn.bf16x2.f32 	%r229, %r214, %r213;
+	cvt.rn.bf16x2.f32 	%r230, %r216, %r215;
+	.loc	1 121 71                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:71
+	add.s64 	%rd100, %rd4, %rd106;
+	mov.b32 	%r231, {%rs26, %rs27};
+	.loc	1 121 132                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:132
+	mov.b32 	{%rs74, %rs75}, %r231;
+	cvt.f32.bf16 	%r232, %rs74;
+	cvt.f32.bf16 	%r233, %rs75;
+	.loc	1 126 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24
+	mul.f32 	%r234, %r18, %r233;
+	mul.f32 	%r235, %r18, %r232;
+	.loc	1 127 85                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:85
+	mov.b32 	%r236, {%rs30, %rs31};
+	.loc	1 127 146                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:146
+	mov.b32 	{%rs76, %rs77}, %r236;
+	cvt.f32.bf16 	%r237, %rs77;
+	cvt.f32.bf16 	%r238, %rs76;
+	.loc	1 131 17                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:131:17
+	neg.f32 	%r239, %r235;
+	fma.rn.f32 	%r240, %r239, %r238, 0f00000000;
+	neg.f32 	%r241, %r234;
+	fma.rn.f32 	%r242, %r241, %r237, 0f00000000;
+	.loc	1 134 71                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:71
+	mov.b32 	%r243, {%rs34, %rs35};
+	.loc	1 134 132                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:132
+	mov.b32 	{%rs78, %rs79}, %r243;
+	cvt.f32.bf16 	%r244, %rs78;
+	cvt.f32.bf16 	%r245, %rs79;
+	.loc	1 139 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:139:24
+	mul.f32 	%r246, %r18, %r245;
+	mul.f32 	%r247, %r18, %r244;
+	.loc	1 140 81                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:81
+	mov.b32 	%r248, {%rs38, %rs39};
+	.loc	1 140 142                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:142
+	mov.b32 	{%rs80, %rs81}, %r248;
+	cvt.f32.bf16 	%r249, %rs81;
+	cvt.f32.bf16 	%r250, %rs80;
+	.loc	1 142 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:142:24
+	mul.f32 	%r251, %r247, %r250;
+	mul.f32 	%r252, %r246, %r249;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r253, %r242, %r252, %p4;
+	selp.f32 	%r254, %r240, %r251, %p4;
+	.loc	1 156 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26
+	ld.shared.v2.b32 	{%r255, %r256}, [%r12];
+	.loc	1 159 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:159:26
+	fma.rn.f32 	%r257, %r149, %r254, %r255;
+	fma.rn.f32 	%r258, %r150, %r253, %r256;
+	.loc	1 162 56                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:56
+	cvt.rn.bf16x2.f32 	%r259, %r258, %r257;
+	.loc	1 121 71                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:71
+	mov.b32 	%r260, {%rs28, %rs29};
+	.loc	1 121 132                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:132
+	mov.b32 	{%rs82, %rs83}, %r260;
+	cvt.f32.bf16 	%r261, %rs82;
+	cvt.f32.bf16 	%r262, %rs83;
+	.loc	1 126 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24
+	mul.f32 	%r263, %r18, %r262;
+	mul.f32 	%r264, %r18, %r261;
+	.loc	1 127 85                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:85
+	mov.b32 	%r265, {%rs32, %rs33};
+	.loc	1 127 146                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:146
+	mov.b32 	{%rs84, %rs85}, %r265;
+	cvt.f32.bf16 	%r266, %rs85;
+	cvt.f32.bf16 	%r267, %rs84;
+	.loc	1 131 17                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:131:17
+	neg.f32 	%r268, %r264;
+	fma.rn.f32 	%r269, %r268, %r267, 0f00000000;
+	neg.f32 	%r270, %r263;
+	fma.rn.f32 	%r271, %r270, %r266, 0f00000000;
+	.loc	1 134 71                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:71
+	mov.b32 	%r272, {%rs36, %rs37};
+	.loc	1 134 132                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:132
+	mov.b32 	{%rs86, %rs87}, %r272;
+	cvt.f32.bf16 	%r273, %rs86;
+	cvt.f32.bf16 	%r274, %rs87;
+	.loc	1 139 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:139:24
+	mul.f32 	%r275, %r18, %r274;
+	mul.f32 	%r276, %r18, %r273;
+	.loc	1 140 81                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:81
+	mov.b32 	%r277, {%rs40, %rs41};
+	.loc	1 140 142                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:142
+	mov.b32 	{%rs88, %rs89}, %r277;
+	cvt.f32.bf16 	%r278, %rs89;
+	cvt.f32.bf16 	%r279, %rs88;
+	.loc	1 142 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:142:24
+	mul.f32 	%r280, %r276, %r279;
+	mul.f32 	%r281, %r275, %r278;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r282, %r271, %r281, %p4;
+	selp.f32 	%r283, %r269, %r280, %p4;
+	.loc	1 156 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26
+	ld.shared.v2.b32 	{%r284, %r285}, [%r13];
+	.loc	1 159 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:159:26
+	fma.rn.f32 	%r286, %r151, %r283, %r284;
+	fma.rn.f32 	%r287, %r152, %r282, %r285;
+	.loc	1 161 55                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:55
+	bar.sync 	0;
+	st.shared.b32 	[%r14], %r229;
+	st.shared.b32 	[%r15], %r230;
+	bar.sync 	0;
+	ld.shared.v2.b16 	{%rs90, %rs91}, [%r16];
+	ld.shared.v2.b16 	{%rs92, %rs93}, [%r16+256];
+	mov.b32 	%r122, {%rs90, %rs92};
+	mov.b32 	%r123, {%rs91, %rs93};
+	// begin inline asm
+	@%p3 st.global.v2.b32 [ %rd99 + 0 ], { %r122, %r123 };
+	// end inline asm
+	.loc	1 162 56                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:56
+	cvt.rn.bf16x2.f32 	%r288, %r287, %r286;
+	bar.sync 	0;
+	st.shared.b32 	[%r14], %r259;
+	st.shared.b32 	[%r15], %r288;
+	bar.sync 	0;
+	ld.shared.v2.b16 	{%rs94, %rs95}, [%r16];
+	ld.shared.v2.b16 	{%rs96, %rs97}, [%r16+256];
+	mov.b32 	%r124, {%rs94, %rs96};
+	mov.b32 	%r125, {%rs95, %rs97};
+	// begin inline asm
+	@%p3 st.global.v2.b32 [ %rd100 + 0 ], { %r124, %r125 };
+	// end inline asm
+	.loc	1 53 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43
+	add.s64 	%rd106, %rd106, 16;
+	add.s64 	%rd105, %rd105, 8;
+	add.s64 	%rd104, %rd104, 32;
+	add.s64 	%rd103, %rd103, 32;
+	setp.lt.u64 	%p6, %rd105, 120;
+	@%p6 bra 	$L__BB0_3;
+// %bb.4:
+	.loc	1 53 4                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:4
+	ret;
+$L__tmp8:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 5                                   // DW_FORM_data2
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 456                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x1c1 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 98
+.b8 118
+.b8 113
+.b8 104
+.b8 106
+.b8 116
+.b8 121
+.b8 103
+.b8 55
+.b8 102
+.b8 118
+.b8 120
+.b8 122
+.b8 119
+.b8 116
+.b8 98
+.b8 116
+.b8 116
+.b8 52
+.b8 118
+.b8 114
+.b8 100
+.b8 107
+.b8 98
+.b8 110
+.b8 98
+.b8 54
+.b8 110
+.b8 51
+.b8 50
+.b8 102
+.b8 110
+.b8 114
+.b8 105
+.b8 106
+.b8 106
+.b8 112
+.b8 108
+.b8 51
+.b8 118
+.b8 118
+.b8 52
+.b8 99
+.b8 102
+.b8 113
+.b8 100
+.b8 52
+.b8 109
+.b8 122
+.b8 110
+.b8 114
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 98
+.b8 118
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x6d DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 114
+.b8 109
+.b8 115
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 95
+.b8 116
+.b8 111
+.b8 95
+.b8 99
+.b8 111
+.b8 112
+.b8 121
+.b8 95
+.b8 97
+.b8 100
+.b8 100
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 110
+.b8 101
+.b8 103
+.b8 95
+.b8 115
+.b8 112
+.b8 108
+.b8 105
+.b8 116
+.b8 95
+.b8 115
+.b8 112
+.b8 108
+.b8 105
+.b8 116
+.b8 95
+.b8 119
+.b8 105
+.b8 116
+.b8 104
+.b8 95
+.b8 115
+.b8 105
+.b8 122
+.b8 101
+.b8 115
+.b8 95
+.b8 115
+.b8 116
+.b8 97
+.b8 99
+.b8 107
+.b8 95
+.b8 117
+.b8 110
+.b8 98
+.b8 105
+.b8 110
+.b8 100
+.b8 95
+.b8 117
+.b8 110
+.b8 115
+.b8 113
+.b8 117
+.b8 101
+.b8 101
+.b8 122
+.b8 101
+.b8 95
+.b8 118
+.b8 105
+.b8 101
+.b8 119
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x151:0x7a DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x166:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp4                           // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 51                                  // DW_AT_call_line
+.b8 25                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x17e:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp4                           // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 4                                   // Abbrev [4] 0x198:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp4                           // DW_AT_low_pc
+.b64 $L__tmp7                           // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 52                                  // DW_AT_call_line
+.b8 27                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x1b0:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp4                           // DW_AT_low_pc
+.b64 $L__tmp7                           // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..cdae67b40e78911a51e4fe2359a389cd266df553
--- /dev/null
+++ b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source
@@ -0,0 +1,972 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc213 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0)
+#loc215 = loc(unknown)
+#loc218 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0)
+#loc222 = loc("in_out_ptr0"(#loc))
+#loc223 = loc("in_out_ptr1"(#loc))
+#loc224 = loc("in_ptr0"(#loc))
+#loc225 = loc("in_ptr1"(#loc))
+#loc226 = loc("in_ptr2"(#loc))
+#loc227 = loc("in_ptr3"(#loc))
+#loc228 = loc("in_ptr4"(#loc))
+#loc229 = loc("xnumel"(#loc))
+#loc230 = loc("r0_numel"(#loc))
+#loc432 = loc("input"(#loc213))
+#loc433 = loc("a"(#loc218))
+#loc434 = loc("b"(#loc218))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 73728 : i32 loc(#loc231)
+    %r0_numel_1 = arith.constant 128 : i32 loc(#loc232)
+    %xoffset = tt.get_program_id x : i32 loc(#loc233)
+    %xoffset_2 = arith.constant 64 : i32 loc(#loc234)
+    %xoffset_3 = arith.constant 64 : i32 loc(#loc234)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc234)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc235)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc236)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc237)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc237)
+    %xmask = arith.constant true loc(#loc238)
+    %xmask_8 = arith.constant dense<true> : tensor<64x8xi1> loc(#loc238)
+    %r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc239)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32> -> tensor<1x8xi32> loc(#loc240)
+    %x0 = arith.constant 32 : i32 loc(#loc241)
+    %x0_10 = arith.constant 32 : i32 loc(#loc241)
+    %x0_11 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc241)
+    %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc241)
+    %x1 = arith.constant 32 : i32 loc(#loc242)
+    %x1_13 = arith.constant 32 : i32 loc(#loc242)
+    %x1_14 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc242)
+    %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc242)
+    %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc243)
+    %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc243)
+    %_tmp10 = arith.constant 0.000000e+00 : f32 loc(#loc244)
+    %_tmp10_17 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc244)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc15)
+    %c8_i32 = arith.constant 8 : i32 loc(#loc15)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15)
+    %2 = arith.bitcast %c8_i32 : i32 to i32 loc(#loc15)
+    %3 = ub.poison : i32 loc(#loc15)
+    %_tmp10_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_16, %_tmp10_24 = %_tmp10_17) -> (tensor<64x8xf32>, tensor<64x8xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x8xi32> loc(#loc246)
+      %r0_index_25 = arith.addi %r0_index, %r0_base_9 : tensor<1x8xi32> loc(#loc246)
+      %r0_mask = arith.constant dense<128> : tensor<1x8xi32> loc(#loc247)
+      %r0_mask_26 = arith.cmpi slt, %r0_index_25, %r0_mask : tensor<1x8xi32> loc(#loc247)
+      %tmp0 = arith.constant 4096 : i32 loc(#loc248)
+      %tmp0_27 = arith.constant 4096 : i32 loc(#loc248)
+      %tmp0_28 = arith.constant dense<4096> : tensor<1x8xi32> loc(#loc248)
+      %tmp0_29 = arith.addi %tmp0_28, %r0_index_25 : tensor<1x8xi32> loc(#loc248)
+      %tmp0_30 = arith.constant 128 : i32 loc(#loc249)
+      %tmp0_31 = arith.constant 128 : i32 loc(#loc249)
+      %tmp0_32 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc249)
+      %tmp0_33 = arith.muli %tmp0_32, %x0_12 : tensor<64x1xi32> loc(#loc249)
+      %tmp0_34 = tt.broadcast %tmp0_29 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc250)
+      %tmp0_35 = tt.broadcast %tmp0_33 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc250)
+      %tmp0_36 = arith.addi %tmp0_34, %tmp0_35 : tensor<64x8xi32> loc(#loc250)
+      %tmp0_37 = arith.constant 36864 : i32 loc(#loc251)
+      %tmp0_38 = arith.constant 36864 : i32 loc(#loc251)
+      %tmp0_39 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc251)
+      %tmp0_40 = arith.muli %tmp0_39, %x1_15 : tensor<64x1xi32> loc(#loc251)
+      %tmp0_41 = tt.broadcast %tmp0_40 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc252)
+      %tmp0_42 = arith.addi %tmp0_36, %tmp0_41 : tensor<64x8xi32> loc(#loc252)
+      %tmp0_43 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc253)
+      %tmp0_44 = tt.addptr %tmp0_43, %tmp0_42 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc253)
+      %tmp0_45 = arith.constant 0.000000e+00 : f32 loc(#loc254)
+      %tmp0_46 = tt.broadcast %r0_mask_26 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc254)
+      %tmp0_47 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc254)
+      %tmp0_48 = arith.truncf %tmp0_47 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc254)
+      %tmp0_49 = tt.load %tmp0_44, %tmp0_46, %tmp0_48 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>> loc(#loc254)
+      %tmp0_50 = arith.extf %tmp0_49 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc255)
+      %tmp6 = arith.constant 128 : i32 loc(#loc256)
+      %tmp6_51 = arith.constant 128 : i32 loc(#loc256)
+      %tmp6_52 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc256)
+      %tmp6_53 = arith.muli %tmp6_52, %x0_12 : tensor<64x1xi32> loc(#loc256)
+      %tmp6_54 = tt.broadcast %r0_index_25 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc257)
+      %tmp6_55 = tt.broadcast %tmp6_53 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc257)
+      %tmp6_56 = arith.addi %tmp6_54, %tmp6_55 : tensor<64x8xi32> loc(#loc257)
+      %tmp6_57 = arith.constant 36864 : i32 loc(#loc258)
+      %tmp6_58 = arith.constant 36864 : i32 loc(#loc258)
+      %tmp6_59 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc258)
+      %tmp6_60 = arith.muli %tmp6_59, %x1_15 : tensor<64x1xi32> loc(#loc258)
+      %tmp6_61 = tt.broadcast %tmp6_60 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc259)
+      %tmp6_62 = arith.addi %tmp6_56, %tmp6_61 : tensor<64x8xi32> loc(#loc259)
+      %tmp6_63 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc260)
+      %tmp6_64 = tt.addptr %tmp6_63, %tmp6_62 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc260)
+      %tmp6_65 = arith.constant 0.000000e+00 : f32 loc(#loc261)
+      %tmp6_66 = tt.broadcast %r0_mask_26 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc261)
+      %tmp6_67 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc261)
+      %tmp6_68 = arith.truncf %tmp6_67 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc261)
+      %tmp6_69 = tt.load %tmp6_64, %tmp6_66, %tmp6_68 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>> loc(#loc261)
+      %tmp6_70 = arith.extf %tmp6_69 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc262)
+      %tmp2 = arith.mulf %tmp0_50, %tmp0_50 : tensor<64x8xf32> loc(#loc263)
+      %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<64x8xf32> loc(#loc264)
+      %_tmp4_71 = tt.broadcast %r0_mask_26 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc265)
+      %_tmp4_72 = arith.select %_tmp4_71, %tmp5, %_tmp4_23 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc265)
+      %tmp8 = arith.mulf %tmp6_70, %tmp6_70 : tensor<64x8xf32> loc(#loc266)
+      %tmp11 = arith.addf %_tmp10_24, %tmp8 : tensor<64x8xf32> loc(#loc267)
+      %_tmp10_73 = tt.broadcast %r0_mask_26 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc268)
+      %_tmp10_74 = arith.select %_tmp10_73, %tmp11, %_tmp10_24 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc268)
+      scf.yield %_tmp4_72, %_tmp10_74 : tensor<64x8xf32>, tensor<64x8xf32> loc(#loc39)
+    } loc(#loc435)
+    %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#0) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc269)
+    %tmp4_19 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc270)
+    %tmp10 = tt.call @"triton.language.standard.sum__fp32S64_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#1) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc271)
+    %tmp10_20 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc272)
+    %c0_i32_21 = arith.constant 0 : i32 loc(#loc44)
+    %c8_i32_22 = arith.constant 8 : i32 loc(#loc44)
+    %4 = arith.bitcast %c0_i32_21 : i32 to i32 loc(#loc44)
+    %5 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc44)
+    %6 = arith.bitcast %c8_i32_22 : i32 to i32 loc(#loc44)
+    %7 = ub.poison : i32 loc(#loc44)
+    scf.for %r0_offset = %4 to %5 step %6  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x8xi32> loc(#loc273)
+      %r0_index_23 = arith.addi %r0_index, %r0_base_9 : tensor<1x8xi32> loc(#loc273)
+      %r0_mask = arith.constant dense<128> : tensor<1x8xi32> loc(#loc274)
+      %r0_mask_24 = arith.cmpi slt, %r0_index_23, %r0_mask : tensor<1x8xi32> loc(#loc274)
+      %r0_3 = arith.constant 2 : i32 loc(#loc275)
+      %r0_3_25 = arith.constant 2 : i32 loc(#loc275)
+      %r0_3_26 = arith.constant dense<2> : tensor<1x8xi32> loc(#loc275)
+      %r0_3_27 = arith.remsi %r0_index_23, %r0_3_26 : tensor<1x8xi32> loc(#loc275)
+      %r0_4 = arith.constant 2 : i32 loc(#loc276)
+      %r0_4_28 = arith.constant 2 : i32 loc(#loc276)
+      %r0_4_29 = arith.constant dense<2> : tensor<1x8xi32> loc(#loc276)
+      %r0_4_30 = arith.divsi %r0_index_23, %r0_4_29 : tensor<1x8xi32> loc(#loc276)
+      %tmp50 = arith.constant 128 : i32 loc(#loc277)
+      %tmp50_31 = arith.constant 128 : i32 loc(#loc277)
+      %tmp50_32 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc277)
+      %tmp50_33 = arith.muli %tmp50_32, %x0_12 : tensor<64x1xi32> loc(#loc277)
+      %tmp50_34 = tt.broadcast %r0_index_23 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc278)
+      %tmp50_35 = tt.broadcast %tmp50_33 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc278)
+      %tmp50_36 = arith.addi %tmp50_34, %tmp50_35 : tensor<64x8xi32> loc(#loc278)
+      %tmp50_37 = arith.constant 36864 : i32 loc(#loc279)
+      %tmp50_38 = arith.constant 36864 : i32 loc(#loc279)
+      %tmp50_39 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc279)
+      %tmp50_40 = arith.muli %tmp50_39, %x1_15 : tensor<64x1xi32> loc(#loc279)
+      %tmp50_41 = tt.broadcast %tmp50_40 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc280)
+      %tmp50_42 = arith.addi %tmp50_36, %tmp50_41 : tensor<64x8xi32> loc(#loc280)
+      %tmp50_43 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc281)
+      %tmp50_44 = tt.addptr %tmp50_43, %tmp50_42 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc281)
+      %tmp50_45 = arith.constant 0.000000e+00 : f32 loc(#loc282)
+      %tmp50_46 = tt.broadcast %r0_mask_24 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc282)
+      %tmp50_47 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc282)
+      %tmp50_48 = arith.truncf %tmp50_47 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc282)
+      %tmp50_49 = tt.load %tmp50_44, %tmp50_46, %tmp50_48 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>> loc(#loc282)
+      %tmp50_50 = arith.extf %tmp50_49 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc283)
+      %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x8x!tt.ptr<bf16>> loc(#loc284)
+      %tmp58_51 = tt.addptr %tmp58, %r0_index_23 : tensor<1x8x!tt.ptr<bf16>>, tensor<1x8xi32> loc(#loc284)
+      %tmp58_52 = arith.constant 0.000000e+00 : f32 loc(#loc285)
+      %tmp58_53 = arith.constant dense<0.000000e+00> : tensor<1x8xf32> loc(#loc285)
+      %tmp58_54 = arith.truncf %tmp58_53 : tensor<1x8xf32> to tensor<1x8xbf16> loc(#loc285)
+      %tmp58_55 = tt.load %tmp58_51, %r0_mask_24, %tmp58_54 evictionPolicy = evict_last : tensor<1x8x!tt.ptr<bf16>> loc(#loc285)
+      %tmp58_56 = arith.extf %tmp58_55 : tensor<1x8xbf16> to tensor<1x8xf32> loc(#loc286)
+      %tmp63 = arith.constant 128 : i32 loc(#loc287)
+      %tmp63_57 = arith.constant 128 : i32 loc(#loc287)
+      %tmp63_58 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc287)
+      %tmp63_59 = arith.muli %tmp63_58, %x1_15 : tensor<64x1xi32> loc(#loc287)
+      %tmp63_60 = tt.broadcast %r0_index_23 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc288)
+      %tmp63_61 = tt.broadcast %tmp63_59 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc288)
+      %tmp63_62 = arith.addi %tmp63_60, %tmp63_61 : tensor<64x8xi32> loc(#loc288)
+      %tmp63_63 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<64x8x!tt.ptr<f32>> loc(#loc289)
+      %tmp63_64 = tt.addptr %tmp63_63, %tmp63_62 : tensor<64x8x!tt.ptr<f32>>, tensor<64x8xi32> loc(#loc289)
+      %tmp63_65 = arith.constant 0.000000e+00 : f32 loc(#loc290)
+      %tmp63_66 = tt.broadcast %r0_mask_24 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc290)
+      %tmp63_67 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc290)
+      %tmp63_68 = tt.load %tmp63_64, %tmp63_66, %tmp63_67 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<f32>> loc(#loc290)
+      %tmp66 = arith.constant 128 : i32 loc(#loc291)
+      %tmp66_69 = arith.constant 128 : i32 loc(#loc291)
+      %tmp66_70 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc291)
+      %tmp66_71 = arith.muli %tmp66_70, %x1_15 : tensor<64x1xi32> loc(#loc291)
+      %tmp66_72 = tt.broadcast %r0_index_23 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc292)
+      %tmp66_73 = tt.broadcast %tmp66_71 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc292)
+      %tmp66_74 = arith.addi %tmp66_72, %tmp66_73 : tensor<64x8xi32> loc(#loc292)
+      %tmp66_75 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<64x8x!tt.ptr<f32>> loc(#loc293)
+      %tmp66_76 = tt.addptr %tmp66_75, %tmp66_74 : tensor<64x8x!tt.ptr<f32>>, tensor<64x8xi32> loc(#loc293)
+      %tmp66_77 = arith.constant 0.000000e+00 : f32 loc(#loc294)
+      %tmp66_78 = tt.broadcast %r0_mask_24 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc294)
+      %tmp66_79 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc294)
+      %tmp66_80 = tt.load %tmp66_76, %tmp66_78, %tmp66_79 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<f32>> loc(#loc294)
+      %tmp96 = arith.constant 4096 : i32 loc(#loc295)
+      %tmp96_81 = arith.constant 4096 : i32 loc(#loc295)
+      %tmp96_82 = arith.constant dense<4096> : tensor<1x8xi32> loc(#loc295)
+      %tmp96_83 = arith.addi %tmp96_82, %r0_index_23 : tensor<1x8xi32> loc(#loc295)
+      %tmp96_84 = arith.constant 128 : i32 loc(#loc296)
+      %tmp96_85 = arith.constant 128 : i32 loc(#loc296)
+      %tmp96_86 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc296)
+      %tmp96_87 = arith.muli %tmp96_86, %x0_12 : tensor<64x1xi32> loc(#loc296)
+      %tmp96_88 = tt.broadcast %tmp96_83 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc297)
+      %tmp96_89 = tt.broadcast %tmp96_87 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc297)
+      %tmp96_90 = arith.addi %tmp96_88, %tmp96_89 : tensor<64x8xi32> loc(#loc297)
+      %tmp96_91 = arith.constant 36864 : i32 loc(#loc298)
+      %tmp96_92 = arith.constant 36864 : i32 loc(#loc298)
+      %tmp96_93 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc298)
+      %tmp96_94 = arith.muli %tmp96_93, %x1_15 : tensor<64x1xi32> loc(#loc298)
+      %tmp96_95 = tt.broadcast %tmp96_94 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc299)
+      %tmp96_96 = arith.addi %tmp96_90, %tmp96_95 : tensor<64x8xi32> loc(#loc299)
+      %tmp96_97 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc300)
+      %tmp96_98 = tt.addptr %tmp96_97, %tmp96_96 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc300)
+      %tmp96_99 = arith.constant 0.000000e+00 : f32 loc(#loc301)
+      %tmp96_100 = tt.broadcast %r0_mask_24 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc301)
+      %tmp96_101 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc301)
+      %tmp96_102 = arith.truncf %tmp96_101 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc301)
+      %tmp96_103 = tt.load %tmp96_98, %tmp96_100, %tmp96_102 evictionPolicy = evict_first : tensor<64x8x!tt.ptr<bf16>> loc(#loc301)
+      %tmp96_104 = arith.extf %tmp96_103 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc302)
+      %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x8x!tt.ptr<bf16>> loc(#loc303)
+      %tmp102_105 = tt.addptr %tmp102, %r0_index_23 : tensor<1x8x!tt.ptr<bf16>>, tensor<1x8xi32> loc(#loc303)
+      %tmp102_106 = arith.constant 0.000000e+00 : f32 loc(#loc304)
+      %tmp102_107 = arith.constant dense<0.000000e+00> : tensor<1x8xf32> loc(#loc304)
+      %tmp102_108 = arith.truncf %tmp102_107 : tensor<1x8xf32> to tensor<1x8xbf16> loc(#loc304)
+      %tmp102_109 = tt.load %tmp102_105, %r0_mask_24, %tmp102_108 evictionPolicy = evict_last : tensor<1x8x!tt.ptr<bf16>> loc(#loc304)
+      %tmp102_110 = arith.extf %tmp102_109 : tensor<1x8xbf16> to tensor<1x8xf32> loc(#loc305)
+      %tmp13 = arith.constant 0 : i64 loc(#loc306)
+      %tmp13_111 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc306)
+      %tmp14 = arith.extsi %r0_3_27 : tensor<1x8xi32> to tensor<1x8xi64> loc(#loc307)
+      %tmp14_112 = arith.constant dense<0> : tensor<1x8xi64> loc(#loc307)
+      %tmp14_113 = arith.cmpi sge, %tmp14, %tmp14_112 : tensor<1x8xi64> loc(#loc307)
+      %tmp15 = arith.constant 1 : i64 loc(#loc308)
+      %tmp15_114 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc308)
+      %tmp16 = arith.extsi %r0_3_27 : tensor<1x8xi32> to tensor<1x8xi64> loc(#loc309)
+      %tmp16_115 = arith.constant dense<1> : tensor<1x8xi64> loc(#loc309)
+      %tmp16_116 = arith.cmpi slt, %tmp16, %tmp16_115 : tensor<1x8xi64> loc(#loc309)
+      %tmp17 = arith.constant 2 : i32 loc(#loc310)
+      %tmp17_117 = arith.constant 2 : i32 loc(#loc310)
+      %tmp17_118 = arith.constant dense<2> : tensor<1x8xi32> loc(#loc310)
+      %tmp17_119 = arith.muli %tmp17_118, %r0_4_30 : tensor<1x8xi32> loc(#loc310)
+      %tmp17_120 = arith.constant 1 : i32 loc(#loc311)
+      %tmp17_121 = arith.constant 1 : i32 loc(#loc311)
+      %tmp17_122 = arith.constant dense<1> : tensor<1x8xi32> loc(#loc311)
+      %tmp17_123 = arith.addi %tmp17_122, %tmp17_119 : tensor<1x8xi32> loc(#loc311)
+      %tmp17_124 = arith.constant 128 : i32 loc(#loc312)
+      %tmp17_125 = arith.constant 128 : i32 loc(#loc312)
+      %tmp17_126 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc312)
+      %tmp17_127 = arith.muli %tmp17_126, %x0_12 : tensor<64x1xi32> loc(#loc312)
+      %tmp17_128 = tt.broadcast %tmp17_123 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc313)
+      %tmp17_129 = tt.broadcast %tmp17_127 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc313)
+      %tmp17_130 = arith.addi %tmp17_128, %tmp17_129 : tensor<64x8xi32> loc(#loc313)
+      %tmp17_131 = arith.constant 36864 : i32 loc(#loc314)
+      %tmp17_132 = arith.constant 36864 : i32 loc(#loc314)
+      %tmp17_133 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc314)
+      %tmp17_134 = arith.muli %tmp17_133, %x1_15 : tensor<64x1xi32> loc(#loc314)
+      %tmp17_135 = tt.broadcast %tmp17_134 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc315)
+      %tmp17_136 = arith.addi %tmp17_130, %tmp17_135 : tensor<64x8xi32> loc(#loc315)
+      %tmp17_137 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc316)
+      %tmp17_138 = tt.addptr %tmp17_137, %tmp17_136 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc316)
+      %tmp17_139 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x8xi1> loc(#loc317)
+      %tmp17_140 = arith.constant 0.000000e+00 : f32 loc(#loc318)
+      %tmp17_141 = tt.broadcast %tmp17_139 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc318)
+      %tmp17_142 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc318)
+      %tmp17_143 = arith.truncf %tmp17_142 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc318)
+      %tmp17_144 = tt.load %tmp17_138, %tmp17_141, %tmp17_143 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>> loc(#loc318)
+      %tmp17_145 = arith.extf %tmp17_144 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc319)
+      %tmp19 = arith.constant 1.280000e+02 : f32 loc(#loc320)
+      %tmp20 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc321)
+      %tmp20_146 = arith.divf %tmp10_20, %tmp20 : tensor<64x1xf32> loc(#loc321)
+      %tmp21 = arith.constant 9.99999997E-7 : f32 loc(#loc322)
+      %tmp22 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc323)
+      %tmp22_147 = arith.addf %tmp20_146, %tmp22 : tensor<64x1xf32> loc(#loc323)
+      %tmp23 = tt.extern_elementwise %tmp22_147 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc324)
+      %tmp24 = tt.broadcast %tmp23 : tensor<64x1xf32> -> tensor<64x8xf32> loc(#loc325)
+      %tmp24_148 = arith.mulf %tmp17_145, %tmp24 : tensor<64x8xf32> loc(#loc325)
+      %tmp25 = arith.constant 2 : i32 loc(#loc326)
+      %tmp25_149 = arith.constant 2 : i32 loc(#loc326)
+      %tmp25_150 = arith.constant dense<2> : tensor<1x8xi32> loc(#loc326)
+      %tmp25_151 = arith.muli %tmp25_150, %r0_4_30 : tensor<1x8xi32> loc(#loc326)
+      %tmp25_152 = arith.constant 1 : i32 loc(#loc327)
+      %tmp25_153 = arith.constant 1 : i32 loc(#loc327)
+      %tmp25_154 = arith.constant dense<1> : tensor<1x8xi32> loc(#loc327)
+      %tmp25_155 = arith.addi %tmp25_154, %tmp25_151 : tensor<1x8xi32> loc(#loc327)
+      %tmp25_156 = tt.broadcast %tmp25_155 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc328)
+      %tmp25_157 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc329)
+      %tmp25_158 = tt.addptr %tmp25_157, %tmp25_156 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc329)
+      %tmp25_159 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x8xi1> loc(#loc330)
+      %tmp25_160 = arith.constant 0.000000e+00 : f32 loc(#loc331)
+      %tmp25_161 = tt.broadcast %tmp25_159 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc331)
+      %tmp25_162 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc331)
+      %tmp25_163 = arith.truncf %tmp25_162 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc331)
+      %tmp25_164 = tt.load %tmp25_158, %tmp25_161, %tmp25_163 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>> loc(#loc331)
+      %tmp25_165 = arith.extf %tmp25_164 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc332)
+      %tmp27 = arith.mulf %tmp24_148, %tmp25_165 : tensor<64x8xf32> loc(#loc333)
+      %tmp29 = arith.constant 0.000000e+00 : f32 loc(#loc334)
+      %tmp29_166 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc334)
+      %tmp29_167 = arith.subf %tmp29_166, %tmp27 : tensor<64x8xf32> loc(#loc334)
+      %tmp30 = arith.constant 0.000000e+00 : f32 loc(#loc335)
+      %tmp30_168 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc335)
+      %tmp31 = tt.broadcast %tmp16_116 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc336)
+      %tmp31_169 = arith.select %tmp31, %tmp29_167, %tmp30_168 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc336)
+      %tmp32 = arith.extsi %r0_3_27 : tensor<1x8xi32> to tensor<1x8xi64> loc(#loc337)
+      %tmp32_170 = arith.constant dense<1> : tensor<1x8xi64> loc(#loc337)
+      %tmp32_171 = arith.cmpi sge, %tmp32, %tmp32_170 : tensor<1x8xi64> loc(#loc337)
+      %tmp33 = arith.constant 2 : i64 loc(#loc338)
+      %tmp33_172 = arith.constant dense<2> : tensor<1x1xi64> loc(#loc338)
+      %tmp34 = arith.extsi %r0_3_27 : tensor<1x8xi32> to tensor<1x8xi64> loc(#loc339)
+      %tmp34_173 = arith.constant dense<2> : tensor<1x8xi64> loc(#loc339)
+      %tmp34_174 = arith.cmpi slt, %tmp34, %tmp34_173 : tensor<1x8xi64> loc(#loc339)
+      %tmp35 = arith.constant 2 : i32 loc(#loc340)
+      %tmp35_175 = arith.constant 2 : i32 loc(#loc340)
+      %tmp35_176 = arith.constant dense<2> : tensor<1x8xi32> loc(#loc340)
+      %tmp35_177 = arith.muli %tmp35_176, %r0_4_30 : tensor<1x8xi32> loc(#loc340)
+      %tmp35_178 = arith.constant 128 : i32 loc(#loc341)
+      %tmp35_179 = arith.constant 128 : i32 loc(#loc341)
+      %tmp35_180 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc341)
+      %tmp35_181 = arith.muli %tmp35_180, %x0_12 : tensor<64x1xi32> loc(#loc341)
+      %tmp35_182 = tt.broadcast %tmp35_177 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc342)
+      %tmp35_183 = tt.broadcast %tmp35_181 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc342)
+      %tmp35_184 = arith.addi %tmp35_182, %tmp35_183 : tensor<64x8xi32> loc(#loc342)
+      %tmp35_185 = arith.constant 36864 : i32 loc(#loc343)
+      %tmp35_186 = arith.constant 36864 : i32 loc(#loc343)
+      %tmp35_187 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc343)
+      %tmp35_188 = arith.muli %tmp35_187, %x1_15 : tensor<64x1xi32> loc(#loc343)
+      %tmp35_189 = tt.broadcast %tmp35_188 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc344)
+      %tmp35_190 = arith.addi %tmp35_184, %tmp35_189 : tensor<64x8xi32> loc(#loc344)
+      %tmp35_191 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc345)
+      %tmp35_192 = tt.addptr %tmp35_191, %tmp35_190 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc345)
+      %tmp35_193 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x8xi1> loc(#loc346)
+      %tmp35_194 = arith.constant 0.000000e+00 : f32 loc(#loc347)
+      %tmp35_195 = tt.broadcast %tmp35_193 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc347)
+      %tmp35_196 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc347)
+      %tmp35_197 = arith.truncf %tmp35_196 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc347)
+      %tmp35_198 = tt.load %tmp35_192, %tmp35_195, %tmp35_197 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>> loc(#loc347)
+      %tmp35_199 = arith.extf %tmp35_198 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc348)
+      %tmp37 = arith.constant 1.280000e+02 : f32 loc(#loc349)
+      %tmp38 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc350)
+      %tmp38_200 = arith.divf %tmp10_20, %tmp38 : tensor<64x1xf32> loc(#loc350)
+      %tmp39 = arith.constant 9.99999997E-7 : f32 loc(#loc351)
+      %tmp40 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc352)
+      %tmp40_201 = arith.addf %tmp38_200, %tmp40 : tensor<64x1xf32> loc(#loc352)
+      %tmp41 = tt.extern_elementwise %tmp40_201 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc353)
+      %tmp42 = tt.broadcast %tmp41 : tensor<64x1xf32> -> tensor<64x8xf32> loc(#loc354)
+      %tmp42_202 = arith.mulf %tmp35_199, %tmp42 : tensor<64x8xf32> loc(#loc354)
+      %tmp43 = arith.constant 2 : i32 loc(#loc355)
+      %tmp43_203 = arith.constant 2 : i32 loc(#loc355)
+      %tmp43_204 = arith.constant dense<2> : tensor<1x8xi32> loc(#loc355)
+      %tmp43_205 = arith.muli %tmp43_204, %r0_4_30 : tensor<1x8xi32> loc(#loc355)
+      %tmp43_206 = tt.broadcast %tmp43_205 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc356)
+      %tmp43_207 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc357)
+      %tmp43_208 = tt.addptr %tmp43_207, %tmp43_206 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc357)
+      %tmp43_209 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x8xi1> loc(#loc358)
+      %tmp43_210 = arith.constant 0.000000e+00 : f32 loc(#loc359)
+      %tmp43_211 = tt.broadcast %tmp43_209 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc359)
+      %tmp43_212 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc359)
+      %tmp43_213 = arith.truncf %tmp43_212 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc359)
+      %tmp43_214 = tt.load %tmp43_208, %tmp43_211, %tmp43_213 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>> loc(#loc359)
+      %tmp43_215 = arith.extf %tmp43_214 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc360)
+      %tmp45 = arith.mulf %tmp42_202, %tmp43_215 : tensor<64x8xf32> loc(#loc361)
+      %tmp47 = arith.constant 0.000000e+00 : f32 loc(#loc362)
+      %tmp47_216 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc362)
+      %tmp48 = tt.broadcast %tmp32_171 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc363)
+      %tmp48_217 = arith.select %tmp48, %tmp45, %tmp47_216 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc363)
+      %tmp49 = tt.broadcast %tmp16_116 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc364)
+      %tmp49_218 = arith.select %tmp49, %tmp31_169, %tmp48_217 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc364)
+      %tmp52 = arith.constant 1.280000e+02 : f32 loc(#loc365)
+      %tmp53 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc366)
+      %tmp53_219 = arith.divf %tmp10_20, %tmp53 : tensor<64x1xf32> loc(#loc366)
+      %tmp54 = arith.constant 9.99999997E-7 : f32 loc(#loc367)
+      %tmp55 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc368)
+      %tmp55_220 = arith.addf %tmp53_219, %tmp55 : tensor<64x1xf32> loc(#loc368)
+      %tmp56 = tt.extern_elementwise %tmp55_220 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc369)
+      %tmp57 = tt.broadcast %tmp56 : tensor<64x1xf32> -> tensor<64x8xf32> loc(#loc370)
+      %tmp57_221 = arith.mulf %tmp50_50, %tmp57 : tensor<64x8xf32> loc(#loc370)
+      %tmp60 = tt.broadcast %tmp58_56 : tensor<1x8xf32> -> tensor<64x8xf32> loc(#loc371)
+      %tmp60_222 = arith.mulf %tmp57_221, %tmp60 : tensor<64x8xf32> loc(#loc371)
+      %tmp64 = arith.mulf %tmp60_222, %tmp63_68 : tensor<64x8xf32> loc(#loc372)
+      %tmp67 = arith.mulf %tmp49_218, %tmp66_80 : tensor<64x8xf32> loc(#loc373)
+      %tmp68 = arith.addf %tmp64, %tmp67 : tensor<64x8xf32> loc(#loc374)
+      %tmp70 = arith.constant 2 : i32 loc(#loc375)
+      %tmp70_223 = arith.constant 2 : i32 loc(#loc375)
+      %tmp70_224 = arith.constant dense<2> : tensor<1x8xi32> loc(#loc375)
+      %tmp70_225 = arith.muli %tmp70_224, %r0_4_30 : tensor<1x8xi32> loc(#loc375)
+      %tmp70_226 = arith.constant 4097 : i32 loc(#loc376)
+      %tmp70_227 = arith.constant 4097 : i32 loc(#loc376)
+      %tmp70_228 = arith.constant dense<4097> : tensor<1x8xi32> loc(#loc376)
+      %tmp70_229 = arith.addi %tmp70_228, %tmp70_225 : tensor<1x8xi32> loc(#loc376)
+      %tmp70_230 = arith.constant 128 : i32 loc(#loc377)
+      %tmp70_231 = arith.constant 128 : i32 loc(#loc377)
+      %tmp70_232 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc377)
+      %tmp70_233 = arith.muli %tmp70_232, %x0_12 : tensor<64x1xi32> loc(#loc377)
+      %tmp70_234 = tt.broadcast %tmp70_229 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc378)
+      %tmp70_235 = tt.broadcast %tmp70_233 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc378)
+      %tmp70_236 = arith.addi %tmp70_234, %tmp70_235 : tensor<64x8xi32> loc(#loc378)
+      %tmp70_237 = arith.constant 36864 : i32 loc(#loc379)
+      %tmp70_238 = arith.constant 36864 : i32 loc(#loc379)
+      %tmp70_239 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc379)
+      %tmp70_240 = arith.muli %tmp70_239, %x1_15 : tensor<64x1xi32> loc(#loc379)
+      %tmp70_241 = tt.broadcast %tmp70_240 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc380)
+      %tmp70_242 = arith.addi %tmp70_236, %tmp70_241 : tensor<64x8xi32> loc(#loc380)
+      %tmp70_243 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc381)
+      %tmp70_244 = tt.addptr %tmp70_243, %tmp70_242 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc381)
+      %tmp70_245 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x8xi1> loc(#loc382)
+      %tmp70_246 = arith.constant 0.000000e+00 : f32 loc(#loc383)
+      %tmp70_247 = tt.broadcast %tmp70_245 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc383)
+      %tmp70_248 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc383)
+      %tmp70_249 = arith.truncf %tmp70_248 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc383)
+      %tmp70_250 = tt.load %tmp70_244, %tmp70_247, %tmp70_249 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>> loc(#loc383)
+      %tmp70_251 = arith.extf %tmp70_250 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc384)
+      %tmp72 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc385)
+      %tmp72_252 = arith.divf %tmp4_19, %tmp72 : tensor<64x1xf32> loc(#loc385)
+      %tmp73 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc386)
+      %tmp73_253 = arith.addf %tmp72_252, %tmp73 : tensor<64x1xf32> loc(#loc386)
+      %tmp74 = tt.extern_elementwise %tmp73_253 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc387)
+      %tmp75 = tt.broadcast %tmp74 : tensor<64x1xf32> -> tensor<64x8xf32> loc(#loc388)
+      %tmp75_254 = arith.mulf %tmp70_251, %tmp75 : tensor<64x8xf32> loc(#loc388)
+      %tmp76 = arith.constant 2 : i32 loc(#loc389)
+      %tmp76_255 = arith.constant 2 : i32 loc(#loc389)
+      %tmp76_256 = arith.constant dense<2> : tensor<1x8xi32> loc(#loc389)
+      %tmp76_257 = arith.muli %tmp76_256, %r0_4_30 : tensor<1x8xi32> loc(#loc389)
+      %tmp76_258 = arith.constant 1 : i32 loc(#loc390)
+      %tmp76_259 = arith.constant 1 : i32 loc(#loc390)
+      %tmp76_260 = arith.constant dense<1> : tensor<1x8xi32> loc(#loc390)
+      %tmp76_261 = arith.addi %tmp76_260, %tmp76_257 : tensor<1x8xi32> loc(#loc390)
+      %tmp76_262 = tt.broadcast %tmp76_261 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc391)
+      %tmp76_263 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc392)
+      %tmp76_264 = tt.addptr %tmp76_263, %tmp76_262 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc392)
+      %tmp76_265 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x8xi1> loc(#loc393)
+      %tmp76_266 = arith.constant 0.000000e+00 : f32 loc(#loc394)
+      %tmp76_267 = tt.broadcast %tmp76_265 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc394)
+      %tmp76_268 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc394)
+      %tmp76_269 = arith.truncf %tmp76_268 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc394)
+      %tmp76_270 = tt.load %tmp76_264, %tmp76_267, %tmp76_269 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>> loc(#loc394)
+      %tmp76_271 = arith.extf %tmp76_270 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc395)
+      %tmp78 = arith.mulf %tmp75_254, %tmp76_271 : tensor<64x8xf32> loc(#loc396)
+      %tmp80 = arith.constant 0.000000e+00 : f32 loc(#loc397)
+      %tmp80_272 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc397)
+      %tmp80_273 = arith.subf %tmp80_272, %tmp78 : tensor<64x8xf32> loc(#loc397)
+      %tmp81 = arith.constant 0.000000e+00 : f32 loc(#loc398)
+      %tmp81_274 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc398)
+      %tmp82 = tt.broadcast %tmp16_116 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc399)
+      %tmp82_275 = arith.select %tmp82, %tmp80_273, %tmp81_274 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc399)
+      %tmp83 = arith.constant 2 : i32 loc(#loc400)
+      %tmp83_276 = arith.constant 2 : i32 loc(#loc400)
+      %tmp83_277 = arith.constant dense<2> : tensor<1x8xi32> loc(#loc400)
+      %tmp83_278 = arith.muli %tmp83_277, %r0_4_30 : tensor<1x8xi32> loc(#loc400)
+      %tmp83_279 = arith.constant 4096 : i32 loc(#loc401)
+      %tmp83_280 = arith.constant 4096 : i32 loc(#loc401)
+      %tmp83_281 = arith.constant dense<4096> : tensor<1x8xi32> loc(#loc401)
+      %tmp83_282 = arith.addi %tmp83_281, %tmp83_278 : tensor<1x8xi32> loc(#loc401)
+      %tmp83_283 = arith.constant 128 : i32 loc(#loc402)
+      %tmp83_284 = arith.constant 128 : i32 loc(#loc402)
+      %tmp83_285 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc402)
+      %tmp83_286 = arith.muli %tmp83_285, %x0_12 : tensor<64x1xi32> loc(#loc402)
+      %tmp83_287 = tt.broadcast %tmp83_282 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc403)
+      %tmp83_288 = tt.broadcast %tmp83_286 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc403)
+      %tmp83_289 = arith.addi %tmp83_287, %tmp83_288 : tensor<64x8xi32> loc(#loc403)
+      %tmp83_290 = arith.constant 36864 : i32 loc(#loc404)
+      %tmp83_291 = arith.constant 36864 : i32 loc(#loc404)
+      %tmp83_292 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc404)
+      %tmp83_293 = arith.muli %tmp83_292, %x1_15 : tensor<64x1xi32> loc(#loc404)
+      %tmp83_294 = tt.broadcast %tmp83_293 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc405)
+      %tmp83_295 = arith.addi %tmp83_289, %tmp83_294 : tensor<64x8xi32> loc(#loc405)
+      %tmp83_296 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc406)
+      %tmp83_297 = tt.addptr %tmp83_296, %tmp83_295 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc406)
+      %tmp83_298 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x8xi1> loc(#loc407)
+      %tmp83_299 = arith.constant 0.000000e+00 : f32 loc(#loc408)
+      %tmp83_300 = tt.broadcast %tmp83_298 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc408)
+      %tmp83_301 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc408)
+      %tmp83_302 = arith.truncf %tmp83_301 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc408)
+      %tmp83_303 = tt.load %tmp83_297, %tmp83_300, %tmp83_302 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>> loc(#loc408)
+      %tmp83_304 = arith.extf %tmp83_303 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc409)
+      %tmp85 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc410)
+      %tmp85_305 = arith.divf %tmp4_19, %tmp85 : tensor<64x1xf32> loc(#loc410)
+      %tmp86 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc411)
+      %tmp86_306 = arith.addf %tmp85_305, %tmp86 : tensor<64x1xf32> loc(#loc411)
+      %tmp87 = tt.extern_elementwise %tmp86_306 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc412)
+      %tmp88 = tt.broadcast %tmp87 : tensor<64x1xf32> -> tensor<64x8xf32> loc(#loc413)
+      %tmp88_307 = arith.mulf %tmp83_304, %tmp88 : tensor<64x8xf32> loc(#loc413)
+      %tmp89 = arith.constant 2 : i32 loc(#loc414)
+      %tmp89_308 = arith.constant 2 : i32 loc(#loc414)
+      %tmp89_309 = arith.constant dense<2> : tensor<1x8xi32> loc(#loc414)
+      %tmp89_310 = arith.muli %tmp89_309, %r0_4_30 : tensor<1x8xi32> loc(#loc414)
+      %tmp89_311 = tt.broadcast %tmp89_310 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc415)
+      %tmp89_312 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc416)
+      %tmp89_313 = tt.addptr %tmp89_312, %tmp89_311 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc416)
+      %tmp89_314 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x8xi1> loc(#loc417)
+      %tmp89_315 = arith.constant 0.000000e+00 : f32 loc(#loc418)
+      %tmp89_316 = tt.broadcast %tmp89_314 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc418)
+      %tmp89_317 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc418)
+      %tmp89_318 = arith.truncf %tmp89_317 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc418)
+      %tmp89_319 = tt.load %tmp89_313, %tmp89_316, %tmp89_318 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>> loc(#loc418)
+      %tmp89_320 = arith.extf %tmp89_319 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc419)
+      %tmp91 = arith.mulf %tmp88_307, %tmp89_320 : tensor<64x8xf32> loc(#loc420)
+      %tmp93 = arith.constant 0.000000e+00 : f32 loc(#loc421)
+      %tmp93_321 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc421)
+      %tmp94 = tt.broadcast %tmp32_171 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc422)
+      %tmp94_322 = arith.select %tmp94, %tmp91, %tmp93_321 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc422)
+      %tmp95 = tt.broadcast %tmp16_116 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc423)
+      %tmp95_323 = arith.select %tmp95, %tmp82_275, %tmp94_322 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc423)
+      %tmp98 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc424)
+      %tmp98_324 = arith.divf %tmp4_19, %tmp98 : tensor<64x1xf32> loc(#loc424)
+      %tmp99 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc425)
+      %tmp99_325 = arith.addf %tmp98_324, %tmp99 : tensor<64x1xf32> loc(#loc425)
+      %tmp100 = tt.extern_elementwise %tmp99_325 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc426)
+      %tmp101 = tt.broadcast %tmp100 : tensor<64x1xf32> -> tensor<64x8xf32> loc(#loc427)
+      %tmp101_326 = arith.mulf %tmp96_104, %tmp101 : tensor<64x8xf32> loc(#loc427)
+      %tmp104 = tt.broadcast %tmp102_110 : tensor<1x8xf32> -> tensor<64x8xf32> loc(#loc428)
+      %tmp104_327 = arith.mulf %tmp101_326, %tmp104 : tensor<64x8xf32> loc(#loc428)
+      %tmp107 = arith.mulf %tmp104_327, %tmp63_68 : tensor<64x8xf32> loc(#loc429)
+      %tmp109 = arith.mulf %tmp95_323, %tmp66_80 : tensor<64x8xf32> loc(#loc430)
+      %tmp110 = arith.addf %tmp107, %tmp109 : tensor<64x8xf32> loc(#loc431)
+      %c128_i32 = arith.constant 128 : i32 loc(#loc204)
+      %c128_i32_328 = arith.constant 128 : i32 loc(#loc204)
+      %cst = arith.constant dense<128> : tensor<64x1xi32> loc(#loc204)
+      %8 = arith.muli %cst, %xindex_7 : tensor<64x1xi32> loc(#loc204)
+      %9 = tt.broadcast %r0_index_23 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc205)
+      %10 = tt.broadcast %8 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc205)
+      %11 = arith.addi %9, %10 : tensor<64x8xi32> loc(#loc205)
+      %12 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc206)
+      %13 = tt.addptr %12, %11 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc206)
+      %14 = tt.broadcast %r0_mask_24 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc207)
+      %15 = arith.truncf %tmp68 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc207)
+      tt.store %13, %15, %14 : tensor<64x8x!tt.ptr<bf16>> loc(#loc207)
+      %c128_i32_329 = arith.constant 128 : i32 loc(#loc208)
+      %c128_i32_330 = arith.constant 128 : i32 loc(#loc208)
+      %cst_331 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc208)
+      %16 = arith.muli %cst_331, %xindex_7 : tensor<64x1xi32> loc(#loc208)
+      %17 = tt.broadcast %r0_index_23 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc209)
+      %18 = tt.broadcast %16 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc209)
+      %19 = arith.addi %17, %18 : tensor<64x8xi32> loc(#loc209)
+      %20 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc210)
+      %21 = tt.addptr %20, %19 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc210)
+      %22 = tt.broadcast %r0_mask_24 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc211)
+      %23 = arith.truncf %tmp110 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc211)
+      tt.store %21, %23, %22 : tensor<64x8x!tt.ptr<bf16>> loc(#loc211)
+    } loc(#loc44)
+    tt.return loc(#loc212)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.sum__fp32S64_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x8xf32> loc("input"(#loc213))) -> tensor<64xf32> attributes {noinline = false} {
+    %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc214)
+      tt.reduce.return %2 : f32 loc(#loc214)
+    }) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc214)
+    tt.return %0 : tensor<64xf32> loc(#loc216)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<64xf32> loc(#loc217)
+    tt.return %1 : tensor<64xf32> loc(#loc217)
+  } loc(#loc213)
+  tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc218)), %b: f32 loc("b"(#loc218))) -> f32 attributes {noinline = false} {
+    %0 = arith.addf %a, %b : f32 loc(#loc219)
+    tt.return %0 : f32 loc(#loc220)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : f32 loc(#loc221)
+    tt.return %1 : f32 loc(#loc221)
+  } loc(#loc218)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":25:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":30:43)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":32:44)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:45)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:56)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:46)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:42)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:53)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:64)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":68:35)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":69:25)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":70:35)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:52)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:63)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":74:16)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":76:16)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:57)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:55)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:63)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:95)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":85:42)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":88:35)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":89:24)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:37)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:48)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:59)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":92:16)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":93:25)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":94:16)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":95:24)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":96:32)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:53)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:59)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:91)
+#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":102:42)
+#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":106:16)
+#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":107:25)
+#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":108:16)
+#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":109:24)
+#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":110:32)
+#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:44)
+#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc149 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:55)
+#loc150 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc151 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:66)
+#loc152 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc153 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc154 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:81)
+#loc155 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc156 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc157 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc158 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc159 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc160 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc161 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:57)
+#loc162 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:55)
+#loc163 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:63)
+#loc164 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc165 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:95)
+#loc166 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc167 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc168 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc169 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc170 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":132:42)
+#loc171 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc172 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:44)
+#loc173 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc174 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:55)
+#loc175 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc176 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:66)
+#loc177 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc178 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc179 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:81)
+#loc180 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc181 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc182 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":136:24)
+#loc183 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":137:24)
+#loc184 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":138:32)
+#loc185 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc186 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:53)
+#loc187 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:59)
+#loc188 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc189 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:91)
+#loc190 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc191 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc192 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc193 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":144:42)
+#loc194 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc195 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc196 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":148:24)
+#loc197 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":149:24)
+#loc198 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":150:33)
+#loc199 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc200 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc201 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc202 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc203 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc204 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc205 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc206 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc207 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc208 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:43)
+#loc209 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:39)
+#loc210 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc211 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc212 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc214 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc216 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11)
+#loc217 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4)
+#loc219 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc220 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11)
+#loc221 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4)
+#loc231 = loc("xnumel"(#loc1))
+#loc232 = loc("r0_numel"(#loc2))
+#loc233 = loc("xoffset"(#loc3))
+#loc234 = loc("xoffset"(#loc4))
+#loc235 = loc("xindex"(#loc5))
+#loc236 = loc("xindex"(#loc6))
+#loc237 = loc("xindex"(#loc7))
+#loc238 = loc("xmask"(#loc8))
+#loc239 = loc("r0_base"(#loc9))
+#loc240 = loc("r0_base"(#loc10))
+#loc241 = loc("x0"(#loc11))
+#loc242 = loc("x1"(#loc12))
+#loc243 = loc("_tmp4"(#loc13))
+#loc244 = loc("_tmp10"(#loc14))
+#loc245 = loc("_tmp4"(#loc15))
+#loc246 = loc("r0_index"(#loc16))
+#loc247 = loc("r0_mask"(#loc17))
+#loc248 = loc("tmp0"(#loc18))
+#loc249 = loc("tmp0"(#loc19))
+#loc250 = loc("tmp0"(#loc20))
+#loc251 = loc("tmp0"(#loc21))
+#loc252 = loc("tmp0"(#loc22))
+#loc253 = loc("tmp0"(#loc23))
+#loc254 = loc("tmp0"(#loc24))
+#loc255 = loc("tmp0"(#loc25))
+#loc256 = loc("tmp6"(#loc26))
+#loc257 = loc("tmp6"(#loc27))
+#loc258 = loc("tmp6"(#loc28))
+#loc259 = loc("tmp6"(#loc29))
+#loc260 = loc("tmp6"(#loc30))
+#loc261 = loc("tmp6"(#loc31))
+#loc262 = loc("tmp6"(#loc32))
+#loc263 = loc("tmp2"(#loc33))
+#loc264 = loc("tmp5"(#loc34))
+#loc265 = loc("_tmp4"(#loc35))
+#loc266 = loc("tmp8"(#loc36))
+#loc267 = loc("tmp11"(#loc37))
+#loc268 = loc("_tmp10"(#loc38))
+#loc269 = loc("tmp4"(#loc40))
+#loc270 = loc("tmp4"(#loc41))
+#loc271 = loc("tmp10"(#loc42))
+#loc272 = loc("tmp10"(#loc43))
+#loc273 = loc("r0_index"(#loc45))
+#loc274 = loc("r0_mask"(#loc46))
+#loc275 = loc("r0_3"(#loc47))
+#loc276 = loc("r0_4"(#loc48))
+#loc277 = loc("tmp50"(#loc49))
+#loc278 = loc("tmp50"(#loc50))
+#loc279 = loc("tmp50"(#loc51))
+#loc280 = loc("tmp50"(#loc52))
+#loc281 = loc("tmp50"(#loc53))
+#loc282 = loc("tmp50"(#loc54))
+#loc283 = loc("tmp50"(#loc55))
+#loc284 = loc("tmp58"(#loc56))
+#loc285 = loc("tmp58"(#loc57))
+#loc286 = loc("tmp58"(#loc58))
+#loc287 = loc("tmp63"(#loc59))
+#loc288 = loc("tmp63"(#loc60))
+#loc289 = loc("tmp63"(#loc61))
+#loc290 = loc("tmp63"(#loc62))
+#loc291 = loc("tmp66"(#loc63))
+#loc292 = loc("tmp66"(#loc64))
+#loc293 = loc("tmp66"(#loc65))
+#loc294 = loc("tmp66"(#loc66))
+#loc295 = loc("tmp96"(#loc67))
+#loc296 = loc("tmp96"(#loc68))
+#loc297 = loc("tmp96"(#loc69))
+#loc298 = loc("tmp96"(#loc70))
+#loc299 = loc("tmp96"(#loc71))
+#loc300 = loc("tmp96"(#loc72))
+#loc301 = loc("tmp96"(#loc73))
+#loc302 = loc("tmp96"(#loc74))
+#loc303 = loc("tmp102"(#loc75))
+#loc304 = loc("tmp102"(#loc76))
+#loc305 = loc("tmp102"(#loc77))
+#loc306 = loc("tmp13"(#loc78))
+#loc307 = loc("tmp14"(#loc79))
+#loc308 = loc("tmp15"(#loc80))
+#loc309 = loc("tmp16"(#loc81))
+#loc310 = loc("tmp17"(#loc82))
+#loc311 = loc("tmp17"(#loc83))
+#loc312 = loc("tmp17"(#loc84))
+#loc313 = loc("tmp17"(#loc85))
+#loc314 = loc("tmp17"(#loc86))
+#loc315 = loc("tmp17"(#loc87))
+#loc316 = loc("tmp17"(#loc88))
+#loc317 = loc("tmp17"(#loc89))
+#loc318 = loc("tmp17"(#loc90))
+#loc319 = loc("tmp17"(#loc91))
+#loc320 = loc("tmp19"(#loc92))
+#loc321 = loc("tmp20"(#loc93))
+#loc322 = loc("tmp21"(#loc94))
+#loc323 = loc("tmp22"(#loc95))
+#loc324 = loc("tmp23"(#loc96))
+#loc325 = loc("tmp24"(#loc97))
+#loc326 = loc("tmp25"(#loc98))
+#loc327 = loc("tmp25"(#loc99))
+#loc328 = loc("tmp25"(#loc100))
+#loc329 = loc("tmp25"(#loc101))
+#loc330 = loc("tmp25"(#loc102))
+#loc331 = loc("tmp25"(#loc103))
+#loc332 = loc("tmp25"(#loc104))
+#loc333 = loc("tmp27"(#loc105))
+#loc334 = loc("tmp29"(#loc106))
+#loc335 = loc("tmp30"(#loc107))
+#loc336 = loc("tmp31"(#loc108))
+#loc337 = loc("tmp32"(#loc109))
+#loc338 = loc("tmp33"(#loc110))
+#loc339 = loc("tmp34"(#loc111))
+#loc340 = loc("tmp35"(#loc112))
+#loc341 = loc("tmp35"(#loc113))
+#loc342 = loc("tmp35"(#loc114))
+#loc343 = loc("tmp35"(#loc115))
+#loc344 = loc("tmp35"(#loc116))
+#loc345 = loc("tmp35"(#loc117))
+#loc346 = loc("tmp35"(#loc118))
+#loc347 = loc("tmp35"(#loc119))
+#loc348 = loc("tmp35"(#loc120))
+#loc349 = loc("tmp37"(#loc121))
+#loc350 = loc("tmp38"(#loc122))
+#loc351 = loc("tmp39"(#loc123))
+#loc352 = loc("tmp40"(#loc124))
+#loc353 = loc("tmp41"(#loc125))
+#loc354 = loc("tmp42"(#loc126))
+#loc355 = loc("tmp43"(#loc127))
+#loc356 = loc("tmp43"(#loc128))
+#loc357 = loc("tmp43"(#loc129))
+#loc358 = loc("tmp43"(#loc130))
+#loc359 = loc("tmp43"(#loc131))
+#loc360 = loc("tmp43"(#loc132))
+#loc361 = loc("tmp45"(#loc133))
+#loc362 = loc("tmp47"(#loc134))
+#loc363 = loc("tmp48"(#loc135))
+#loc364 = loc("tmp49"(#loc136))
+#loc365 = loc("tmp52"(#loc137))
+#loc366 = loc("tmp53"(#loc138))
+#loc367 = loc("tmp54"(#loc139))
+#loc368 = loc("tmp55"(#loc140))
+#loc369 = loc("tmp56"(#loc141))
+#loc370 = loc("tmp57"(#loc142))
+#loc371 = loc("tmp60"(#loc143))
+#loc372 = loc("tmp64"(#loc144))
+#loc373 = loc("tmp67"(#loc145))
+#loc374 = loc("tmp68"(#loc146))
+#loc375 = loc("tmp70"(#loc147))
+#loc376 = loc("tmp70"(#loc148))
+#loc377 = loc("tmp70"(#loc149))
+#loc378 = loc("tmp70"(#loc150))
+#loc379 = loc("tmp70"(#loc151))
+#loc380 = loc("tmp70"(#loc152))
+#loc381 = loc("tmp70"(#loc153))
+#loc382 = loc("tmp70"(#loc154))
+#loc383 = loc("tmp70"(#loc155))
+#loc384 = loc("tmp70"(#loc156))
+#loc385 = loc("tmp72"(#loc157))
+#loc386 = loc("tmp73"(#loc158))
+#loc387 = loc("tmp74"(#loc159))
+#loc388 = loc("tmp75"(#loc160))
+#loc389 = loc("tmp76"(#loc161))
+#loc390 = loc("tmp76"(#loc162))
+#loc391 = loc("tmp76"(#loc163))
+#loc392 = loc("tmp76"(#loc164))
+#loc393 = loc("tmp76"(#loc165))
+#loc394 = loc("tmp76"(#loc166))
+#loc395 = loc("tmp76"(#loc167))
+#loc396 = loc("tmp78"(#loc168))
+#loc397 = loc("tmp80"(#loc169))
+#loc398 = loc("tmp81"(#loc170))
+#loc399 = loc("tmp82"(#loc171))
+#loc400 = loc("tmp83"(#loc172))
+#loc401 = loc("tmp83"(#loc173))
+#loc402 = loc("tmp83"(#loc174))
+#loc403 = loc("tmp83"(#loc175))
+#loc404 = loc("tmp83"(#loc176))
+#loc405 = loc("tmp83"(#loc177))
+#loc406 = loc("tmp83"(#loc178))
+#loc407 = loc("tmp83"(#loc179))
+#loc408 = loc("tmp83"(#loc180))
+#loc409 = loc("tmp83"(#loc181))
+#loc410 = loc("tmp85"(#loc182))
+#loc411 = loc("tmp86"(#loc183))
+#loc412 = loc("tmp87"(#loc184))
+#loc413 = loc("tmp88"(#loc185))
+#loc414 = loc("tmp89"(#loc186))
+#loc415 = loc("tmp89"(#loc187))
+#loc416 = loc("tmp89"(#loc188))
+#loc417 = loc("tmp89"(#loc189))
+#loc418 = loc("tmp89"(#loc190))
+#loc419 = loc("tmp89"(#loc191))
+#loc420 = loc("tmp91"(#loc192))
+#loc421 = loc("tmp93"(#loc193))
+#loc422 = loc("tmp94"(#loc194))
+#loc423 = loc("tmp95"(#loc195))
+#loc424 = loc("tmp98"(#loc196))
+#loc425 = loc("tmp99"(#loc197))
+#loc426 = loc("tmp100"(#loc198))
+#loc427 = loc("tmp101"(#loc199))
+#loc428 = loc("tmp104"(#loc200))
+#loc429 = loc("tmp107"(#loc201))
+#loc430 = loc("tmp109"(#loc202))
+#loc431 = loc("tmp110"(#loc203))
+#loc435 = loc("_tmp10"(#loc245))
diff --git a/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..cf51ed3e1bf1a0cb2598cdf88fcb480be31ee33b
--- /dev/null
+++ b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir
@@ -0,0 +1,547 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc1 = loc(unknown)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc147 = loc("in_out_ptr0"(#loc))
+#loc148 = loc("in_out_ptr1"(#loc))
+#loc149 = loc("in_ptr0"(#loc))
+#loc150 = loc("in_ptr1"(#loc))
+#loc151 = loc("in_ptr2"(#loc))
+#loc152 = loc("in_ptr3"(#loc))
+#loc153 = loc("in_ptr4"(#loc))
+#loc154 = loc("xnumel"(#loc))
+#loc155 = loc("r0_numel"(#loc))
+#loc185 = loc("tmp4"(#loc33))
+#loc187 = loc("tmp10"(#loc36))
+#loc292 = loc(callsite(#loc1 at #loc185))
+#loc294 = loc(callsite(#loc1 at #loc187))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4097> : tensor<1x8xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x8xbf16, #blocked1> loc(#loc1)
+    %cst_1 = arith.constant dense<1> : tensor<1x8xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<1> : tensor<1x8xi64, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<2> : tensor<1x8xi32, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<36864> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %cst_5 = arith.constant dense<36864> : tensor<64x1xi32, #blocked1> loc(#loc1)
+    %cst_6 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %cst_7 = arith.constant dense<128> : tensor<64x1xi32, #blocked1> loc(#loc1)
+    %cst_8 = arith.constant dense<4096> : tensor<1x8xi32, #blocked> loc(#loc1)
+    %cst_9 = arith.constant dense<4096> : tensor<1x8xi32, #blocked1> loc(#loc1)
+    %cst_10 = arith.constant dense<128> : tensor<1x8xi32, #blocked> loc(#loc1)
+    %cst_11 = arith.constant dense<128> : tensor<1x8xi32, #blocked1> loc(#loc1)
+    %cst_12 = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %cst_13 = arith.constant dense<32> : tensor<64x1xi32, #blocked1> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %cst_14 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked1> loc(#loc1)
+    %cst_15 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked> loc(#loc1)
+    %c8_i32 = arith.constant 8 : i32 loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %cst_16 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32, #blocked1> loc(#loc1)
+    %cst_17 = arith.constant dense<1.280000e+02> : tensor<64x1xf32, #blocked1> loc(#loc1)
+    %cst_18 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked> loc(#loc1)
+    %cst_19 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked1> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc156)
+    %xoffset_20 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc157)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc158)
+    %xindex_21 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc158)
+    %xindex_22 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc158)
+    %xindex_23 = tt.expand_dims %xindex_21 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc158)
+    %xindex_24 = tt.splat %xoffset_20 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc159)
+    %xindex_25 = tt.splat %xoffset_20 : i32 -> tensor<64x1xi32, #blocked> loc(#loc159)
+    %xindex_26 = arith.addi %xindex_24, %xindex_22 : tensor<64x1xi32, #blocked1> loc(#loc159)
+    %xindex_27 = arith.addi %xindex_25, %xindex_23 : tensor<64x1xi32, #blocked> loc(#loc159)
+    %r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc160)
+    %r0_base_28 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc160)
+    %r0_base_29 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x8xi32, #blocked1> loc(#loc160)
+    %r0_base_30 = tt.expand_dims %r0_base_28 {axis = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x8xi32, #blocked> loc(#loc160)
+    %x0 = arith.remsi %xindex_26, %cst_13 : tensor<64x1xi32, #blocked1> loc(#loc161)
+    %x0_31 = arith.remsi %xindex_27, %cst_12 : tensor<64x1xi32, #blocked> loc(#loc161)
+    %x1 = arith.divsi %xindex_26, %cst_13 : tensor<64x1xi32, #blocked1> loc(#loc162)
+    %x1_32 = arith.divsi %xindex_27, %cst_12 : tensor<64x1xi32, #blocked> loc(#loc162)
+    %tmp0 = arith.muli %x0, %cst_7 : tensor<64x1xi32, #blocked1> loc(#loc163)
+    %tmp0_33 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked1> -> tensor<64x8xi32, #blocked1> loc(#loc164)
+    %tmp0_34 = arith.muli %x1, %cst_5 : tensor<64x1xi32, #blocked1> loc(#loc165)
+    %tmp0_35 = tt.broadcast %tmp0_34 : tensor<64x1xi32, #blocked1> -> tensor<64x8xi32, #blocked1> loc(#loc166)
+    %tmp0_36 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>, #blocked1> loc(#loc167)
+    %_tmp10:2 = scf.for %_tmp10_51 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg10 = %cst_19, %arg11 = %cst_19) -> (tensor<64x8xf32, #blocked1>, tensor<64x8xf32, #blocked1>)  : i32 {
+      %r0_index = tt.splat %_tmp10_51 : i32 -> tensor<1x8xi32, #blocked1> loc(#loc169)
+      %r0_index_52 = arith.addi %r0_index, %r0_base_29 : tensor<1x8xi32, #blocked1> loc(#loc169)
+      %r0_mask = arith.cmpi slt, %r0_index_52, %cst_11 : tensor<1x8xi32, #blocked1> loc(#loc170)
+      %tmp0_53 = arith.addi %r0_index_52, %cst_9 : tensor<1x8xi32, #blocked1> loc(#loc171)
+      %tmp0_54 = tt.broadcast %tmp0_53 : tensor<1x8xi32, #blocked1> -> tensor<64x8xi32, #blocked1> loc(#loc164)
+      %tmp0_55 = arith.addi %tmp0_54, %tmp0_33 : tensor<64x8xi32, #blocked1> loc(#loc164)
+      %tmp0_56 = arith.addi %tmp0_55, %tmp0_35 : tensor<64x8xi32, #blocked1> loc(#loc166)
+      %tmp0_57 = tt.addptr %tmp0_36, %tmp0_56 : tensor<64x8x!tt.ptr<bf16>, #blocked1>, tensor<64x8xi32, #blocked1> loc(#loc167)
+      %tmp0_58 = tt.broadcast %r0_mask : tensor<1x8xi1, #blocked1> -> tensor<64x8xi1, #blocked1> loc(#loc172)
+      %tmp0_59 = tt.load %tmp0_57, %tmp0_58, %cst_14 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>, #blocked1> loc(#loc172)
+      %tmp0_60 = arith.extf %tmp0_59 : tensor<64x8xbf16, #blocked1> to tensor<64x8xf32, #blocked1> loc(#loc173)
+      %tmp6 = tt.broadcast %r0_index_52 : tensor<1x8xi32, #blocked1> -> tensor<64x8xi32, #blocked1> loc(#loc174)
+      %tmp6_61 = arith.addi %tmp6, %tmp0_33 : tensor<64x8xi32, #blocked1> loc(#loc174)
+      %tmp6_62 = arith.addi %tmp6_61, %tmp0_35 : tensor<64x8xi32, #blocked1> loc(#loc175)
+      %tmp6_63 = tt.addptr %tmp0_36, %tmp6_62 : tensor<64x8x!tt.ptr<bf16>, #blocked1>, tensor<64x8xi32, #blocked1> loc(#loc176)
+      %tmp6_64 = tt.load %tmp6_63, %tmp0_58, %cst_14 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>, #blocked1> loc(#loc177)
+      %tmp6_65 = arith.extf %tmp6_64 : tensor<64x8xbf16, #blocked1> to tensor<64x8xf32, #blocked1> loc(#loc178)
+      %tmp2 = arith.mulf %tmp0_60, %tmp0_60 : tensor<64x8xf32, #blocked1> loc(#loc179)
+      %tmp5 = arith.addf %arg10, %tmp2 : tensor<64x8xf32, #blocked1> loc(#loc180)
+      %_tmp4 = arith.select %tmp0_58, %tmp5, %arg10 : tensor<64x8xi1, #blocked1>, tensor<64x8xf32, #blocked1> loc(#loc181)
+      %tmp8 = arith.mulf %tmp6_65, %tmp6_65 : tensor<64x8xf32, #blocked1> loc(#loc182)
+      %tmp11 = arith.addf %arg11, %tmp8 : tensor<64x8xf32, #blocked1> loc(#loc183)
+      %_tmp10_66 = arith.select %tmp0_58, %tmp11, %arg11 : tensor<64x8xi1, #blocked1>, tensor<64x8xf32, #blocked1> loc(#loc184)
+      scf.yield %_tmp4, %_tmp10_66 : tensor<64x8xf32, #blocked1>, tensor<64x8xf32, #blocked1> loc(#loc31)
+    } loc(#loc290)
+    %tmp4 = "tt.reduce"(%_tmp10#0) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_51: f32 loc(callsite(#loc1 at #loc185)), %tmp4_52: f32 loc(callsite(#loc1 at #loc185))):
+      %tmp4_53 = arith.addf %tmp4_51, %tmp4_52 : f32 loc(#loc297)
+      tt.reduce.return %tmp4_53 : f32 loc(#loc291)
+    }) : (tensor<64x8xf32, #blocked1>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc291)
+    %tmp4_37 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc186)
+    %tmp10 = "tt.reduce"(%_tmp10#1) <{axis = 1 : i32}> ({
+    ^bb0(%tmp10_51: f32 loc(callsite(#loc1 at #loc187)), %tmp10_52: f32 loc(callsite(#loc1 at #loc187))):
+      %tmp10_53 = arith.addf %tmp10_51, %tmp10_52 : f32 loc(#loc298)
+      tt.reduce.return %tmp10_53 : f32 loc(#loc293)
+    }) : (tensor<64x8xf32, #blocked1>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc293)
+    %tmp10_38 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc188)
+    %tmp50 = arith.muli %x0_31, %cst_6 : tensor<64x1xi32, #blocked> loc(#loc189)
+    %tmp50_39 = tt.broadcast %tmp50 : tensor<64x1xi32, #blocked> -> tensor<64x8xi32, #blocked> loc(#loc190)
+    %tmp50_40 = arith.muli %x1_32, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc191)
+    %tmp50_41 = tt.broadcast %tmp50_40 : tensor<64x1xi32, #blocked> -> tensor<64x8xi32, #blocked> loc(#loc192)
+    %tmp50_42 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>, #blocked> loc(#loc193)
+    %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x8x!tt.ptr<bf16>, #blocked> loc(#loc194)
+    %tmp58_43 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x8x!tt.ptr<bf16>, #blocked1> loc(#loc194)
+    %tmp63 = arith.muli %x1, %cst_7 : tensor<64x1xi32, #blocked1> loc(#loc195)
+    %tmp63_44 = tt.broadcast %tmp63 : tensor<64x1xi32, #blocked1> -> tensor<64x8xi32, #blocked1> loc(#loc196)
+    %tmp63_45 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<64x8x!tt.ptr<f32>, #blocked1> loc(#loc197)
+    %tmp66 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<64x8x!tt.ptr<f32>, #blocked1> loc(#loc198)
+    %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x8x!tt.ptr<bf16>, #blocked> loc(#loc199)
+    %tmp102_46 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x8x!tt.ptr<bf16>, #blocked1> loc(#loc199)
+    %tmp20 = arith.divf %tmp10_38, %cst_17 : tensor<64x1xf32, #blocked1> loc(#loc200)
+    %tmp22 = arith.addf %tmp20, %cst_16 : tensor<64x1xf32, #blocked1> loc(#loc201)
+    %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked1>) -> tensor<64x1xf32, #blocked1> loc(#loc202)
+    %tmp24 = ttg.convert_layout %tmp23 : tensor<64x1xf32, #blocked1> -> tensor<64x1xf32, #blocked> loc(#loc203)
+    %tmp24_47 = tt.broadcast %tmp24 : tensor<64x1xf32, #blocked> -> tensor<64x8xf32, #blocked> loc(#loc203)
+    %tmp24_48 = tt.broadcast %tmp23 : tensor<64x1xf32, #blocked1> -> tensor<64x8xf32, #blocked1> loc(#loc203)
+    %tmp72 = arith.divf %tmp4_37, %cst_17 : tensor<64x1xf32, #blocked1> loc(#loc204)
+    %tmp73 = arith.addf %tmp72, %cst_16 : tensor<64x1xf32, #blocked1> loc(#loc205)
+    %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked1>) -> tensor<64x1xf32, #blocked1> loc(#loc206)
+    %tmp75 = ttg.convert_layout %tmp74 : tensor<64x1xf32, #blocked1> -> tensor<64x1xf32, #blocked> loc(#loc207)
+    %tmp75_49 = tt.broadcast %tmp75 : tensor<64x1xf32, #blocked> -> tensor<64x8xf32, #blocked> loc(#loc207)
+    %tmp75_50 = tt.broadcast %tmp74 : tensor<64x1xf32, #blocked1> -> tensor<64x8xf32, #blocked1> loc(#loc207)
+    %0 = arith.muli %xindex_26, %cst_7 : tensor<64x1xi32, #blocked1> loc(#loc57)
+    %1 = tt.broadcast %0 : tensor<64x1xi32, #blocked1> -> tensor<64x8xi32, #blocked1> loc(#loc58)
+    %2 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>, #blocked1> loc(#loc59)
+    %3 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>, #blocked1> loc(#loc60)
+    scf.for %r0_offset = %c0_i32 to %c128_i32 step %c8_i32  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x8xi32, #blocked1> loc(#loc208)
+      %r0_index_51 = tt.splat %r0_offset : i32 -> tensor<1x8xi32, #blocked> loc(#loc208)
+      %r0_index_52 = arith.addi %r0_index, %r0_base_29 : tensor<1x8xi32, #blocked1> loc(#loc208)
+      %r0_index_53 = arith.addi %r0_index_51, %r0_base_30 : tensor<1x8xi32, #blocked> loc(#loc208)
+      %r0_mask = arith.cmpi slt, %r0_index_52, %cst_11 : tensor<1x8xi32, #blocked1> loc(#loc209)
+      %r0_mask_54 = arith.cmpi slt, %r0_index_53, %cst_10 : tensor<1x8xi32, #blocked> loc(#loc209)
+      %r0_3 = arith.remsi %r0_index_53, %cst_3 : tensor<1x8xi32, #blocked> loc(#loc210)
+      %r0_4 = arith.divsi %r0_index_53, %cst_3 : tensor<1x8xi32, #blocked> loc(#loc211)
+      %tmp50_55 = tt.broadcast %r0_index_52 : tensor<1x8xi32, #blocked1> -> tensor<64x8xi32, #blocked1> loc(#loc190)
+      %tmp50_56 = arith.addi %tmp50_55, %tmp0_33 : tensor<64x8xi32, #blocked1> loc(#loc190)
+      %tmp50_57 = arith.addi %tmp50_56, %tmp0_35 : tensor<64x8xi32, #blocked1> loc(#loc192)
+      %tmp50_58 = tt.addptr %tmp0_36, %tmp50_57 : tensor<64x8x!tt.ptr<bf16>, #blocked1>, tensor<64x8xi32, #blocked1> loc(#loc193)
+      %tmp50_59 = tt.broadcast %r0_mask : tensor<1x8xi1, #blocked1> -> tensor<64x8xi1, #blocked1> loc(#loc212)
+      %tmp50_60 = tt.load %tmp50_58, %tmp50_59, %cst_14 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>, #blocked1> loc(#loc212)
+      %tmp50_61 = arith.extf %tmp50_60 : tensor<64x8xbf16, #blocked1> to tensor<64x8xf32, #blocked1> loc(#loc213)
+      %tmp58_62 = tt.addptr %tmp58_43, %r0_index_52 : tensor<1x8x!tt.ptr<bf16>, #blocked1>, tensor<1x8xi32, #blocked1> loc(#loc194)
+      %tmp58_63 = tt.load %tmp58_62, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x8x!tt.ptr<bf16>, #blocked1> loc(#loc214)
+      %tmp58_64 = arith.extf %tmp58_63 : tensor<1x8xbf16, #blocked1> to tensor<1x8xf32, #blocked1> loc(#loc215)
+      %tmp63_65 = arith.addi %tmp50_55, %tmp63_44 : tensor<64x8xi32, #blocked1> loc(#loc196)
+      %tmp63_66 = tt.addptr %tmp63_45, %tmp63_65 : tensor<64x8x!tt.ptr<f32>, #blocked1>, tensor<64x8xi32, #blocked1> loc(#loc197)
+      %tmp63_67 = tt.load %tmp63_66, %tmp50_59, %cst_19 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<f32>, #blocked1> loc(#loc216)
+      %tmp66_68 = tt.addptr %tmp66, %tmp63_65 : tensor<64x8x!tt.ptr<f32>, #blocked1>, tensor<64x8xi32, #blocked1> loc(#loc198)
+      %tmp66_69 = tt.load %tmp66_68, %tmp50_59, %cst_19 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<f32>, #blocked1> loc(#loc217)
+      %tmp66_70 = ttg.convert_layout %tmp66_69 : tensor<64x8xf32, #blocked1> -> tensor<64x8xf32, #blocked> loc(#loc217)
+      %tmp96 = arith.addi %r0_index_52, %cst_9 : tensor<1x8xi32, #blocked1> loc(#loc218)
+      %tmp96_71 = tt.broadcast %tmp96 : tensor<1x8xi32, #blocked1> -> tensor<64x8xi32, #blocked1> loc(#loc219)
+      %tmp96_72 = arith.addi %tmp96_71, %tmp0_33 : tensor<64x8xi32, #blocked1> loc(#loc219)
+      %tmp96_73 = arith.addi %tmp96_72, %tmp0_35 : tensor<64x8xi32, #blocked1> loc(#loc220)
+      %tmp96_74 = tt.addptr %tmp0_36, %tmp96_73 : tensor<64x8x!tt.ptr<bf16>, #blocked1>, tensor<64x8xi32, #blocked1> loc(#loc221)
+      %tmp96_75 = tt.load %tmp96_74, %tmp50_59, %cst_14 evictionPolicy = evict_first : tensor<64x8x!tt.ptr<bf16>, #blocked1> loc(#loc222)
+      %tmp96_76 = arith.extf %tmp96_75 : tensor<64x8xbf16, #blocked1> to tensor<64x8xf32, #blocked1> loc(#loc223)
+      %tmp102_77 = tt.addptr %tmp102_46, %r0_index_52 : tensor<1x8x!tt.ptr<bf16>, #blocked1>, tensor<1x8xi32, #blocked1> loc(#loc199)
+      %tmp102_78 = tt.load %tmp102_77, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x8x!tt.ptr<bf16>, #blocked1> loc(#loc224)
+      %tmp102_79 = arith.extf %tmp102_78 : tensor<1x8xbf16, #blocked1> to tensor<1x8xf32, #blocked1> loc(#loc225)
+      %tmp16 = arith.extsi %r0_3 : tensor<1x8xi32, #blocked> to tensor<1x8xi64, #blocked> loc(#loc226)
+      %tmp16_80 = arith.cmpi slt, %tmp16, %cst_2 : tensor<1x8xi64, #blocked> loc(#loc226)
+      %tmp17 = arith.muli %r0_4, %cst_3 : tensor<1x8xi32, #blocked> loc(#loc227)
+      %tmp17_81 = arith.addi %tmp17, %cst_1 : tensor<1x8xi32, #blocked> loc(#loc228)
+      %tmp17_82 = tt.broadcast %tmp17_81 : tensor<1x8xi32, #blocked> -> tensor<64x8xi32, #blocked> loc(#loc229)
+      %tmp17_83 = arith.addi %tmp17_82, %tmp50_39 : tensor<64x8xi32, #blocked> loc(#loc229)
+      %tmp17_84 = arith.addi %tmp17_83, %tmp50_41 : tensor<64x8xi32, #blocked> loc(#loc230)
+      %tmp17_85 = tt.addptr %tmp50_42, %tmp17_84 : tensor<64x8x!tt.ptr<bf16>, #blocked>, tensor<64x8xi32, #blocked> loc(#loc231)
+      %tmp17_86 = arith.andi %r0_mask_54, %tmp16_80 : tensor<1x8xi1, #blocked> loc(#loc232)
+      %tmp17_87 = tt.broadcast %tmp17_86 : tensor<1x8xi1, #blocked> -> tensor<64x8xi1, #blocked> loc(#loc233)
+      %tmp17_88 = tt.load %tmp17_85, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>, #blocked> loc(#loc233)
+      %tmp17_89 = arith.extf %tmp17_88 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> loc(#loc234)
+      %tmp24_90 = arith.mulf %tmp17_89, %tmp24_47 : tensor<64x8xf32, #blocked> loc(#loc203)
+      %tmp25 = tt.addptr %tmp58, %tmp17_81 : tensor<1x8x!tt.ptr<bf16>, #blocked>, tensor<1x8xi32, #blocked> loc(#loc235)
+      %tmp25_91 = tt.broadcast %tmp25 : tensor<1x8x!tt.ptr<bf16>, #blocked> -> tensor<64x8x!tt.ptr<bf16>, #blocked> loc(#loc235)
+      %tmp25_92 = tt.load %tmp25_91, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>, #blocked> loc(#loc236)
+      %tmp25_93 = arith.extf %tmp25_92 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> loc(#loc237)
+      %tmp27 = arith.mulf %tmp24_90, %tmp25_93 : tensor<64x8xf32, #blocked> loc(#loc238)
+      %tmp29 = arith.subf %cst_18, %tmp27 : tensor<64x8xf32, #blocked> loc(#loc239)
+      %tmp31 = tt.broadcast %tmp16_80 : tensor<1x8xi1, #blocked> -> tensor<64x8xi1, #blocked> loc(#loc240)
+      %tmp32 = arith.cmpi sge, %tmp16, %cst_2 : tensor<1x8xi64, #blocked> loc(#loc241)
+      %tmp35 = tt.broadcast %tmp17 : tensor<1x8xi32, #blocked> -> tensor<64x8xi32, #blocked> loc(#loc242)
+      %tmp35_94 = arith.addi %tmp35, %tmp50_39 : tensor<64x8xi32, #blocked> loc(#loc242)
+      %tmp35_95 = arith.addi %tmp35_94, %tmp50_41 : tensor<64x8xi32, #blocked> loc(#loc243)
+      %tmp35_96 = tt.addptr %tmp50_42, %tmp35_95 : tensor<64x8x!tt.ptr<bf16>, #blocked>, tensor<64x8xi32, #blocked> loc(#loc244)
+      %tmp35_97 = arith.andi %r0_mask_54, %tmp32 : tensor<1x8xi1, #blocked> loc(#loc245)
+      %tmp35_98 = tt.broadcast %tmp35_97 : tensor<1x8xi1, #blocked> -> tensor<64x8xi1, #blocked> loc(#loc246)
+      %tmp35_99 = tt.load %tmp35_96, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>, #blocked> loc(#loc246)
+      %tmp35_100 = arith.extf %tmp35_99 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> loc(#loc247)
+      %tmp42 = arith.mulf %tmp35_100, %tmp24_47 : tensor<64x8xf32, #blocked> loc(#loc248)
+      %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x8x!tt.ptr<bf16>, #blocked>, tensor<1x8xi32, #blocked> loc(#loc249)
+      %tmp43_101 = tt.broadcast %tmp43 : tensor<1x8x!tt.ptr<bf16>, #blocked> -> tensor<64x8x!tt.ptr<bf16>, #blocked> loc(#loc249)
+      %tmp43_102 = tt.load %tmp43_101, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>, #blocked> loc(#loc250)
+      %tmp43_103 = arith.extf %tmp43_102 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> loc(#loc251)
+      %tmp45 = arith.mulf %tmp42, %tmp43_103 : tensor<64x8xf32, #blocked> loc(#loc252)
+      %tmp48 = tt.broadcast %tmp32 : tensor<1x8xi1, #blocked> -> tensor<64x8xi1, #blocked> loc(#loc253)
+      %tmp48_104 = arith.select %tmp48, %tmp45, %cst_18 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked> loc(#loc253)
+      %tmp49 = arith.select %tmp31, %tmp29, %tmp48_104 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked> loc(#loc295)
+      %tmp57 = arith.mulf %tmp50_61, %tmp24_48 : tensor<64x8xf32, #blocked1> loc(#loc255)
+      %tmp60 = tt.broadcast %tmp58_64 : tensor<1x8xf32, #blocked1> -> tensor<64x8xf32, #blocked1> loc(#loc256)
+      %tmp60_105 = arith.mulf %tmp57, %tmp60 : tensor<64x8xf32, #blocked1> loc(#loc256)
+      %tmp64 = arith.mulf %tmp60_105, %tmp63_67 : tensor<64x8xf32, #blocked1> loc(#loc257)
+      %tmp64_106 = ttg.convert_layout %tmp64 : tensor<64x8xf32, #blocked1> -> tensor<64x8xf32, #blocked> loc(#loc257)
+      %tmp67 = arith.mulf %tmp49, %tmp66_70 : tensor<64x8xf32, #blocked> loc(#loc258)
+      %tmp68 = arith.addf %tmp64_106, %tmp67 : tensor<64x8xf32, #blocked> loc(#loc259)
+      %tmp70 = arith.addi %tmp17, %cst : tensor<1x8xi32, #blocked> loc(#loc260)
+      %tmp70_107 = tt.broadcast %tmp70 : tensor<1x8xi32, #blocked> -> tensor<64x8xi32, #blocked> loc(#loc261)
+      %tmp70_108 = arith.addi %tmp70_107, %tmp50_39 : tensor<64x8xi32, #blocked> loc(#loc261)
+      %tmp70_109 = arith.addi %tmp70_108, %tmp50_41 : tensor<64x8xi32, #blocked> loc(#loc262)
+      %tmp70_110 = tt.addptr %tmp50_42, %tmp70_109 : tensor<64x8x!tt.ptr<bf16>, #blocked>, tensor<64x8xi32, #blocked> loc(#loc263)
+      %tmp70_111 = tt.load %tmp70_110, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>, #blocked> loc(#loc264)
+      %tmp70_112 = arith.extf %tmp70_111 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> loc(#loc265)
+      %tmp75_113 = arith.mulf %tmp70_112, %tmp75_49 : tensor<64x8xf32, #blocked> loc(#loc207)
+      %tmp76 = tt.addptr %tmp102, %tmp17_81 : tensor<1x8x!tt.ptr<bf16>, #blocked>, tensor<1x8xi32, #blocked> loc(#loc266)
+      %tmp76_114 = tt.broadcast %tmp76 : tensor<1x8x!tt.ptr<bf16>, #blocked> -> tensor<64x8x!tt.ptr<bf16>, #blocked> loc(#loc266)
+      %tmp76_115 = tt.load %tmp76_114, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>, #blocked> loc(#loc267)
+      %tmp76_116 = arith.extf %tmp76_115 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> loc(#loc268)
+      %tmp78 = arith.mulf %tmp75_113, %tmp76_116 : tensor<64x8xf32, #blocked> loc(#loc269)
+      %tmp80 = arith.subf %cst_18, %tmp78 : tensor<64x8xf32, #blocked> loc(#loc270)
+      %tmp83 = arith.addi %tmp17, %cst_8 : tensor<1x8xi32, #blocked> loc(#loc271)
+      %tmp83_117 = tt.broadcast %tmp83 : tensor<1x8xi32, #blocked> -> tensor<64x8xi32, #blocked> loc(#loc272)
+      %tmp83_118 = arith.addi %tmp83_117, %tmp50_39 : tensor<64x8xi32, #blocked> loc(#loc272)
+      %tmp83_119 = arith.addi %tmp83_118, %tmp50_41 : tensor<64x8xi32, #blocked> loc(#loc273)
+      %tmp83_120 = tt.addptr %tmp50_42, %tmp83_119 : tensor<64x8x!tt.ptr<bf16>, #blocked>, tensor<64x8xi32, #blocked> loc(#loc274)
+      %tmp83_121 = tt.load %tmp83_120, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>, #blocked> loc(#loc275)
+      %tmp83_122 = arith.extf %tmp83_121 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> loc(#loc276)
+      %tmp88 = arith.mulf %tmp83_122, %tmp75_49 : tensor<64x8xf32, #blocked> loc(#loc277)
+      %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x8x!tt.ptr<bf16>, #blocked>, tensor<1x8xi32, #blocked> loc(#loc278)
+      %tmp89_123 = tt.broadcast %tmp89 : tensor<1x8x!tt.ptr<bf16>, #blocked> -> tensor<64x8x!tt.ptr<bf16>, #blocked> loc(#loc278)
+      %tmp89_124 = tt.load %tmp89_123, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>, #blocked> loc(#loc279)
+      %tmp89_125 = arith.extf %tmp89_124 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> loc(#loc280)
+      %tmp91 = arith.mulf %tmp88, %tmp89_125 : tensor<64x8xf32, #blocked> loc(#loc281)
+      %tmp94 = arith.select %tmp48, %tmp91, %cst_18 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked> loc(#loc282)
+      %tmp95 = arith.select %tmp31, %tmp80, %tmp94 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked> loc(#loc296)
+      %tmp101 = arith.mulf %tmp96_76, %tmp75_50 : tensor<64x8xf32, #blocked1> loc(#loc285)
+      %tmp104 = tt.broadcast %tmp102_79 : tensor<1x8xf32, #blocked1> -> tensor<64x8xf32, #blocked1> loc(#loc286)
+      %tmp104_126 = arith.mulf %tmp101, %tmp104 : tensor<64x8xf32, #blocked1> loc(#loc286)
+      %tmp107 = arith.mulf %tmp104_126, %tmp63_67 : tensor<64x8xf32, #blocked1> loc(#loc287)
+      %tmp107_127 = ttg.convert_layout %tmp107 : tensor<64x8xf32, #blocked1> -> tensor<64x8xf32, #blocked> loc(#loc287)
+      %tmp109 = arith.mulf %tmp95, %tmp66_70 : tensor<64x8xf32, #blocked> loc(#loc288)
+      %tmp110 = arith.addf %tmp107_127, %tmp109 : tensor<64x8xf32, #blocked> loc(#loc289)
+      %4 = arith.addi %tmp50_55, %1 : tensor<64x8xi32, #blocked1> loc(#loc58)
+      %5 = tt.addptr %2, %4 : tensor<64x8x!tt.ptr<bf16>, #blocked1>, tensor<64x8xi32, #blocked1> loc(#loc59)
+      %6 = arith.truncf %tmp68 : tensor<64x8xf32, #blocked> to tensor<64x8xbf16, #blocked> loc(#loc144)
+      %7 = ttg.convert_layout %6 : tensor<64x8xbf16, #blocked> -> tensor<64x8xbf16, #blocked1> loc(#loc144)
+      tt.store %5, %7, %tmp50_59 : tensor<64x8x!tt.ptr<bf16>, #blocked1> loc(#loc144)
+      %8 = tt.addptr %3, %4 : tensor<64x8x!tt.ptr<bf16>, #blocked1>, tensor<64x8xi32, #blocked1> loc(#loc60)
+      %9 = arith.truncf %tmp110 : tensor<64x8xf32, #blocked> to tensor<64x8xbf16, #blocked> loc(#loc145)
+      %10 = ttg.convert_layout %9 : tensor<64x8xbf16, #blocked> -> tensor<64x8xbf16, #blocked1> loc(#loc145)
+      tt.store %8, %10, %tmp50_59 : tensor<64x8x!tt.ptr<bf16>, #blocked1> loc(#loc145)
+    } loc(#loc61)
+    tt.return loc(#loc146)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8)
+#loc32 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc156 = loc("xoffset"(#loc2))
+#loc157 = loc("xoffset"(#loc3))
+#loc158 = loc("xindex"(#loc4))
+#loc159 = loc("xindex"(#loc5))
+#loc160 = loc("r0_base"(#loc6))
+#loc161 = loc("x0"(#loc7))
+#loc162 = loc("x1"(#loc8))
+#loc163 = loc("tmp0"(#loc9))
+#loc164 = loc("tmp0"(#loc10))
+#loc165 = loc("tmp0"(#loc11))
+#loc166 = loc("tmp0"(#loc12))
+#loc167 = loc("tmp0"(#loc13))
+#loc168 = loc("_tmp4"(#loc14))
+#loc169 = loc("r0_index"(#loc15))
+#loc170 = loc("r0_mask"(#loc16))
+#loc171 = loc("tmp0"(#loc17))
+#loc172 = loc("tmp0"(#loc18))
+#loc173 = loc("tmp0"(#loc19))
+#loc174 = loc("tmp6"(#loc20))
+#loc175 = loc("tmp6"(#loc21))
+#loc176 = loc("tmp6"(#loc22))
+#loc177 = loc("tmp6"(#loc23))
+#loc178 = loc("tmp6"(#loc24))
+#loc179 = loc("tmp2"(#loc25))
+#loc180 = loc("tmp5"(#loc26))
+#loc181 = loc("_tmp4"(#loc27))
+#loc182 = loc("tmp8"(#loc28))
+#loc183 = loc("tmp11"(#loc29))
+#loc184 = loc("_tmp10"(#loc30))
+#loc186 = loc("tmp4"(#loc35))
+#loc188 = loc("tmp10"(#loc37))
+#loc189 = loc("tmp50"(#loc38))
+#loc190 = loc("tmp50"(#loc39))
+#loc191 = loc("tmp50"(#loc40))
+#loc192 = loc("tmp50"(#loc41))
+#loc193 = loc("tmp50"(#loc42))
+#loc194 = loc("tmp58"(#loc43))
+#loc195 = loc("tmp63"(#loc44))
+#loc196 = loc("tmp63"(#loc45))
+#loc197 = loc("tmp63"(#loc46))
+#loc198 = loc("tmp66"(#loc47))
+#loc199 = loc("tmp102"(#loc48))
+#loc200 = loc("tmp20"(#loc49))
+#loc201 = loc("tmp22"(#loc50))
+#loc202 = loc("tmp23"(#loc51))
+#loc203 = loc("tmp24"(#loc52))
+#loc204 = loc("tmp72"(#loc53))
+#loc205 = loc("tmp73"(#loc54))
+#loc206 = loc("tmp74"(#loc55))
+#loc207 = loc("tmp75"(#loc56))
+#loc208 = loc("r0_index"(#loc62))
+#loc209 = loc("r0_mask"(#loc63))
+#loc210 = loc("r0_3"(#loc64))
+#loc211 = loc("r0_4"(#loc65))
+#loc212 = loc("tmp50"(#loc66))
+#loc213 = loc("tmp50"(#loc67))
+#loc214 = loc("tmp58"(#loc68))
+#loc215 = loc("tmp58"(#loc69))
+#loc216 = loc("tmp63"(#loc70))
+#loc217 = loc("tmp66"(#loc71))
+#loc218 = loc("tmp96"(#loc72))
+#loc219 = loc("tmp96"(#loc73))
+#loc220 = loc("tmp96"(#loc74))
+#loc221 = loc("tmp96"(#loc75))
+#loc222 = loc("tmp96"(#loc76))
+#loc223 = loc("tmp96"(#loc77))
+#loc224 = loc("tmp102"(#loc78))
+#loc225 = loc("tmp102"(#loc79))
+#loc226 = loc("tmp16"(#loc80))
+#loc227 = loc("tmp17"(#loc81))
+#loc228 = loc("tmp17"(#loc82))
+#loc229 = loc("tmp17"(#loc83))
+#loc230 = loc("tmp17"(#loc84))
+#loc231 = loc("tmp17"(#loc85))
+#loc232 = loc("tmp17"(#loc86))
+#loc233 = loc("tmp17"(#loc87))
+#loc234 = loc("tmp17"(#loc88))
+#loc235 = loc("tmp25"(#loc89))
+#loc236 = loc("tmp25"(#loc90))
+#loc237 = loc("tmp25"(#loc91))
+#loc238 = loc("tmp27"(#loc92))
+#loc239 = loc("tmp29"(#loc93))
+#loc240 = loc("tmp31"(#loc94))
+#loc241 = loc("tmp32"(#loc95))
+#loc242 = loc("tmp35"(#loc96))
+#loc243 = loc("tmp35"(#loc97))
+#loc244 = loc("tmp35"(#loc98))
+#loc245 = loc("tmp35"(#loc99))
+#loc246 = loc("tmp35"(#loc100))
+#loc247 = loc("tmp35"(#loc101))
+#loc248 = loc("tmp42"(#loc102))
+#loc249 = loc("tmp43"(#loc103))
+#loc250 = loc("tmp43"(#loc104))
+#loc251 = loc("tmp43"(#loc105))
+#loc252 = loc("tmp45"(#loc106))
+#loc253 = loc("tmp48"(#loc107))
+#loc254 = loc("tmp49"(#loc108))
+#loc255 = loc("tmp57"(#loc109))
+#loc256 = loc("tmp60"(#loc110))
+#loc257 = loc("tmp64"(#loc111))
+#loc258 = loc("tmp67"(#loc112))
+#loc259 = loc("tmp68"(#loc113))
+#loc260 = loc("tmp70"(#loc114))
+#loc261 = loc("tmp70"(#loc115))
+#loc262 = loc("tmp70"(#loc116))
+#loc263 = loc("tmp70"(#loc117))
+#loc264 = loc("tmp70"(#loc118))
+#loc265 = loc("tmp70"(#loc119))
+#loc266 = loc("tmp76"(#loc120))
+#loc267 = loc("tmp76"(#loc121))
+#loc268 = loc("tmp76"(#loc122))
+#loc269 = loc("tmp78"(#loc123))
+#loc270 = loc("tmp80"(#loc124))
+#loc271 = loc("tmp83"(#loc125))
+#loc272 = loc("tmp83"(#loc126))
+#loc273 = loc("tmp83"(#loc127))
+#loc274 = loc("tmp83"(#loc128))
+#loc275 = loc("tmp83"(#loc129))
+#loc276 = loc("tmp83"(#loc130))
+#loc277 = loc("tmp88"(#loc131))
+#loc278 = loc("tmp89"(#loc132))
+#loc279 = loc("tmp89"(#loc133))
+#loc280 = loc("tmp89"(#loc134))
+#loc281 = loc("tmp91"(#loc135))
+#loc282 = loc("tmp94"(#loc136))
+#loc283 = loc("tmp95"(#loc137))
+#loc284 = loc("tmp82"(#loc138))
+#loc285 = loc("tmp101"(#loc139))
+#loc286 = loc("tmp104"(#loc140))
+#loc287 = loc("tmp107"(#loc141))
+#loc288 = loc("tmp109"(#loc142))
+#loc289 = loc("tmp110"(#loc143))
+#loc290 = loc("_tmp10"(#loc168))
+#loc291 = loc(callsite(#loc32 at #loc185))
+#loc293 = loc(callsite(#loc32 at #loc187))
+#loc295 = loc(fused[#loc254, #loc240])
+#loc296 = loc(fused[#loc283, #loc284])
+#loc297 = loc(callsite(#loc34 at #loc291))
+#loc298 = loc(callsite(#loc34 at #loc293))
diff --git a/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..0bb7816a03bb8a7f703b5555aa6e57ba373b5ba6
--- /dev/null
+++ b/triton/BFXM4JRLILKWOXIXJK63CNDMPAQTDH7WSFLZFJRWWUXYOHS6MUXQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir
@@ -0,0 +1,520 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc1 = loc(unknown)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc149 = loc("in_out_ptr0"(#loc))
+#loc150 = loc("in_out_ptr1"(#loc))
+#loc151 = loc("in_ptr0"(#loc))
+#loc152 = loc("in_ptr1"(#loc))
+#loc153 = loc("in_ptr2"(#loc))
+#loc154 = loc("in_ptr3"(#loc))
+#loc155 = loc("in_ptr4"(#loc))
+#loc156 = loc("xnumel"(#loc))
+#loc157 = loc("r0_numel"(#loc))
+#loc189 = loc("tmp4"(#loc35))
+#loc191 = loc("tmp10"(#loc38))
+#loc296 = loc(callsite(#loc1 at #loc189))
+#loc298 = loc(callsite(#loc1 at #loc191))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<1x8xbf16> loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16> loc(#loc1)
+    %c8_i32 = arith.constant 8 : i32 loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %cst_1 = arith.constant dense<4097> : tensor<1x8xi32> loc(#loc1)
+    %cst_2 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc1)
+    %cst_3 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<1> : tensor<1x8xi32> loc(#loc1)
+    %cst_5 = arith.constant dense<1> : tensor<1x8xi64> loc(#loc1)
+    %cst_6 = arith.constant dense<2> : tensor<1x8xi32> loc(#loc1)
+    %cst_7 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc1)
+    %cst_8 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1)
+    %cst_9 = arith.constant dense<4096> : tensor<1x8xi32> loc(#loc1)
+    %cst_10 = arith.constant dense<128> : tensor<1x8xi32> loc(#loc1)
+    %cst_11 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc1)
+    %cst_12 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc158)
+    %xoffset_13 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc159)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc160)
+    %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc161)
+    %xindex_15 = tt.splat %xoffset_13 : i32 -> tensor<64x1xi32> loc(#loc162)
+    %xindex_16 = arith.addi %xindex_15, %xindex_14 : tensor<64x1xi32> loc(#loc162)
+    %r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc163)
+    %r0_base_17 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32> -> tensor<1x8xi32> loc(#loc164)
+    %x0 = arith.remsi %xindex_16, %cst_12 : tensor<64x1xi32> loc(#loc165)
+    %x1 = arith.divsi %xindex_16, %cst_12 : tensor<64x1xi32> loc(#loc166)
+    %_tmp10:2 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%_tmp4 = %cst_11, %_tmp10_20 = %cst_11) -> (tensor<64x8xf32>, tensor<64x8xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x8xi32> loc(#loc168)
+      %r0_index_21 = arith.addi %r0_index, %r0_base_17 : tensor<1x8xi32> loc(#loc168)
+      %r0_mask = arith.cmpi slt, %r0_index_21, %cst_10 : tensor<1x8xi32> loc(#loc169)
+      %tmp0 = arith.addi %r0_index_21, %cst_9 : tensor<1x8xi32> loc(#loc170)
+      %tmp0_22 = arith.muli %x0, %cst_8 : tensor<64x1xi32> loc(#loc171)
+      %tmp0_23 = tt.broadcast %tmp0 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc172)
+      %tmp0_24 = tt.broadcast %tmp0_22 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc172)
+      %tmp0_25 = arith.addi %tmp0_23, %tmp0_24 : tensor<64x8xi32> loc(#loc172)
+      %tmp0_26 = arith.muli %x1, %cst_7 : tensor<64x1xi32> loc(#loc173)
+      %tmp0_27 = tt.broadcast %tmp0_26 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc174)
+      %tmp0_28 = arith.addi %tmp0_25, %tmp0_27 : tensor<64x8xi32> loc(#loc174)
+      %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc175)
+      %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc175)
+      %tmp0_31 = tt.broadcast %r0_mask : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc176)
+      %tmp0_32 = tt.load %tmp0_30, %tmp0_31, %cst_0 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>> loc(#loc176)
+      %tmp0_33 = arith.extf %tmp0_32 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc177)
+      %tmp6 = tt.broadcast %r0_index_21 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc178)
+      %tmp6_34 = arith.addi %tmp6, %tmp0_24 : tensor<64x8xi32> loc(#loc178)
+      %tmp6_35 = arith.addi %tmp6_34, %tmp0_27 : tensor<64x8xi32> loc(#loc179)
+      %tmp6_36 = tt.addptr %tmp0_29, %tmp6_35 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc180)
+      %tmp6_37 = tt.load %tmp6_36, %tmp0_31, %cst_0 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>> loc(#loc181)
+      %tmp6_38 = arith.extf %tmp6_37 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc182)
+      %tmp2 = arith.mulf %tmp0_33, %tmp0_33 : tensor<64x8xf32> loc(#loc183)
+      %tmp5 = arith.addf %_tmp4, %tmp2 : tensor<64x8xf32> loc(#loc184)
+      %_tmp4_39 = arith.select %tmp0_31, %tmp5, %_tmp4 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc185)
+      %tmp8 = arith.mulf %tmp6_38, %tmp6_38 : tensor<64x8xf32> loc(#loc186)
+      %tmp11 = arith.addf %_tmp10_20, %tmp8 : tensor<64x8xf32> loc(#loc187)
+      %_tmp10_40 = arith.select %tmp0_31, %tmp11, %_tmp10_20 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc188)
+      scf.yield %_tmp4_39, %_tmp10_40 : tensor<64x8xf32>, tensor<64x8xf32> loc(#loc33)
+    } loc(#loc294)
+    %tmp4 = "tt.reduce"(%_tmp10#0) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_20: f32 loc(callsite(#loc1 at #loc189)), %tmp4_21: f32 loc(callsite(#loc1 at #loc189))):
+      %tmp4_22 = arith.addf %tmp4_20, %tmp4_21 : f32 loc(#loc299)
+      tt.reduce.return %tmp4_22 : f32 loc(#loc295)
+    }) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc295)
+    %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc190)
+    %tmp10 = "tt.reduce"(%_tmp10#1) <{axis = 1 : i32}> ({
+    ^bb0(%tmp10_20: f32 loc(callsite(#loc1 at #loc191)), %tmp10_21: f32 loc(callsite(#loc1 at #loc191))):
+      %tmp10_22 = arith.addf %tmp10_20, %tmp10_21 : f32 loc(#loc300)
+      tt.reduce.return %tmp10_22 : f32 loc(#loc297)
+    }) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc297)
+    %tmp10_19 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc192)
+    scf.for %r0_offset = %c0_i32 to %c128_i32 step %c8_i32  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x8xi32> loc(#loc193)
+      %r0_index_20 = arith.addi %r0_index, %r0_base_17 : tensor<1x8xi32> loc(#loc193)
+      %r0_mask = arith.cmpi slt, %r0_index_20, %cst_10 : tensor<1x8xi32> loc(#loc194)
+      %r0_3 = arith.remsi %r0_index_20, %cst_6 : tensor<1x8xi32> loc(#loc195)
+      %r0_4 = arith.divsi %r0_index_20, %cst_6 : tensor<1x8xi32> loc(#loc196)
+      %tmp50 = arith.muli %x0, %cst_8 : tensor<64x1xi32> loc(#loc197)
+      %tmp50_21 = tt.broadcast %r0_index_20 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc198)
+      %tmp50_22 = tt.broadcast %tmp50 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc198)
+      %tmp50_23 = arith.addi %tmp50_21, %tmp50_22 : tensor<64x8xi32> loc(#loc198)
+      %tmp50_24 = arith.muli %x1, %cst_7 : tensor<64x1xi32> loc(#loc199)
+      %tmp50_25 = tt.broadcast %tmp50_24 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc200)
+      %tmp50_26 = arith.addi %tmp50_23, %tmp50_25 : tensor<64x8xi32> loc(#loc200)
+      %tmp50_27 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc201)
+      %tmp50_28 = tt.addptr %tmp50_27, %tmp50_26 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc201)
+      %tmp50_29 = tt.broadcast %r0_mask : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc202)
+      %tmp50_30 = tt.load %tmp50_28, %tmp50_29, %cst_0 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>> loc(#loc202)
+      %tmp50_31 = arith.extf %tmp50_30 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc203)
+      %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x8x!tt.ptr<bf16>> loc(#loc204)
+      %tmp58_32 = tt.addptr %tmp58, %r0_index_20 : tensor<1x8x!tt.ptr<bf16>>, tensor<1x8xi32> loc(#loc204)
+      %tmp58_33 = tt.load %tmp58_32, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x8x!tt.ptr<bf16>> loc(#loc205)
+      %tmp58_34 = arith.extf %tmp58_33 : tensor<1x8xbf16> to tensor<1x8xf32> loc(#loc206)
+      %tmp63 = arith.muli %x1, %cst_8 : tensor<64x1xi32> loc(#loc207)
+      %tmp63_35 = tt.broadcast %tmp63 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc208)
+      %tmp63_36 = arith.addi %tmp50_21, %tmp63_35 : tensor<64x8xi32> loc(#loc208)
+      %tmp63_37 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<64x8x!tt.ptr<f32>> loc(#loc209)
+      %tmp63_38 = tt.addptr %tmp63_37, %tmp63_36 : tensor<64x8x!tt.ptr<f32>>, tensor<64x8xi32> loc(#loc209)
+      %tmp63_39 = tt.load %tmp63_38, %tmp50_29, %cst_11 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<f32>> loc(#loc210)
+      %tmp66 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<64x8x!tt.ptr<f32>> loc(#loc211)
+      %tmp66_40 = tt.addptr %tmp66, %tmp63_36 : tensor<64x8x!tt.ptr<f32>>, tensor<64x8xi32> loc(#loc211)
+      %tmp66_41 = tt.load %tmp66_40, %tmp50_29, %cst_11 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<f32>> loc(#loc212)
+      %tmp96 = arith.addi %r0_index_20, %cst_9 : tensor<1x8xi32> loc(#loc213)
+      %tmp96_42 = tt.broadcast %tmp96 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc214)
+      %tmp96_43 = arith.addi %tmp96_42, %tmp50_22 : tensor<64x8xi32> loc(#loc214)
+      %tmp96_44 = arith.addi %tmp96_43, %tmp50_25 : tensor<64x8xi32> loc(#loc215)
+      %tmp96_45 = tt.addptr %tmp50_27, %tmp96_44 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc216)
+      %tmp96_46 = tt.load %tmp96_45, %tmp50_29, %cst_0 evictionPolicy = evict_first : tensor<64x8x!tt.ptr<bf16>> loc(#loc217)
+      %tmp96_47 = arith.extf %tmp96_46 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc218)
+      %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x8x!tt.ptr<bf16>> loc(#loc219)
+      %tmp102_48 = tt.addptr %tmp102, %r0_index_20 : tensor<1x8x!tt.ptr<bf16>>, tensor<1x8xi32> loc(#loc219)
+      %tmp102_49 = tt.load %tmp102_48, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x8x!tt.ptr<bf16>> loc(#loc220)
+      %tmp102_50 = arith.extf %tmp102_49 : tensor<1x8xbf16> to tensor<1x8xf32> loc(#loc221)
+      %tmp16 = arith.extsi %r0_3 : tensor<1x8xi32> to tensor<1x8xi64> loc(#loc222)
+      %tmp16_51 = arith.cmpi slt, %tmp16, %cst_5 : tensor<1x8xi64> loc(#loc222)
+      %tmp17 = arith.muli %r0_4, %cst_6 : tensor<1x8xi32> loc(#loc223)
+      %tmp17_52 = arith.addi %tmp17, %cst_4 : tensor<1x8xi32> loc(#loc224)
+      %tmp17_53 = tt.broadcast %tmp17_52 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc225)
+      %tmp17_54 = arith.addi %tmp17_53, %tmp50_22 : tensor<64x8xi32> loc(#loc225)
+      %tmp17_55 = arith.addi %tmp17_54, %tmp50_25 : tensor<64x8xi32> loc(#loc226)
+      %tmp17_56 = tt.addptr %tmp50_27, %tmp17_55 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc227)
+      %tmp17_57 = arith.andi %r0_mask, %tmp16_51 : tensor<1x8xi1> loc(#loc228)
+      %tmp17_58 = tt.broadcast %tmp17_57 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc229)
+      %tmp17_59 = tt.load %tmp17_56, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>> loc(#loc229)
+      %tmp17_60 = arith.extf %tmp17_59 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc230)
+      %tmp20 = arith.divf %tmp10_19, %cst_3 : tensor<64x1xf32> loc(#loc231)
+      %tmp22 = arith.addf %tmp20, %cst_2 : tensor<64x1xf32> loc(#loc232)
+      %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc233)
+      %tmp24 = tt.broadcast %tmp23 : tensor<64x1xf32> -> tensor<64x8xf32> loc(#loc234)
+      %tmp24_61 = arith.mulf %tmp17_60, %tmp24 : tensor<64x8xf32> loc(#loc234)
+      %tmp25 = tt.addptr %tmp58, %tmp17_52 : tensor<1x8x!tt.ptr<bf16>>, tensor<1x8xi32> loc(#loc235)
+      %tmp25_62 = tt.broadcast %tmp25 : tensor<1x8x!tt.ptr<bf16>> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc235)
+      %tmp25_63 = tt.load %tmp25_62, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>> loc(#loc236)
+      %tmp25_64 = arith.extf %tmp25_63 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc237)
+      %tmp27 = arith.mulf %tmp24_61, %tmp25_64 : tensor<64x8xf32> loc(#loc238)
+      %tmp29 = arith.subf %cst_11, %tmp27 : tensor<64x8xf32> loc(#loc239)
+      %tmp31 = tt.broadcast %tmp16_51 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc240)
+      %tmp31_65 = arith.select %tmp31, %tmp29, %cst_11 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc240)
+      %tmp32 = arith.cmpi sge, %tmp16, %cst_5 : tensor<1x8xi64> loc(#loc241)
+      %tmp35 = tt.broadcast %tmp17 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc242)
+      %tmp35_66 = arith.addi %tmp35, %tmp50_22 : tensor<64x8xi32> loc(#loc242)
+      %tmp35_67 = arith.addi %tmp35_66, %tmp50_25 : tensor<64x8xi32> loc(#loc243)
+      %tmp35_68 = tt.addptr %tmp50_27, %tmp35_67 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc244)
+      %tmp35_69 = arith.andi %r0_mask, %tmp32 : tensor<1x8xi1> loc(#loc245)
+      %tmp35_70 = tt.broadcast %tmp35_69 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc246)
+      %tmp35_71 = tt.load %tmp35_68, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>> loc(#loc246)
+      %tmp35_72 = arith.extf %tmp35_71 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc247)
+      %tmp42 = arith.mulf %tmp35_72, %tmp24 : tensor<64x8xf32> loc(#loc248)
+      %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x8x!tt.ptr<bf16>>, tensor<1x8xi32> loc(#loc249)
+      %tmp43_73 = tt.broadcast %tmp43 : tensor<1x8x!tt.ptr<bf16>> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc249)
+      %tmp43_74 = tt.load %tmp43_73, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>> loc(#loc250)
+      %tmp43_75 = arith.extf %tmp43_74 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc251)
+      %tmp45 = arith.mulf %tmp42, %tmp43_75 : tensor<64x8xf32> loc(#loc252)
+      %tmp48 = tt.broadcast %tmp32 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc253)
+      %tmp48_76 = arith.select %tmp48, %tmp45, %cst_11 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc253)
+      %tmp49 = arith.select %tmp31, %tmp31_65, %tmp48_76 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc254)
+      %tmp57 = arith.mulf %tmp50_31, %tmp24 : tensor<64x8xf32> loc(#loc255)
+      %tmp60 = tt.broadcast %tmp58_34 : tensor<1x8xf32> -> tensor<64x8xf32> loc(#loc256)
+      %tmp60_77 = arith.mulf %tmp57, %tmp60 : tensor<64x8xf32> loc(#loc256)
+      %tmp64 = arith.mulf %tmp60_77, %tmp63_39 : tensor<64x8xf32> loc(#loc257)
+      %tmp67 = arith.mulf %tmp49, %tmp66_41 : tensor<64x8xf32> loc(#loc258)
+      %tmp68 = arith.addf %tmp64, %tmp67 : tensor<64x8xf32> loc(#loc259)
+      %tmp70 = arith.addi %tmp17, %cst_1 : tensor<1x8xi32> loc(#loc260)
+      %tmp70_78 = tt.broadcast %tmp70 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc261)
+      %tmp70_79 = arith.addi %tmp70_78, %tmp50_22 : tensor<64x8xi32> loc(#loc261)
+      %tmp70_80 = arith.addi %tmp70_79, %tmp50_25 : tensor<64x8xi32> loc(#loc262)
+      %tmp70_81 = tt.addptr %tmp50_27, %tmp70_80 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc263)
+      %tmp70_82 = tt.load %tmp70_81, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>> loc(#loc264)
+      %tmp70_83 = arith.extf %tmp70_82 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc265)
+      %tmp72 = arith.divf %tmp4_18, %cst_3 : tensor<64x1xf32> loc(#loc266)
+      %tmp73 = arith.addf %tmp72, %cst_2 : tensor<64x1xf32> loc(#loc267)
+      %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc268)
+      %tmp75 = tt.broadcast %tmp74 : tensor<64x1xf32> -> tensor<64x8xf32> loc(#loc269)
+      %tmp75_84 = arith.mulf %tmp70_83, %tmp75 : tensor<64x8xf32> loc(#loc269)
+      %tmp76 = tt.addptr %tmp102, %tmp17_52 : tensor<1x8x!tt.ptr<bf16>>, tensor<1x8xi32> loc(#loc270)
+      %tmp76_85 = tt.broadcast %tmp76 : tensor<1x8x!tt.ptr<bf16>> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc270)
+      %tmp76_86 = tt.load %tmp76_85, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>> loc(#loc271)
+      %tmp76_87 = arith.extf %tmp76_86 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc272)
+      %tmp78 = arith.mulf %tmp75_84, %tmp76_87 : tensor<64x8xf32> loc(#loc273)
+      %tmp80 = arith.subf %cst_11, %tmp78 : tensor<64x8xf32> loc(#loc274)
+      %tmp82 = arith.select %tmp31, %tmp80, %cst_11 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc275)
+      %tmp83 = arith.addi %tmp17, %cst_9 : tensor<1x8xi32> loc(#loc276)
+      %tmp83_88 = tt.broadcast %tmp83 : tensor<1x8xi32> -> tensor<64x8xi32> loc(#loc277)
+      %tmp83_89 = arith.addi %tmp83_88, %tmp50_22 : tensor<64x8xi32> loc(#loc277)
+      %tmp83_90 = arith.addi %tmp83_89, %tmp50_25 : tensor<64x8xi32> loc(#loc278)
+      %tmp83_91 = tt.addptr %tmp50_27, %tmp83_90 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc279)
+      %tmp83_92 = tt.load %tmp83_91, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>> loc(#loc280)
+      %tmp83_93 = arith.extf %tmp83_92 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc281)
+      %tmp88 = arith.mulf %tmp83_93, %tmp75 : tensor<64x8xf32> loc(#loc282)
+      %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x8x!tt.ptr<bf16>>, tensor<1x8xi32> loc(#loc283)
+      %tmp89_94 = tt.broadcast %tmp89 : tensor<1x8x!tt.ptr<bf16>> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc283)
+      %tmp89_95 = tt.load %tmp89_94, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<64x8x!tt.ptr<bf16>> loc(#loc284)
+      %tmp89_96 = arith.extf %tmp89_95 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc285)
+      %tmp91 = arith.mulf %tmp88, %tmp89_96 : tensor<64x8xf32> loc(#loc286)
+      %tmp94 = arith.select %tmp48, %tmp91, %cst_11 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc287)
+      %tmp95 = arith.select %tmp31, %tmp82, %tmp94 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc288)
+      %tmp101 = arith.mulf %tmp96_47, %tmp75 : tensor<64x8xf32> loc(#loc289)
+      %tmp104 = tt.broadcast %tmp102_50 : tensor<1x8xf32> -> tensor<64x8xf32> loc(#loc290)
+      %tmp104_97 = arith.mulf %tmp101, %tmp104 : tensor<64x8xf32> loc(#loc290)
+      %tmp107 = arith.mulf %tmp104_97, %tmp63_39 : tensor<64x8xf32> loc(#loc291)
+      %tmp109 = arith.mulf %tmp95, %tmp66_41 : tensor<64x8xf32> loc(#loc292)
+      %tmp110 = arith.addf %tmp107, %tmp109 : tensor<64x8xf32> loc(#loc293)
+      %0 = arith.muli %xindex_16, %cst_8 : tensor<64x1xi32> loc(#loc142)
+      %1 = tt.broadcast %0 : tensor<64x1xi32> -> tensor<64x8xi32> loc(#loc143)
+      %2 = arith.addi %tmp50_21, %1 : tensor<64x8xi32> loc(#loc143)
+      %3 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc144)
+      %4 = tt.addptr %3, %2 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc144)
+      %5 = arith.truncf %tmp68 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc145)
+      tt.store %4, %5, %tmp50_29 : tensor<64x8x!tt.ptr<bf16>> loc(#loc145)
+      %6 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<64x8x!tt.ptr<bf16>> loc(#loc146)
+      %7 = tt.addptr %6, %2 : tensor<64x8x!tt.ptr<bf16>>, tensor<64x8xi32> loc(#loc146)
+      %8 = arith.truncf %tmp110 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc147)
+      tt.store %7, %8, %tmp50_29 : tensor<64x8x!tt.ptr<bf16>> loc(#loc147)
+    } loc(#loc40)
+    tt.return loc(#loc148)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc158 = loc("xoffset"(#loc2))
+#loc159 = loc("xoffset"(#loc3))
+#loc160 = loc("xindex"(#loc4))
+#loc161 = loc("xindex"(#loc5))
+#loc162 = loc("xindex"(#loc6))
+#loc163 = loc("r0_base"(#loc7))
+#loc164 = loc("r0_base"(#loc8))
+#loc165 = loc("x0"(#loc9))
+#loc166 = loc("x1"(#loc10))
+#loc167 = loc("_tmp4"(#loc11))
+#loc168 = loc("r0_index"(#loc12))
+#loc169 = loc("r0_mask"(#loc13))
+#loc170 = loc("tmp0"(#loc14))
+#loc171 = loc("tmp0"(#loc15))
+#loc172 = loc("tmp0"(#loc16))
+#loc173 = loc("tmp0"(#loc17))
+#loc174 = loc("tmp0"(#loc18))
+#loc175 = loc("tmp0"(#loc19))
+#loc176 = loc("tmp0"(#loc20))
+#loc177 = loc("tmp0"(#loc21))
+#loc178 = loc("tmp6"(#loc22))
+#loc179 = loc("tmp6"(#loc23))
+#loc180 = loc("tmp6"(#loc24))
+#loc181 = loc("tmp6"(#loc25))
+#loc182 = loc("tmp6"(#loc26))
+#loc183 = loc("tmp2"(#loc27))
+#loc184 = loc("tmp5"(#loc28))
+#loc185 = loc("_tmp4"(#loc29))
+#loc186 = loc("tmp8"(#loc30))
+#loc187 = loc("tmp11"(#loc31))
+#loc188 = loc("_tmp10"(#loc32))
+#loc190 = loc("tmp4"(#loc37))
+#loc192 = loc("tmp10"(#loc39))
+#loc193 = loc("r0_index"(#loc41))
+#loc194 = loc("r0_mask"(#loc42))
+#loc195 = loc("r0_3"(#loc43))
+#loc196 = loc("r0_4"(#loc44))
+#loc197 = loc("tmp50"(#loc45))
+#loc198 = loc("tmp50"(#loc46))
+#loc199 = loc("tmp50"(#loc47))
+#loc200 = loc("tmp50"(#loc48))
+#loc201 = loc("tmp50"(#loc49))
+#loc202 = loc("tmp50"(#loc50))
+#loc203 = loc("tmp50"(#loc51))
+#loc204 = loc("tmp58"(#loc52))
+#loc205 = loc("tmp58"(#loc53))
+#loc206 = loc("tmp58"(#loc54))
+#loc207 = loc("tmp63"(#loc55))
+#loc208 = loc("tmp63"(#loc56))
+#loc209 = loc("tmp63"(#loc57))
+#loc210 = loc("tmp63"(#loc58))
+#loc211 = loc("tmp66"(#loc59))
+#loc212 = loc("tmp66"(#loc60))
+#loc213 = loc("tmp96"(#loc61))
+#loc214 = loc("tmp96"(#loc62))
+#loc215 = loc("tmp96"(#loc63))
+#loc216 = loc("tmp96"(#loc64))
+#loc217 = loc("tmp96"(#loc65))
+#loc218 = loc("tmp96"(#loc66))
+#loc219 = loc("tmp102"(#loc67))
+#loc220 = loc("tmp102"(#loc68))
+#loc221 = loc("tmp102"(#loc69))
+#loc222 = loc("tmp16"(#loc70))
+#loc223 = loc("tmp17"(#loc71))
+#loc224 = loc("tmp17"(#loc72))
+#loc225 = loc("tmp17"(#loc73))
+#loc226 = loc("tmp17"(#loc74))
+#loc227 = loc("tmp17"(#loc75))
+#loc228 = loc("tmp17"(#loc76))
+#loc229 = loc("tmp17"(#loc77))
+#loc230 = loc("tmp17"(#loc78))
+#loc231 = loc("tmp20"(#loc79))
+#loc232 = loc("tmp22"(#loc80))
+#loc233 = loc("tmp23"(#loc81))
+#loc234 = loc("tmp24"(#loc82))
+#loc235 = loc("tmp25"(#loc83))
+#loc236 = loc("tmp25"(#loc84))
+#loc237 = loc("tmp25"(#loc85))
+#loc238 = loc("tmp27"(#loc86))
+#loc239 = loc("tmp29"(#loc87))
+#loc240 = loc("tmp31"(#loc88))
+#loc241 = loc("tmp32"(#loc89))
+#loc242 = loc("tmp35"(#loc90))
+#loc243 = loc("tmp35"(#loc91))
+#loc244 = loc("tmp35"(#loc92))
+#loc245 = loc("tmp35"(#loc93))
+#loc246 = loc("tmp35"(#loc94))
+#loc247 = loc("tmp35"(#loc95))
+#loc248 = loc("tmp42"(#loc96))
+#loc249 = loc("tmp43"(#loc97))
+#loc250 = loc("tmp43"(#loc98))
+#loc251 = loc("tmp43"(#loc99))
+#loc252 = loc("tmp45"(#loc100))
+#loc253 = loc("tmp48"(#loc101))
+#loc254 = loc("tmp49"(#loc102))
+#loc255 = loc("tmp57"(#loc103))
+#loc256 = loc("tmp60"(#loc104))
+#loc257 = loc("tmp64"(#loc105))
+#loc258 = loc("tmp67"(#loc106))
+#loc259 = loc("tmp68"(#loc107))
+#loc260 = loc("tmp70"(#loc108))
+#loc261 = loc("tmp70"(#loc109))
+#loc262 = loc("tmp70"(#loc110))
+#loc263 = loc("tmp70"(#loc111))
+#loc264 = loc("tmp70"(#loc112))
+#loc265 = loc("tmp70"(#loc113))
+#loc266 = loc("tmp72"(#loc114))
+#loc267 = loc("tmp73"(#loc115))
+#loc268 = loc("tmp74"(#loc116))
+#loc269 = loc("tmp75"(#loc117))
+#loc270 = loc("tmp76"(#loc118))
+#loc271 = loc("tmp76"(#loc119))
+#loc272 = loc("tmp76"(#loc120))
+#loc273 = loc("tmp78"(#loc121))
+#loc274 = loc("tmp80"(#loc122))
+#loc275 = loc("tmp82"(#loc123))
+#loc276 = loc("tmp83"(#loc124))
+#loc277 = loc("tmp83"(#loc125))
+#loc278 = loc("tmp83"(#loc126))
+#loc279 = loc("tmp83"(#loc127))
+#loc280 = loc("tmp83"(#loc128))
+#loc281 = loc("tmp83"(#loc129))
+#loc282 = loc("tmp88"(#loc130))
+#loc283 = loc("tmp89"(#loc131))
+#loc284 = loc("tmp89"(#loc132))
+#loc285 = loc("tmp89"(#loc133))
+#loc286 = loc("tmp91"(#loc134))
+#loc287 = loc("tmp94"(#loc135))
+#loc288 = loc("tmp95"(#loc136))
+#loc289 = loc("tmp101"(#loc137))
+#loc290 = loc("tmp104"(#loc138))
+#loc291 = loc("tmp107"(#loc139))
+#loc292 = loc("tmp109"(#loc140))
+#loc293 = loc("tmp110"(#loc141))
+#loc294 = loc("_tmp10"(#loc167))
+#loc295 = loc(callsite(#loc34 at #loc189))
+#loc297 = loc(callsite(#loc34 at #loc191))
+#loc299 = loc(callsite(#loc36 at #loc295))
+#loc300 = loc(callsite(#loc36 at #loc297))
diff --git a/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/__grp__triton_red_fused_add_mul_native_layer_norm_1.json b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/__grp__triton_red_fused_add_mul_native_layer_norm_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..5da8c70ec4563eaad58dd39c369f70778d094060
--- /dev/null
+++ b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/__grp__triton_red_fused_add_mul_native_layer_norm_1.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused_add_mul_native_layer_norm_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.source", "triton_red_fused_add_mul_native_layer_norm_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.ttir", "triton_red_fused_add_mul_native_layer_norm_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.ttgir", "triton_red_fused_add_mul_native_layer_norm_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.llir", "triton_red_fused_add_mul_native_layer_norm_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.ptx", "triton_red_fused_add_mul_native_layer_norm_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.cubin", "triton_red_fused_add_mul_native_layer_norm_1.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.json"}}
\ No newline at end of file
diff --git a/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.cubin b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..b32ea6cb835745ced51fd66e3138059d7fcb0051
Binary files /dev/null and b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.cubin differ
diff --git a/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.json b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..c796febb9bed30f3999a3c4067529a2757119d6c
--- /dev/null
+++ b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.json
@@ -0,0 +1 @@
+{"hash": "0a6a6fb84bfa69db7f7e1fbe544fcf86c6c18bef5d2113b0846f572ecd2865ee", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 192, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_add_mul_native_layer_norm_1"}
\ No newline at end of file
diff --git a/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.llir b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.llir
new file mode 100644
index 0000000000000000000000000000000000000000..d6b74e653c2151463c36afcc2ceea324a05a08a2
--- /dev/null
+++ b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.llir
@@ -0,0 +1,565 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external addrspace(3) global [0 x i8], align 16
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused_add_mul_native_layer_norm_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !5 {
+__nv_rsqrtf.exit:
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8
+  %9 = icmp samesign ult i32 %8, 256, !dbg !9
+  %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %11 = shl nuw nsw i32 %10, 2, !dbg !10
+  %12 = and i32 %11, 2044, !dbg !10
+  %13 = shl i32 %8, 12, !dbg !11
+  %14 = or disjoint i32 %12, %13
+  %15 = sext i32 %14 to i64, !dbg !12
+  %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !13
+  %17 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14
+  %18 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %16, i64 %17, i1 %9) #6, !dbg !14
+  %19 = extractvalue { i32, i32 } %18, 1, !dbg !14
+  %20 = bitcast i32 %19 to <2 x bfloat>, !dbg !14
+  %21 = extractelement <2 x bfloat> %20, i64 1, !dbg !14
+  %22 = fpext bfloat %21 to float, !dbg !15
+  %23 = extractelement <2 x bfloat> %20, i64 0, !dbg !14
+  %24 = fpext bfloat %23 to float, !dbg !15
+  %25 = extractvalue { i32, i32 } %18, 0, !dbg !14
+  %26 = bitcast i32 %25 to <2 x bfloat>, !dbg !14
+  %27 = extractelement <2 x bfloat> %26, i64 1, !dbg !14
+  %28 = fpext bfloat %27 to float, !dbg !15
+  %29 = extractelement <2 x bfloat> %26, i64 0, !dbg !14
+  %30 = fpext bfloat %29 to float, !dbg !15
+  %31 = select i1 %9, float %30, float 0.000000e+00, !dbg !16
+  %32 = select i1 %9, float %28, float 0.000000e+00, !dbg !16
+  %33 = select i1 %9, float %24, float 0.000000e+00, !dbg !16
+  %34 = select i1 %9, float %22, float 0.000000e+00, !dbg !16
+  %35 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !13
+  %36 = getelementptr i8, ptr addrspace(1) %35, i64 4096, !dbg !13
+  %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14
+  %38 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %36, i64 %37, i1 %9) #6, !dbg !14
+  %39 = extractvalue { i32, i32 } %38, 0, !dbg !14
+  %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !14
+  %41 = extractelement <2 x bfloat> %40, i64 0, !dbg !14
+  %42 = fpext bfloat %41 to float, !dbg !15
+  %43 = fsub float %42, %31, !dbg !17
+  %44 = select i1 %9, float 2.000000e+00, float 1.000000e+00, !dbg !22
+  %45 = tail call float @llvm.nvvm.div.full(float %43, float %44), !dbg !23
+  %46 = fadd float %31, %45, !dbg !24
+  %47 = fsub float %42, %46, !dbg !25
+  %48 = fmul float %43, %47, !dbg !26
+  %49 = fadd float %48, 0.000000e+00, !dbg !27
+  %50 = extractelement <2 x bfloat> %40, i64 1, !dbg !14
+  %51 = fpext bfloat %50 to float, !dbg !15
+  %52 = fsub float %51, %32, !dbg !17
+  %53 = tail call float @llvm.nvvm.div.full(float %52, float %44), !dbg !23
+  %54 = fadd float %32, %53, !dbg !24
+  %55 = fsub float %51, %54, !dbg !25
+  %56 = fmul float %52, %55, !dbg !26
+  %57 = fadd float %56, 0.000000e+00, !dbg !27
+  %58 = extractvalue { i32, i32 } %38, 1, !dbg !14
+  %59 = bitcast i32 %58 to <2 x bfloat>, !dbg !14
+  %60 = extractelement <2 x bfloat> %59, i64 0, !dbg !14
+  %61 = fpext bfloat %60 to float, !dbg !15
+  %62 = fsub float %61, %33, !dbg !17
+  %63 = tail call float @llvm.nvvm.div.full(float %62, float %44), !dbg !23
+  %64 = fadd float %33, %63, !dbg !24
+  %65 = fsub float %61, %64, !dbg !25
+  %66 = fmul float %62, %65, !dbg !26
+  %67 = fadd float %66, 0.000000e+00, !dbg !27
+  %68 = extractelement <2 x bfloat> %59, i64 1, !dbg !14
+  %69 = fpext bfloat %68 to float, !dbg !15
+  %70 = fsub float %69, %34, !dbg !17
+  %71 = tail call float @llvm.nvvm.div.full(float %70, float %44), !dbg !23
+  %72 = fadd float %34, %71, !dbg !24
+  %73 = fsub float %69, %72, !dbg !25
+  %74 = fmul float %70, %73, !dbg !26
+  %75 = fadd float %74, 0.000000e+00, !dbg !27
+  %76 = select i1 %9, float %46, float 0.000000e+00, !dbg !16
+  %77 = select i1 %9, float %54, float 0.000000e+00, !dbg !16
+  %78 = select i1 %9, float %64, float 0.000000e+00, !dbg !16
+  %79 = select i1 %9, float %72, float 0.000000e+00, !dbg !16
+  %80 = select i1 %9, float %67, float 0.000000e+00, !dbg !28
+  %81 = select i1 %9, float %75, float 0.000000e+00, !dbg !28
+  %82 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !22
+  %83 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !22
+  %84 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !22
+  %85 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !22
+  %86 = and i32 %10, 511, !dbg !10
+  %87 = and i32 %10, 31, !dbg !10
+  %88 = lshr i32 %86, 5, !dbg !10
+  %89 = fsub float %77, %76, !dbg !29
+  %90 = select i1 %9, float 4.000000e+00, float 0.000000e+00, !dbg !32
+  %91 = fcmp oeq float %90, 0.000000e+00, !dbg !33
+  %92 = tail call float @llvm.nvvm.div.full(float %83, float %90), !dbg !34
+  %93 = select i1 %91, float 0.000000e+00, float %92, !dbg !35
+  %94 = fmul float %89, %93, !dbg !36
+  %95 = fadd float %76, %94, !dbg !37
+  %96 = fadd float %49, %57, !dbg !38
+  %97 = select i1 %9, float %96, float 0.000000e+00, !dbg !38
+  %98 = fmul float %89, %89, !dbg !39
+  %99 = fmul float %98, %82, !dbg !40
+  %100 = fmul float %99, %93, !dbg !41
+  %101 = fadd float %97, %100, !dbg !42
+  %102 = fsub float %78, %95, !dbg !29
+  %103 = select i1 %9, float 6.000000e+00, float 0.000000e+00, !dbg !32
+  %104 = fcmp oeq float %103, 0.000000e+00, !dbg !33
+  %105 = tail call float @llvm.nvvm.div.full(float %84, float %103), !dbg !34
+  %106 = select i1 %104, float 0.000000e+00, float %105, !dbg !35
+  %107 = fmul float %106, %102, !dbg !36
+  %108 = fadd float %95, %107, !dbg !37
+  %109 = fadd float %80, %101, !dbg !38
+  %110 = fmul float %102, %102, !dbg !39
+  %111 = fmul float %90, %110, !dbg !40
+  %112 = fmul float %106, %111, !dbg !41
+  %113 = fadd float %109, %112, !dbg !42
+  %114 = fsub float %79, %108, !dbg !29
+  %115 = select i1 %9, float 8.000000e+00, float 0.000000e+00, !dbg !32
+  %116 = fcmp oeq float %115, 0.000000e+00, !dbg !33
+  %117 = tail call float @llvm.nvvm.div.full(float %85, float %115), !dbg !34
+  %118 = select i1 %116, float 0.000000e+00, float %117, !dbg !35
+  %119 = fmul float %118, %114, !dbg !36
+  %120 = fadd float %108, %119, !dbg !37
+  %121 = fadd float %81, %113, !dbg !38
+  %122 = fmul float %114, %114, !dbg !39
+  %123 = fmul float %103, %122, !dbg !40
+  %124 = fmul float %118, %123, !dbg !41
+  %125 = fadd float %121, %124, !dbg !42
+  %126 = bitcast float %120 to i32, !dbg !30
+  %127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 16, i32 31), !dbg !30
+  %128 = bitcast i32 %127 to float, !dbg !30
+  %129 = bitcast float %125 to i32, !dbg !30
+  %130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 16, i32 31), !dbg !30
+  %131 = bitcast i32 %130 to float, !dbg !30
+  %132 = bitcast float %115 to i32, !dbg !30
+  %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 16, i32 31), !dbg !30
+  %134 = bitcast i32 %133 to float, !dbg !30
+  %135 = fsub float %128, %120, !dbg !29
+  %136 = fadd float %115, %134, !dbg !32
+  %137 = fcmp oeq float %136, 0.000000e+00, !dbg !33
+  %138 = tail call float @llvm.nvvm.div.full(float %134, float %136), !dbg !34
+  %139 = select i1 %137, float 0.000000e+00, float %138, !dbg !35
+  %140 = fmul float %139, %135, !dbg !36
+  %141 = fadd float %120, %140, !dbg !37
+  %142 = fadd float %125, %131, !dbg !38
+  %143 = fmul float %135, %135, !dbg !39
+  %144 = fmul float %115, %143, !dbg !40
+  %145 = fmul float %139, %144, !dbg !41
+  %146 = fadd float %142, %145, !dbg !42
+  %147 = bitcast float %141 to i32, !dbg !30
+  %148 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %147, i32 8, i32 31), !dbg !30
+  %149 = bitcast i32 %148 to float, !dbg !30
+  %150 = bitcast float %146 to i32, !dbg !30
+  %151 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %150, i32 8, i32 31), !dbg !30
+  %152 = bitcast i32 %151 to float, !dbg !30
+  %153 = bitcast float %136 to i32, !dbg !30
+  %154 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 8, i32 31), !dbg !30
+  %155 = bitcast i32 %154 to float, !dbg !30
+  %156 = fsub float %149, %141, !dbg !29
+  %157 = fadd float %136, %155, !dbg !32
+  %158 = fcmp oeq float %157, 0.000000e+00, !dbg !33
+  %159 = tail call float @llvm.nvvm.div.full(float %155, float %157), !dbg !34
+  %160 = select i1 %158, float 0.000000e+00, float %159, !dbg !35
+  %161 = fmul float %156, %160, !dbg !36
+  %162 = fadd float %141, %161, !dbg !37
+  %163 = fadd float %146, %152, !dbg !38
+  %164 = fmul float %156, %156, !dbg !39
+  %165 = fmul float %136, %164, !dbg !40
+  %166 = fmul float %160, %165, !dbg !41
+  %167 = fadd float %163, %166, !dbg !42
+  %168 = bitcast float %162 to i32, !dbg !30
+  %169 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %168, i32 4, i32 31), !dbg !30
+  %170 = bitcast i32 %169 to float, !dbg !30
+  %171 = bitcast float %167 to i32, !dbg !30
+  %172 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %171, i32 4, i32 31), !dbg !30
+  %173 = bitcast i32 %172 to float, !dbg !30
+  %174 = bitcast float %157 to i32, !dbg !30
+  %175 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %174, i32 4, i32 31), !dbg !30
+  %176 = bitcast i32 %175 to float, !dbg !30
+  %177 = fsub float %170, %162, !dbg !29
+  %178 = fadd float %157, %176, !dbg !32
+  %179 = fcmp oeq float %178, 0.000000e+00, !dbg !33
+  %180 = tail call float @llvm.nvvm.div.full(float %176, float %178), !dbg !34
+  %181 = select i1 %179, float 0.000000e+00, float %180, !dbg !35
+  %182 = fmul float %177, %181, !dbg !36
+  %183 = fadd float %162, %182, !dbg !37
+  %184 = fadd float %167, %173, !dbg !38
+  %185 = fmul float %177, %177, !dbg !39
+  %186 = fmul float %157, %185, !dbg !40
+  %187 = fmul float %181, %186, !dbg !41
+  %188 = fadd float %184, %187, !dbg !42
+  %189 = bitcast float %183 to i32, !dbg !30
+  %190 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %189, i32 2, i32 31), !dbg !30
+  %191 = bitcast i32 %190 to float, !dbg !30
+  %192 = bitcast float %188 to i32, !dbg !30
+  %193 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %192, i32 2, i32 31), !dbg !30
+  %194 = bitcast i32 %193 to float, !dbg !30
+  %195 = bitcast float %178 to i32, !dbg !30
+  %196 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %195, i32 2, i32 31), !dbg !30
+  %197 = bitcast i32 %196 to float, !dbg !30
+  %198 = fsub float %191, %183, !dbg !29
+  %199 = fadd float %178, %197, !dbg !32
+  %200 = fcmp oeq float %199, 0.000000e+00, !dbg !33
+  %201 = tail call float @llvm.nvvm.div.full(float %197, float %199), !dbg !34
+  %202 = select i1 %200, float 0.000000e+00, float %201, !dbg !35
+  %203 = fmul float %198, %202, !dbg !36
+  %204 = fadd float %183, %203, !dbg !37
+  %205 = fadd float %188, %194, !dbg !38
+  %206 = fmul float %198, %198, !dbg !39
+  %207 = fmul float %178, %206, !dbg !40
+  %208 = fmul float %202, %207, !dbg !41
+  %209 = fadd float %205, %208, !dbg !42
+  %210 = bitcast float %204 to i32, !dbg !30
+  %211 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %210, i32 1, i32 31), !dbg !30
+  %212 = bitcast i32 %211 to float, !dbg !30
+  %213 = bitcast float %209 to i32, !dbg !30
+  %214 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %213, i32 1, i32 31), !dbg !30
+  %215 = bitcast i32 %214 to float, !dbg !30
+  %216 = bitcast float %199 to i32, !dbg !30
+  %217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 1, i32 31), !dbg !30
+  %218 = bitcast i32 %217 to float, !dbg !30
+  %219 = fsub float %212, %204, !dbg !29
+  %220 = fadd float %199, %218, !dbg !32
+  %221 = fcmp oeq float %220, 0.000000e+00, !dbg !33
+  %222 = tail call float @llvm.nvvm.div.full(float %218, float %220), !dbg !34
+  %223 = select i1 %221, float 0.000000e+00, float %222, !dbg !35
+  %224 = fmul float %219, %223, !dbg !36
+  %225 = fadd float %204, %224, !dbg !37
+  %226 = fadd float %209, %215, !dbg !38
+  %227 = fmul float %219, %219, !dbg !39
+  %228 = fmul float %199, %227, !dbg !40
+  %229 = fmul float %223, %228, !dbg !41
+  %230 = fadd float %226, %229, !dbg !42
+  %231 = icmp eq i32 %87, 0, !dbg !30
+  %232 = getelementptr float, ptr addrspace(3) @global_smem, i32 %88, !dbg !30
+  %233 = bitcast float %225 to <1 x i32>, !dbg !30
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %232, <1 x i32> %233, i1 %231) #6, !dbg !30
+  %234 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %88, !dbg !30
+  %235 = bitcast float %230 to <1 x i32>, !dbg !30
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %234, <1 x i32> %235, i1 %231) #6, !dbg !30
+  %236 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %88, !dbg !30
+  %237 = bitcast float %220 to <1 x i32>, !dbg !30
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %236, <1 x i32> %237, i1 %231) #6, !dbg !30
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !30
+  %238 = icmp samesign ult i32 %86, 16, !dbg !30
+  %239 = getelementptr float, ptr addrspace(3) @global_smem, i32 %86, !dbg !30
+  %240 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %239, i1 %238) #6, !dbg !30
+  %241 = bitcast i32 %240 to float, !dbg !30
+  %242 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %86, !dbg !30
+  %243 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %242, i1 %238) #6, !dbg !30
+  %244 = bitcast i32 %243 to float, !dbg !30
+  %245 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %86, !dbg !30
+  %246 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %245, i1 %238) #6, !dbg !30
+  %247 = bitcast i32 %246 to float, !dbg !30
+  %248 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %240, i32 8, i32 31), !dbg !30
+  %249 = bitcast i32 %248 to float, !dbg !30
+  %250 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %243, i32 8, i32 31), !dbg !30
+  %251 = bitcast i32 %250 to float, !dbg !30
+  %252 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %246, i32 8, i32 31), !dbg !30
+  %253 = bitcast i32 %252 to float, !dbg !30
+  %254 = fsub float %249, %241, !dbg !29
+  %255 = fadd float %247, %253, !dbg !32
+  %256 = fcmp oeq float %255, 0.000000e+00, !dbg !33
+  %257 = tail call float @llvm.nvvm.div.full(float %253, float %255), !dbg !34
+  %258 = select i1 %256, float 0.000000e+00, float %257, !dbg !35
+  %259 = fmul float %254, %258, !dbg !36
+  %260 = fadd float %259, %241, !dbg !37
+  %261 = fadd float %244, %251, !dbg !38
+  %262 = fmul float %254, %254, !dbg !39
+  %263 = fmul float %262, %247, !dbg !40
+  %264 = fmul float %263, %258, !dbg !41
+  %265 = fadd float %261, %264, !dbg !42
+  %266 = bitcast float %260 to i32, !dbg !30
+  %267 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %266, i32 4, i32 31), !dbg !30
+  %268 = bitcast i32 %267 to float, !dbg !30
+  %269 = bitcast float %265 to i32, !dbg !30
+  %270 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %269, i32 4, i32 31), !dbg !30
+  %271 = bitcast i32 %270 to float, !dbg !30
+  %272 = bitcast float %255 to i32, !dbg !30
+  %273 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 4, i32 31), !dbg !30
+  %274 = bitcast i32 %273 to float, !dbg !30
+  %275 = fsub float %268, %260, !dbg !29
+  %276 = fadd float %255, %274, !dbg !32
+  %277 = fcmp oeq float %276, 0.000000e+00, !dbg !33
+  %278 = tail call float @llvm.nvvm.div.full(float %274, float %276), !dbg !34
+  %279 = select i1 %277, float 0.000000e+00, float %278, !dbg !35
+  %280 = fmul float %275, %279, !dbg !36
+  %281 = fadd float %260, %280, !dbg !37
+  %282 = fadd float %265, %271, !dbg !38
+  %283 = fmul float %275, %275, !dbg !39
+  %284 = fmul float %255, %283, !dbg !40
+  %285 = fmul float %279, %284, !dbg !41
+  %286 = fadd float %282, %285, !dbg !42
+  %287 = bitcast float %281 to i32, !dbg !30
+  %288 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %287, i32 2, i32 31), !dbg !30
+  %289 = bitcast i32 %288 to float, !dbg !30
+  %290 = bitcast float %286 to i32, !dbg !30
+  %291 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %290, i32 2, i32 31), !dbg !30
+  %292 = bitcast i32 %291 to float, !dbg !30
+  %293 = bitcast float %276 to i32, !dbg !30
+  %294 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %293, i32 2, i32 31), !dbg !30
+  %295 = bitcast i32 %294 to float, !dbg !30
+  %296 = fsub float %289, %281, !dbg !29
+  %297 = fadd float %276, %295, !dbg !32
+  %298 = fcmp oeq float %297, 0.000000e+00, !dbg !33
+  %299 = tail call float @llvm.nvvm.div.full(float %295, float %297), !dbg !34
+  %300 = select i1 %298, float 0.000000e+00, float %299, !dbg !35
+  %301 = fmul float %296, %300, !dbg !36
+  %302 = fadd float %281, %301, !dbg !37
+  %303 = fadd float %286, %292, !dbg !38
+  %304 = fmul float %296, %296, !dbg !39
+  %305 = fmul float %276, %304, !dbg !40
+  %306 = fmul float %300, %305, !dbg !41
+  %307 = fadd float %303, %306, !dbg !42
+  %308 = bitcast float %302 to i32, !dbg !30
+  %309 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %308, i32 1, i32 31), !dbg !30
+  %310 = bitcast i32 %309 to float, !dbg !30
+  %311 = bitcast float %307 to i32, !dbg !30
+  %312 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %311, i32 1, i32 31), !dbg !30
+  %313 = bitcast i32 %312 to float, !dbg !30
+  %314 = bitcast float %297 to i32, !dbg !30
+  %315 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %314, i32 1, i32 31), !dbg !30
+  %316 = bitcast i32 %315 to float, !dbg !30
+  %317 = fsub float %310, %302, !dbg !29
+  %318 = fadd float %297, %316, !dbg !32
+  %319 = fcmp oeq float %318, 0.000000e+00, !dbg !33
+  %320 = tail call float @llvm.nvvm.div.full(float %316, float %318), !dbg !34
+  %321 = select i1 %319, float 0.000000e+00, float %320, !dbg !35
+  %322 = fmul float %317, %321, !dbg !36
+  %323 = fadd float %302, %322, !dbg !37
+  %324 = fadd float %307, %313, !dbg !38
+  %325 = fmul float %317, %317, !dbg !39
+  %326 = fmul float %297, %325, !dbg !40
+  %327 = fmul float %321, %326, !dbg !41
+  %328 = fadd float %324, %327, !dbg !42
+  %329 = and i32 %10, 15, !dbg !30
+  %330 = icmp eq i32 %329, 0, !dbg !30
+  %331 = and i1 %238, %330, !dbg !30
+  %332 = bitcast float %323 to <1 x i32>, !dbg !30
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %239, <1 x i32> %332, i1 %331) #6, !dbg !30
+  %333 = bitcast float %328 to <1 x i32>, !dbg !30
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %242, <1 x i32> %333, i1 %331) #6, !dbg !30
+  %334 = bitcast float %318 to <1 x i32>, !dbg !30
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %245, <1 x i32> %334, i1 %331) #6, !dbg !30
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !30
+  %335 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !30
+  %336 = load float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !30
+  %337 = tail call float @llvm.nvvm.div.full(float %336, float 4.096000e+03), !dbg !43
+  %338 = fadd float %337, 0x3EB0C6F7A0000000, !dbg !44
+  %339 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45
+  %340 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45
+  %341 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45
+  %342 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45
+  %.not.i15 = icmp eq i32 %342, 0, !dbg !45
+  br i1 %.not.i15, label %345, label %343, !dbg !45
+
+343:                                              ; preds = %__nv_rsqrtf.exit
+  %344 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %338), !dbg !45
+  br label %__nv_rsqrtf.exit17, !dbg !45
+
+345:                                              ; preds = %__nv_rsqrtf.exit
+  %346 = tail call float @llvm.nvvm.rsqrt.approx.f(float %338), !dbg !45
+  br label %__nv_rsqrtf.exit17, !dbg !45
+
+__nv_rsqrtf.exit17:                               ; preds = %343, %345
+  %.0.i16 = phi float [ %344, %343 ], [ %346, %345 ], !dbg !45
+  %347 = zext nneg i32 %12 to i64, !dbg !46
+  %348 = sext i32 %13 to i64, !dbg !46
+  %349 = getelementptr bfloat, ptr addrspace(1) %1, i64 %347, !dbg !47
+  %350 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !48
+  %351 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %349, i64 %350, i1 true) #6, !dbg !48
+  %352 = extractvalue { i32, i32 } %351, 0, !dbg !48
+  %353 = bitcast i32 %352 to <2 x bfloat>, !dbg !48
+  %354 = extractvalue { i32, i32 } %351, 1, !dbg !48
+  %355 = bitcast i32 %354 to <2 x bfloat>, !dbg !48
+  %356 = or disjoint i64 %347, %348, !dbg !49
+  %357 = getelementptr bfloat, ptr addrspace(1) %0, i64 %356, !dbg !50
+  %358 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !51
+  %359 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %357, i64 %358, i1 %9) #6, !dbg !51
+  %360 = extractvalue { i32, i32 } %359, 0, !dbg !51
+  %361 = bitcast i32 %360 to <2 x bfloat>, !dbg !51
+  %362 = extractvalue { i32, i32 } %359, 1, !dbg !51
+  %363 = bitcast i32 %362 to <2 x bfloat>, !dbg !51
+  %364 = getelementptr bfloat, ptr addrspace(1) %2, i64 %347, !dbg !52
+  %365 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !53
+  %366 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %364, i64 %365, i1 true) #6, !dbg !53
+  %367 = extractvalue { i32, i32 } %366, 0, !dbg !53
+  %368 = bitcast i32 %367 to <2 x bfloat>, !dbg !53
+  %369 = extractvalue { i32, i32 } %366, 1, !dbg !53
+  %370 = bitcast i32 %369 to <2 x bfloat>, !dbg !53
+  %371 = getelementptr bfloat, ptr addrspace(1) %3, i64 %356, !dbg !54
+  %372 = fpext <2 x bfloat> %353 to <2 x float>, !dbg !55
+  %373 = fpext <2 x bfloat> %361 to <2 x float>, !dbg !56
+  %374 = fpext <2 x bfloat> %368 to <2 x float>, !dbg !57
+  %375 = fadd <2 x float> %372, splat (float 1.000000e+00), !dbg !58
+  %376 = insertelement <2 x float> poison, float %335, i64 0, !dbg !59
+  %377 = shufflevector <2 x float> %376, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !59
+  %378 = fsub <2 x float> %373, %377, !dbg !59
+  %379 = insertelement <2 x float> poison, float %.0.i16, i64 0, !dbg !60
+  %380 = shufflevector <2 x float> %379, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !60
+  %381 = fmul <2 x float> %380, %378, !dbg !60
+  %382 = fmul <2 x float> %375, %381, !dbg !61
+  %383 = fadd <2 x float> %382, %374, !dbg !62
+  %384 = fptrunc <2 x float> %383 to <2 x bfloat>, !dbg !63
+  %385 = fpext <2 x bfloat> %355 to <2 x float>, !dbg !55
+  %386 = fpext <2 x bfloat> %363 to <2 x float>, !dbg !56
+  %387 = fpext <2 x bfloat> %370 to <2 x float>, !dbg !57
+  %388 = fadd <2 x float> %385, splat (float 1.000000e+00), !dbg !58
+  %389 = fsub <2 x float> %386, %377, !dbg !59
+  %390 = fmul <2 x float> %380, %389, !dbg !60
+  %391 = fmul <2 x float> %388, %390, !dbg !61
+  %392 = fadd <2 x float> %391, %387, !dbg !62
+  %393 = fptrunc <2 x float> %392 to <2 x bfloat>, !dbg !63
+  %394 = bitcast <2 x bfloat> %384 to i32, !dbg !63
+  %395 = bitcast <2 x bfloat> %393 to i32, !dbg !63
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %394, i32 %395, ptr addrspace(1) %371, i1 %9) #6, !dbg !63
+  %396 = or disjoint i64 %347, 2048, !dbg !64
+  %397 = getelementptr bfloat, ptr addrspace(1) %1, i64 %396, !dbg !47
+  %398 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !48
+  %399 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %397, i64 %398, i1 true) #6, !dbg !48
+  %400 = extractvalue { i32, i32 } %399, 0, !dbg !48
+  %401 = bitcast i32 %400 to <2 x bfloat>, !dbg !48
+  %402 = extractvalue { i32, i32 } %399, 1, !dbg !48
+  %403 = bitcast i32 %402 to <2 x bfloat>, !dbg !48
+  %404 = or disjoint i64 %396, %348, !dbg !49
+  %405 = getelementptr bfloat, ptr addrspace(1) %0, i64 %404, !dbg !50
+  %406 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !51
+  %407 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %405, i64 %406, i1 %9) #6, !dbg !51
+  %408 = extractvalue { i32, i32 } %407, 0, !dbg !51
+  %409 = bitcast i32 %408 to <2 x bfloat>, !dbg !51
+  %410 = extractvalue { i32, i32 } %407, 1, !dbg !51
+  %411 = bitcast i32 %410 to <2 x bfloat>, !dbg !51
+  %412 = getelementptr bfloat, ptr addrspace(1) %2, i64 %396, !dbg !52
+  %413 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !53
+  %414 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %412, i64 %413, i1 true) #6, !dbg !53
+  %415 = extractvalue { i32, i32 } %414, 0, !dbg !53
+  %416 = bitcast i32 %415 to <2 x bfloat>, !dbg !53
+  %417 = extractvalue { i32, i32 } %414, 1, !dbg !53
+  %418 = bitcast i32 %417 to <2 x bfloat>, !dbg !53
+  %419 = getelementptr bfloat, ptr addrspace(1) %3, i64 %404, !dbg !54
+  %420 = fpext <2 x bfloat> %401 to <2 x float>, !dbg !55
+  %421 = fpext <2 x bfloat> %409 to <2 x float>, !dbg !56
+  %422 = fpext <2 x bfloat> %416 to <2 x float>, !dbg !57
+  %423 = fadd <2 x float> %420, splat (float 1.000000e+00), !dbg !58
+  %424 = fsub <2 x float> %421, %377, !dbg !59
+  %425 = fmul <2 x float> %380, %424, !dbg !60
+  %426 = fmul <2 x float> %423, %425, !dbg !61
+  %427 = fadd <2 x float> %426, %422, !dbg !62
+  %428 = fptrunc <2 x float> %427 to <2 x bfloat>, !dbg !63
+  %429 = fpext <2 x bfloat> %403 to <2 x float>, !dbg !55
+  %430 = fpext <2 x bfloat> %411 to <2 x float>, !dbg !56
+  %431 = fpext <2 x bfloat> %418 to <2 x float>, !dbg !57
+  %432 = fadd <2 x float> %429, splat (float 1.000000e+00), !dbg !58
+  %433 = fsub <2 x float> %430, %377, !dbg !59
+  %434 = fmul <2 x float> %380, %433, !dbg !60
+  %435 = fmul <2 x float> %432, %434, !dbg !61
+  %436 = fadd <2 x float> %435, %431, !dbg !62
+  %437 = fptrunc <2 x float> %436 to <2 x bfloat>, !dbg !63
+  %438 = bitcast <2 x bfloat> %428 to i32, !dbg !63
+  %439 = bitcast <2 x bfloat> %437 to i32, !dbg !63
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %438, i32 %439, ptr addrspace(1) %419, i1 %9) #6, !dbg !63
+  ret void, !dbg !65
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #2
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #2
+
+attributes #0 = { nounwind "nvvm.reqntid"="512" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #4 = { convergent nocallback nounwind }
+attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!5 = distinct !DISubprogram(name: "triton_red_fused_add_mul_native_layer_norm_1", linkageName: "triton_red_fused_add_mul_native_layer_norm_1", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 23, column: 28, scope: !5)
+!9 = !DILocation(line: 25, column: 21, scope: !5)
+!10 = !DILocation(line: 26, column: 37, scope: !5)
+!11 = !DILocation(line: 38, column: 46, scope: !5)
+!12 = !DILocation(line: 32, column: 43, scope: !5)
+!13 = !DILocation(line: 38, column: 34, scope: !5)
+!14 = !DILocation(line: 38, column: 51, scope: !5)
+!15 = !DILocation(line: 38, column: 112, scope: !5)
+!16 = !DILocation(line: 44, column: 62, scope: !5)
+!17 = !DILocation(line: 222, column: 24, scope: !18, inlinedAt: !20)
+!18 = distinct !DILexicalBlockFile(scope: !5, file: !19, discriminator: 0)
+!19 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime")
+!20 = !DILocation(line: 42, column: 51, scope: !21)
+!21 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0)
+!22 = !DILocation(line: 46, column: 66, scope: !5)
+!23 = !DILocation(line: 224, column: 34, scope: !18, inlinedAt: !20)
+!24 = !DILocation(line: 224, column: 26, scope: !18, inlinedAt: !20)
+!25 = !DILocation(line: 225, column: 39, scope: !18, inlinedAt: !20)
+!26 = !DILocation(line: 225, column: 31, scope: !18, inlinedAt: !20)
+!27 = !DILocation(line: 225, column: 22, scope: !18, inlinedAt: !20)
+!28 = !DILocation(line: 45, column: 58, scope: !5)
+!29 = !DILocation(line: 231, column: 21, scope: !18, inlinedAt: !30)
+!30 = !DILocation(line: 243, column: 46, scope: !18, inlinedAt: !31)
+!31 = !DILocation(line: 47, column: 79, scope: !21)
+!32 = !DILocation(line: 232, column: 28, scope: !18, inlinedAt: !30)
+!33 = !DILocation(line: 233, column: 39, scope: !18, inlinedAt: !30)
+!34 = !DILocation(line: 233, column: 60, scope: !18, inlinedAt: !30)
+!35 = !DILocation(line: 233, column: 49, scope: !18, inlinedAt: !30)
+!36 = !DILocation(line: 235, column: 25, scope: !18, inlinedAt: !30)
+!37 = !DILocation(line: 235, column: 17, scope: !18, inlinedAt: !30)
+!38 = !DILocation(line: 236, column: 15, scope: !18, inlinedAt: !30)
+!39 = !DILocation(line: 236, column: 30, scope: !18, inlinedAt: !30)
+!40 = !DILocation(line: 236, column: 38, scope: !18, inlinedAt: !30)
+!41 = !DILocation(line: 236, column: 49, scope: !18, inlinedAt: !30)
+!42 = !DILocation(line: 236, column: 22, scope: !18, inlinedAt: !30)
+!43 = !DILocation(line: 65, column: 24, scope: !5)
+!44 = !DILocation(line: 67, column: 24, scope: !5)
+!45 = !DILocation(line: 68, column: 32, scope: !5)
+!46 = !DILocation(line: 51, column: 43, scope: !5)
+!47 = !DILocation(line: 57, column: 34, scope: !5)
+!48 = !DILocation(line: 57, column: 41, scope: !5)
+!49 = !DILocation(line: 58, column: 42, scope: !5)
+!50 = !DILocation(line: 58, column: 35, scope: !5)
+!51 = !DILocation(line: 58, column: 52, scope: !5)
+!52 = !DILocation(line: 59, column: 35, scope: !5)
+!53 = !DILocation(line: 59, column: 42, scope: !5)
+!54 = !DILocation(line: 73, column: 29, scope: !5)
+!55 = !DILocation(line: 57, column: 94, scope: !5)
+!56 = !DILocation(line: 58, column: 114, scope: !5)
+!57 = !DILocation(line: 59, column: 95, scope: !5)
+!58 = !DILocation(line: 61, column: 23, scope: !5)
+!59 = !DILocation(line: 63, column: 24, scope: !5)
+!60 = !DILocation(line: 69, column: 24, scope: !5)
+!61 = !DILocation(line: 71, column: 24, scope: !5)
+!62 = !DILocation(line: 72, column: 24, scope: !5)
+!63 = !DILocation(line: 73, column: 53, scope: !5)
+!64 = !DILocation(line: 52, column: 31, scope: !5)
+!65 = !DILocation(line: 51, column: 4, scope: !5)
diff --git a/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.ptx b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..6c2edda7459d25df6b9e0e416bf2f6dea1092073
--- /dev/null
+++ b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.ptx
@@ -0,0 +1,1089 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused_add_mul_native_layer_norm_1 // -- Begin function triton_red_fused_add_mul_native_layer_norm_1
+.extern .shared .align 16 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
+                                        // @triton_red_fused_add_mul_native_layer_norm_1
+.visible .entry triton_red_fused_add_mul_native_layer_norm_1(
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_1_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_1_param_1,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_1_param_2,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_1_param_3,
+	.param .u32 triton_red_fused_add_mul_native_layer_norm_1_param_4,
+	.param .u32 triton_red_fused_add_mul_native_layer_norm_1_param_5,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_1_param_6,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_1_param_7
+)
+.reqntid 512
+{
+	.reg .pred 	%p<19>;
+	.reg .b16 	%rs<33>;
+	.reg .b32 	%r<282>;
+	.reg .b64 	%rd<28>;
+	.loc	1 18 0                          // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:18:0
+
+// %bb.0:                               // %__nv_rsqrtf.exit
+	ld.param.b64 	%rd19, [triton_red_fused_add_mul_native_layer_norm_1_param_0];
+	ld.param.b64 	%rd20, [triton_red_fused_add_mul_native_layer_norm_1_param_1];
+$L__tmp0:
+	.loc	1 23 28                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:23:28
+	mov.u32 	%r37, %ctaid.x;
+	.loc	1 25 21                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:25:21
+	setp.lt.u32 	%p1, %r37, 256;
+	ld.param.b64 	%rd21, [triton_red_fused_add_mul_native_layer_norm_1_param_2];
+	ld.param.b64 	%rd22, [triton_red_fused_add_mul_native_layer_norm_1_param_3];
+	.loc	1 26 37                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:26:37
+	mov.u32 	%r38, %tid.x;
+	shl.b32 	%r39, %r38, 2;
+	and.b32 	%r40, %r39, 2044;
+	.loc	1 38 46                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:46
+	shl.b32 	%r41, %r37, 12;
+	or.b32 	%r42, %r40, %r41;
+	.loc	1 38 34                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:34
+	mad.wide.s32 	%rd1, %r42, 2, %rd19;
+	.loc	1 38 51                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:51
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r3, 0;
+	// begin inline asm
+	mov.u32 %r1, %r3;
+	mov.u32 %r2, %r3;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	mov.b32 	{%rs1, %rs2}, %r2;
+	.loc	1 38 112                        // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:112
+	cvt.f32.bf16 	%r43, %rs2;
+	cvt.f32.bf16 	%r44, %rs1;
+	.loc	1 38 51                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:51
+	mov.b32 	{%rs3, %rs4}, %r1;
+	.loc	1 38 112                        // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:112
+	cvt.f32.bf16 	%r45, %rs4;
+	cvt.f32.bf16 	%r46, %rs3;
+	.loc	1 44 62                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:44:62
+	selp.f32 	%r47, %r46, 0f00000000, %p1;
+	selp.f32 	%r48, %r45, 0f00000000, %p1;
+	selp.f32 	%r49, %r44, 0f00000000, %p1;
+	selp.f32 	%r50, %r43, 0f00000000, %p1;
+	.loc	1 38 34                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:34
+	add.s64 	%rd3, %rd1, 4096;
+	.loc	1 38 51                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:51
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r4, %r3;
+	mov.u32 %r5, %r3;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r4, %r5 }, [ %rd3 + 0 ], %rd4;
+	// end inline asm
+	mov.b32 	{%rs5, %rs6}, %r4;
+	.loc	1 38 112                        // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:112
+	cvt.f32.bf16 	%r51, %rs5;
+$L__tmp1:
+	.loc	2 222 24                        // triton_helpers.py:222:24 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ]
+	sub.f32 	%r52, %r51, %r47;
+$L__tmp2:
+	.loc	1 46 66                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:46:66
+	selp.f32 	%r53, 0f40000000, 0f3F800000, %p1;
+$L__tmp3:
+	.loc	2 224 34                        // triton_helpers.py:224:34 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ]
+	div.full.f32 	%r54, %r52, %r53;
+	.loc	2 224 26                        // triton_helpers.py:224:26 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ]
+	add.f32 	%r55, %r47, %r54;
+	.loc	2 225 39                        // triton_helpers.py:225:39 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ]
+	sub.f32 	%r56, %r51, %r55;
+	.loc	2 225 22                        // triton_helpers.py:225:22 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ]
+	fma.rn.f32 	%r57, %r52, %r56, 0f00000000;
+$L__tmp4:
+	.loc	1 38 112                        // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:112
+	cvt.f32.bf16 	%r58, %rs6;
+$L__tmp5:
+	.loc	2 222 24                        // triton_helpers.py:222:24 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ]
+	sub.f32 	%r59, %r58, %r48;
+	.loc	2 224 34                        // triton_helpers.py:224:34 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ]
+	div.full.f32 	%r60, %r59, %r53;
+	.loc	2 224 26                        // triton_helpers.py:224:26 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ]
+	add.f32 	%r61, %r48, %r60;
+	.loc	2 225 39                        // triton_helpers.py:225:39 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ]
+	sub.f32 	%r62, %r58, %r61;
+	.loc	2 225 22                        // triton_helpers.py:225:22 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ]
+	fma.rn.f32 	%r63, %r59, %r62, 0f00000000;
+$L__tmp6:
+	.loc	1 38 51                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:51
+	mov.b32 	{%rs7, %rs8}, %r5;
+	.loc	1 38 112                        // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:112
+	cvt.f32.bf16 	%r64, %rs7;
+$L__tmp7:
+	.loc	2 222 24                        // triton_helpers.py:222:24 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ]
+	sub.f32 	%r65, %r64, %r49;
+	.loc	2 224 34                        // triton_helpers.py:224:34 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ]
+	div.full.f32 	%r66, %r65, %r53;
+	.loc	2 224 26                        // triton_helpers.py:224:26 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ]
+	add.f32 	%r67, %r49, %r66;
+	.loc	2 225 39                        // triton_helpers.py:225:39 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ]
+	sub.f32 	%r68, %r64, %r67;
+	.loc	2 225 22                        // triton_helpers.py:225:22 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ]
+	fma.rn.f32 	%r69, %r65, %r68, 0f00000000;
+$L__tmp8:
+	.loc	1 38 112                        // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:112
+	cvt.f32.bf16 	%r70, %rs8;
+$L__tmp9:
+	.loc	2 222 24                        // triton_helpers.py:222:24 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ]
+	sub.f32 	%r71, %r70, %r50;
+	.loc	2 224 34                        // triton_helpers.py:224:34 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ]
+	div.full.f32 	%r72, %r71, %r53;
+	.loc	2 224 26                        // triton_helpers.py:224:26 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ]
+	add.f32 	%r73, %r50, %r72;
+	.loc	2 225 39                        // triton_helpers.py:225:39 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ]
+	sub.f32 	%r74, %r70, %r73;
+	.loc	2 225 22                        // triton_helpers.py:225:22 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:42:51 ]
+	fma.rn.f32 	%r75, %r71, %r74, 0f00000000;
+$L__tmp10:
+	.loc	1 44 62                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:44:62
+	selp.f32 	%r76, %r55, 0f00000000, %p1;
+	selp.f32 	%r77, %r61, 0f00000000, %p1;
+	selp.f32 	%r78, %r67, 0f00000000, %p1;
+	selp.f32 	%r79, %r73, 0f00000000, %p1;
+	.loc	1 45 58                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:45:58
+	selp.f32 	%r80, %r69, 0f00000000, %p1;
+	selp.f32 	%r81, %r75, 0f00000000, %p1;
+	.loc	1 46 66                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:46:66
+	selp.f32 	%r82, 0f40000000, 0f00000000, %p1;
+	.loc	1 26 37                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:26:37
+	and.b32 	%r83, %r38, 511;
+	and.b32 	%r84, %r38, 31;
+$L__tmp11:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r85, %r77, %r76;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r86, 0f40800000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p6, %r86, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r87, %r82, %r86;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r88, 0f00000000, %r87, %p6;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r89, %r85, %r88, %r76;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r90, %r57, %r63;
+	selp.f32 	%r91, %r90, 0f00000000, %p1;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r92, %r85, %r85;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r93, %r92, %r82;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r94, %r93, %r88, %r91;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r95, %r78, %r89;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r96, 0f40C00000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p7, %r96, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r97, %r82, %r96;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r98, 0f00000000, %r97, %p7;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r99, %r98, %r95, %r89;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r100, %r80, %r94;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r101, %r95, %r95;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r102, %r86, %r101;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r103, %r98, %r102, %r100;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r104, %r79, %r99;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r105, 0f41000000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p8, %r105, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r106, %r82, %r105;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r107, 0f00000000, %r106, %p8;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r108, %r107, %r104, %r99;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r109, %r81, %r103;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r110, %r104, %r104;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r111, %r96, %r110;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r112, %r107, %r111, %r109;
+$L__tmp12:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ]
+	shfl.sync.bfly.b32 	%r113, %r108, 16, 31, -1;
+	shfl.sync.bfly.b32 	%r114, %r112, 16, 31, -1;
+	shfl.sync.bfly.b32 	%r115, %r105, 16, 31, -1;
+$L__tmp13:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r116, %r113, %r108;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r117, %r105, %r115;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p9, %r117, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r118, %r115, %r117;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r119, 0f00000000, %r118, %p9;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r120, %r119, %r116, %r108;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r121, %r112, %r114;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r122, %r116, %r116;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r123, %r105, %r122;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r124, %r119, %r123, %r121;
+$L__tmp14:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ]
+	shfl.sync.bfly.b32 	%r125, %r120, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r126, %r124, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r127, %r117, 8, 31, -1;
+$L__tmp15:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r128, %r125, %r120;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r129, %r117, %r127;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p10, %r129, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r130, %r127, %r129;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r131, 0f00000000, %r130, %p10;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r132, %r128, %r131, %r120;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r133, %r124, %r126;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r134, %r128, %r128;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r135, %r117, %r134;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r136, %r131, %r135, %r133;
+$L__tmp16:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ]
+	shfl.sync.bfly.b32 	%r137, %r132, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r138, %r136, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r139, %r129, 4, 31, -1;
+$L__tmp17:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r140, %r137, %r132;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r141, %r129, %r139;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p11, %r141, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r142, %r139, %r141;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r143, 0f00000000, %r142, %p11;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r144, %r140, %r143, %r132;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r145, %r136, %r138;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r146, %r140, %r140;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r147, %r129, %r146;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r148, %r143, %r147, %r145;
+$L__tmp18:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ]
+	shfl.sync.bfly.b32 	%r149, %r144, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r150, %r148, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r151, %r141, 2, 31, -1;
+$L__tmp19:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r152, %r149, %r144;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r153, %r141, %r151;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p12, %r153, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r154, %r151, %r153;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r155, 0f00000000, %r154, %p12;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r156, %r152, %r155, %r144;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r157, %r148, %r150;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r158, %r152, %r152;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r159, %r141, %r158;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r160, %r155, %r159, %r157;
+$L__tmp20:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ]
+	shfl.sync.bfly.b32 	%r161, %r156, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r162, %r160, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r163, %r153, 1, 31, -1;
+$L__tmp21:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r164, %r161, %r156;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r11, %r153, %r163;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p13, %r11, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r165, %r163, %r11;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r166, 0f00000000, %r165, %p13;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r7, %r164, %r166, %r156;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r167, %r160, %r162;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r168, %r164, %r164;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r169, %r153, %r168;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r9, %r166, %r169, %r167;
+$L__tmp22:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ]
+	setp.eq.b32 	%p2, %r84, 0;
+	shr.u32 	%r170, %r38, 3;
+	and.b32 	%r171, %r170, 60;
+	mov.b32 	%r172, global_smem;
+	add.s32 	%r6, %r172, %r171;
+	// begin inline asm
+	@%p2 st.shared.b32 [ %r6 + 0 ], %r7;
+	// end inline asm
+	add.s32 	%r8, %r6, 64;
+	// begin inline asm
+	@%p2 st.shared.b32 [ %r8 + 0 ], %r9;
+	// end inline asm
+	add.s32 	%r10, %r6, 128;
+	// begin inline asm
+	@%p2 st.shared.b32 [ %r10 + 0 ], %r11;
+	// end inline asm
+	bar.sync 	0;
+	setp.lt.u32 	%p3, %r83, 16;
+	shl.b32 	%r173, %r83, 2;
+	add.s32 	%r13, %r172, %r173;
+	// begin inline asm
+	@%p3 ld.shared.b32 %r12, [ %r13 + 0 ];
+	// end inline asm
+	add.s32 	%r15, %r13, 64;
+	// begin inline asm
+	@%p3 ld.shared.b32 %r14, [ %r15 + 0 ];
+	// end inline asm
+	add.s32 	%r17, %r13, 128;
+	// begin inline asm
+	@%p3 ld.shared.b32 %r16, [ %r17 + 0 ];
+	// end inline asm
+	shfl.sync.bfly.b32 	%r174, %r12, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r175, %r14, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r176, %r16, 8, 31, -1;
+$L__tmp23:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r177, %r174, %r12;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r178, %r16, %r176;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p14, %r178, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r179, %r176, %r178;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r180, 0f00000000, %r179, %p14;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r181, %r177, %r180, %r12;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r182, %r14, %r175;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r183, %r177, %r177;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r184, %r183, %r16;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r185, %r184, %r180, %r182;
+$L__tmp24:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ]
+	shfl.sync.bfly.b32 	%r186, %r181, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r187, %r185, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r188, %r178, 4, 31, -1;
+$L__tmp25:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r189, %r186, %r181;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r190, %r178, %r188;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p15, %r190, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r191, %r188, %r190;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r192, 0f00000000, %r191, %p15;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r193, %r189, %r192, %r181;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r194, %r185, %r187;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r195, %r189, %r189;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r196, %r178, %r195;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r197, %r192, %r196, %r194;
+$L__tmp26:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ]
+	shfl.sync.bfly.b32 	%r198, %r193, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r199, %r197, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r200, %r190, 2, 31, -1;
+$L__tmp27:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r201, %r198, %r193;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r202, %r190, %r200;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p16, %r202, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r203, %r200, %r202;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r204, 0f00000000, %r203, %p16;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r205, %r201, %r204, %r193;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r206, %r197, %r199;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r207, %r201, %r201;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r208, %r190, %r207;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r209, %r204, %r208, %r206;
+$L__tmp28:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ]
+	shfl.sync.bfly.b32 	%r210, %r205, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r211, %r209, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r212, %r202, 1, 31, -1;
+$L__tmp29:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r213, %r210, %r205;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r20, %r202, %r212;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p17, %r20, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r214, %r212, %r20;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r215, 0f00000000, %r214, %p17;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r18, %r213, %r215, %r205;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r216, %r209, %r211;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r217, %r213, %r213;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r218, %r202, %r217;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r19, %r215, %r218, %r216;
+$L__tmp30:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ]
+	and.b32 	%r219, %r38, 15;
+	setp.eq.b32 	%p18, %r219, 0;
+	and.pred 	%p4, %p3, %p18;
+	// begin inline asm
+	@%p4 st.shared.b32 [ %r13 + 0 ], %r18;
+	// end inline asm
+	// begin inline asm
+	@%p4 st.shared.b32 [ %r15 + 0 ], %r19;
+	// end inline asm
+	// begin inline asm
+	@%p4 st.shared.b32 [ %r17 + 0 ], %r20;
+	// end inline asm
+	bar.sync 	0;
+	ld.shared.b32 	%r220, [global_smem];
+	ld.shared.b32 	%r221, [global_smem+64];
+	mov.b32 	%r222, 0f45800000;
+$L__tmp31:
+	.loc	1 65 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:65:24
+	div.full.f32 	%r223, %r221, %r222;
+	.loc	1 67 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:67:24
+	add.f32 	%r224, %r223, 0f358637BD;
+	.loc	1 68 32                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:68:32
+	rsqrt.approx.ftz.f32 	%r225, %r224;
+	.loc	1 51 43                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:51:43
+	cvt.u64.u32 	%rd23, %r40;
+	cvt.s64.s32 	%rd24, %r41;
+	.loc	1 57 34                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:34
+	mul.wide.u32 	%rd25, %r40, 2;
+	add.s64 	%rd5, %rd20, %rd25;
+	.loc	1 57 41                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:41
+	// begin inline asm
+	mov.u64 %rd6, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0;
+	// end inline asm
+	mov.pred 	%p5, -1;
+	// begin inline asm
+	mov.u32 %r21, %r3;
+	mov.u32 %r22, %r3;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r21, %r22 }, [ %rd5 + 0 ], %rd6;
+	// end inline asm
+	.loc	1 58 42                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:42
+	or.b64 	%rd26, %rd23, %rd24;
+	.loc	1 58 35                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:35
+	shl.b64 	%rd27, %rd26, 1;
+	add.s64 	%rd7, %rd19, %rd27;
+	.loc	1 58 52                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:52
+	// begin inline asm
+	mov.u64 %rd8, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd8, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r23, %r3;
+	mov.u32 %r24, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r23, %r24 }, [ %rd7 + 0 ], %rd8;
+	// end inline asm
+	.loc	1 59 35                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:35
+	add.s64 	%rd9, %rd21, %rd25;
+	.loc	1 59 42                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:42
+	// begin inline asm
+	mov.u64 %rd10, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd10, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r25, %r3;
+	mov.u32 %r26, %r3;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r25, %r26 }, [ %rd9 + 0 ], %rd10;
+	// end inline asm
+	.loc	1 73 29                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:73:29
+	add.s64 	%rd11, %rd22, %rd27;
+	.loc	1 57 94                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:94
+	mov.b32 	{%rs9, %rs10}, %r21;
+	cvt.f32.bf16 	%r226, %rs9;
+	cvt.f32.bf16 	%r227, %rs10;
+	.loc	1 58 114                        // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:114
+	mov.b32 	{%rs11, %rs12}, %r23;
+	cvt.f32.bf16 	%r228, %rs12;
+	cvt.f32.bf16 	%r229, %rs11;
+	.loc	1 59 95                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:95
+	mov.b32 	{%rs13, %rs14}, %r25;
+	cvt.f32.bf16 	%r230, %rs14;
+	cvt.f32.bf16 	%r231, %rs13;
+	.loc	1 61 23                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:61:23
+	add.f32 	%r232, %r227, 0f3F800000;
+	add.f32 	%r233, %r226, 0f3F800000;
+	.loc	1 63 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:63:24
+	sub.f32 	%r234, %r229, %r220;
+	sub.f32 	%r235, %r228, %r220;
+	.loc	1 69 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:69:24
+	mul.f32 	%r236, %r225, %r235;
+	mul.f32 	%r237, %r225, %r234;
+	.loc	1 72 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:72:24
+	fma.rn.f32 	%r238, %r233, %r237, %r231;
+	fma.rn.f32 	%r239, %r232, %r236, %r230;
+	.loc	1 73 53                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:73:53
+	cvt.rn.bf16x2.f32 	%r27, %r239, %r238;
+	.loc	1 57 94                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:94
+	mov.b32 	{%rs15, %rs16}, %r22;
+	cvt.f32.bf16 	%r240, %rs15;
+	cvt.f32.bf16 	%r241, %rs16;
+	.loc	1 58 114                        // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:114
+	mov.b32 	{%rs17, %rs18}, %r24;
+	cvt.f32.bf16 	%r242, %rs18;
+	cvt.f32.bf16 	%r243, %rs17;
+	.loc	1 59 95                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:95
+	mov.b32 	{%rs19, %rs20}, %r26;
+	cvt.f32.bf16 	%r244, %rs20;
+	cvt.f32.bf16 	%r245, %rs19;
+	.loc	1 61 23                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:61:23
+	add.f32 	%r246, %r241, 0f3F800000;
+	add.f32 	%r247, %r240, 0f3F800000;
+	.loc	1 63 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:63:24
+	sub.f32 	%r248, %r243, %r220;
+	sub.f32 	%r249, %r242, %r220;
+	.loc	1 69 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:69:24
+	mul.f32 	%r250, %r225, %r249;
+	mul.f32 	%r251, %r225, %r248;
+	.loc	1 72 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:72:24
+	fma.rn.f32 	%r252, %r247, %r251, %r245;
+	fma.rn.f32 	%r253, %r246, %r250, %r244;
+	.loc	1 73 53                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:73:53
+	cvt.rn.bf16x2.f32 	%r28, %r253, %r252;
+	// begin inline asm
+	@%p1 st.global.v2.b32 [ %rd11 + 0 ], { %r27, %r28 };
+	// end inline asm
+	.loc	1 57 34                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:34
+	add.s64 	%rd12, %rd5, 4096;
+	.loc	1 57 41                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:41
+	// begin inline asm
+	mov.u64 %rd13, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd13, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r29, %r3;
+	mov.u32 %r30, %r3;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r29, %r30 }, [ %rd12 + 0 ], %rd13;
+	// end inline asm
+	.loc	1 58 35                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:35
+	add.s64 	%rd14, %rd7, 4096;
+	.loc	1 58 52                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:52
+	// begin inline asm
+	mov.u64 %rd15, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd15, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r31, %r3;
+	mov.u32 %r32, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r31, %r32 }, [ %rd14 + 0 ], %rd15;
+	// end inline asm
+	.loc	1 59 35                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:35
+	add.s64 	%rd16, %rd9, 4096;
+	.loc	1 59 42                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:42
+	// begin inline asm
+	mov.u64 %rd17, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r33, %r3;
+	mov.u32 %r34, %r3;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r33, %r34 }, [ %rd16 + 0 ], %rd17;
+	// end inline asm
+	.loc	1 73 29                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:73:29
+	add.s64 	%rd18, %rd11, 4096;
+	.loc	1 57 94                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:94
+	mov.b32 	{%rs21, %rs22}, %r29;
+	cvt.f32.bf16 	%r254, %rs21;
+	cvt.f32.bf16 	%r255, %rs22;
+	.loc	1 58 114                        // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:114
+	mov.b32 	{%rs23, %rs24}, %r31;
+	cvt.f32.bf16 	%r256, %rs24;
+	cvt.f32.bf16 	%r257, %rs23;
+	.loc	1 59 95                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:95
+	mov.b32 	{%rs25, %rs26}, %r33;
+	cvt.f32.bf16 	%r258, %rs26;
+	cvt.f32.bf16 	%r259, %rs25;
+	.loc	1 61 23                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:61:23
+	add.f32 	%r260, %r255, 0f3F800000;
+	add.f32 	%r261, %r254, 0f3F800000;
+	.loc	1 63 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:63:24
+	sub.f32 	%r262, %r257, %r220;
+	sub.f32 	%r263, %r256, %r220;
+	.loc	1 69 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:69:24
+	mul.f32 	%r264, %r225, %r263;
+	mul.f32 	%r265, %r225, %r262;
+	.loc	1 72 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:72:24
+	fma.rn.f32 	%r266, %r261, %r265, %r259;
+	fma.rn.f32 	%r267, %r260, %r264, %r258;
+	.loc	1 73 53                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:73:53
+	cvt.rn.bf16x2.f32 	%r35, %r267, %r266;
+	.loc	1 57 94                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:94
+	mov.b32 	{%rs27, %rs28}, %r30;
+	cvt.f32.bf16 	%r268, %rs27;
+	cvt.f32.bf16 	%r269, %rs28;
+	.loc	1 58 114                        // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:114
+	mov.b32 	{%rs29, %rs30}, %r32;
+	cvt.f32.bf16 	%r270, %rs30;
+	cvt.f32.bf16 	%r271, %rs29;
+	.loc	1 59 95                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:95
+	mov.b32 	{%rs31, %rs32}, %r34;
+	cvt.f32.bf16 	%r272, %rs32;
+	cvt.f32.bf16 	%r273, %rs31;
+	.loc	1 61 23                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:61:23
+	add.f32 	%r274, %r269, 0f3F800000;
+	add.f32 	%r275, %r268, 0f3F800000;
+	.loc	1 63 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:63:24
+	sub.f32 	%r276, %r271, %r220;
+	sub.f32 	%r277, %r270, %r220;
+	.loc	1 69 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:69:24
+	mul.f32 	%r278, %r225, %r277;
+	mul.f32 	%r279, %r225, %r276;
+	.loc	1 72 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:72:24
+	fma.rn.f32 	%r280, %r275, %r279, %r273;
+	fma.rn.f32 	%r281, %r274, %r278, %r272;
+	.loc	1 73 53                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:73:53
+	cvt.rn.bf16x2.f32 	%r36, %r281, %r280;
+	// begin inline asm
+	@%p1 st.global.v2.b32 [ %rd18 + 0 ], { %r35, %r36 };
+	// end inline asm
+	.loc	1 51 4                          // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:51:4
+	ret;
+$L__tmp32:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 367                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x168 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 97
+.b8 118
+.b8 111
+.b8 97
+.b8 122
+.b8 54
+.b8 101
+.b8 55
+.b8 107
+.b8 98
+.b8 107
+.b8 53
+.b8 119
+.b8 113
+.b8 50
+.b8 110
+.b8 55
+.b8 118
+.b8 122
+.b8 54
+.b8 114
+.b8 120
+.b8 104
+.b8 99
+.b8 114
+.b8 119
+.b8 100
+.b8 117
+.b8 50
+.b8 116
+.b8 114
+.b8 97
+.b8 122
+.b8 101
+.b8 120
+.b8 117
+.b8 98
+.b8 100
+.b8 113
+.b8 53
+.b8 113
+.b8 119
+.b8 121
+.b8 118
+.b8 50
+.b8 97
+.b8 106
+.b8 109
+.b8 98
+.b8 107
+.b8 122
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 97
+.b8 118
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x2f DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 97
+.b8 100
+.b8 100
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 110
+.b8 97
+.b8 116
+.b8 105
+.b8 118
+.b8 101
+.b8 95
+.b8 108
+.b8 97
+.b8 121
+.b8 101
+.b8 114
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 49
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x113:0x5f DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x128:0x18 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp10                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 42                                  // DW_AT_call_line
+.b8 51                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x140:0x31 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp11                          // DW_AT_low_pc
+.b64 $L__tmp31                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 47                                  // DW_AT_call_line
+.b8 79                                  // DW_AT_call_column
+.b8 4                                   // Abbrev [4] 0x158:0x18 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp11                          // DW_AT_low_pc
+.b64 $L__tmp30                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 243                                 // DW_AT_call_line
+.b8 46                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.source b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.source
new file mode 100644
index 0000000000000000000000000000000000000000..d1abbaf09e8deef29466e387f45a33e0f7e1eb3b
--- /dev/null
+++ b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.source
@@ -0,0 +1,420 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":18:0)
+#loc72 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":216:0)
+#loc85 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":133:0)
+#loc89 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":242:0)
+#loc91 = loc(unknown)
+#loc94 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":230:0)
+#loc109 = loc("in_ptr0"(#loc))
+#loc110 = loc("in_ptr1"(#loc))
+#loc111 = loc("in_ptr2"(#loc))
+#loc112 = loc("out_ptr2"(#loc))
+#loc113 = loc("xnumel"(#loc))
+#loc114 = loc("r0_numel"(#loc))
+#loc171 = loc("value"(#loc72))
+#loc172 = loc("mean"(#loc72))
+#loc173 = loc("m2"(#loc72))
+#loc174 = loc("weight"(#loc72))
+#loc175 = loc("first_iteration"(#loc72))
+#loc185 = loc("input"(#loc85))
+#loc186 = loc("mean"(#loc89))
+#loc187 = loc("m2"(#loc89))
+#loc188 = loc("weight"(#loc89))
+#loc189 = loc("mean_1"(#loc94))
+#loc190 = loc("m2_1"(#loc94))
+#loc191 = loc("weight_1"(#loc94))
+#loc192 = loc("mean_2"(#loc94))
+#loc193 = loc("m2_2"(#loc94))
+#loc194 = loc("weight_2"(#loc94))
+#loc201 = loc("new_mean"(#loc171))
+module {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 256 : i32 loc(#loc115)
+    %r0_numel_1 = arith.constant 4096 : i32 loc(#loc116)
+    %xoffset = tt.get_program_id x : i32 loc(#loc117)
+    %xoffset_2 = arith.constant 1 : i32 loc(#loc118)
+    %xoffset_3 = arith.constant 1 : i32 loc(#loc118)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc118)
+    %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc119)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc120)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc121)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc121)
+    %xmask = arith.constant dense<256> : tensor<1x1xi32> loc(#loc122)
+    %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc122)
+    %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc123)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc124)
+    %tmp3_mean = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc125)
+    %tmp3_m2 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc126)
+    %tmp3_weight = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc127)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc14)
+    %c2048_i32 = arith.constant 2048 : i32 loc(#loc14)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14)
+    %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc14)
+    %3 = ub.poison : i32 loc(#loc14)
+    %tmp3_weight_10:3 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%tmp3_mean_13 = %tmp3_mean, %tmp3_m2_14 = %tmp3_m2, %tmp3_weight_15 = %tmp3_weight) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc129)
+      %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc129)
+      %r0_mask = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc130)
+      %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x2048xi32> loc(#loc130)
+      %tmp0 = arith.constant 4096 : i32 loc(#loc131)
+      %tmp0_18 = arith.constant 4096 : i32 loc(#loc131)
+      %tmp0_19 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc131)
+      %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc131)
+      %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc132)
+      %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x2048xi32> loc(#loc132)
+      %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc133)
+      %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc133)
+      %tmp0_25 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc134)
+      %tmp0_26 = arith.andi %r0_mask_17, %tmp0_25 : tensor<1x2048xi1> loc(#loc134)
+      %tmp0_27 = arith.constant 0.000000e+00 : f32 loc(#loc135)
+      %tmp0_28 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc135)
+      %tmp0_29 = arith.truncf %tmp0_28 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc135)
+      %tmp0_30 = tt.load %tmp0_24, %tmp0_26, %tmp0_29 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc135)
+      %tmp0_31 = arith.extf %tmp0_30 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc136)
+      %c0_i32_32 = arith.constant 0 : i32 loc(#loc23)
+      %9 = arith.cmpi eq, %r0_offset, %c0_i32_32 : i32 loc(#loc23)
+      %10:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_u1__(%tmp0_31, %tmp3_mean_13, %tmp3_m2_14, %tmp3_weight_15, %9) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>, i1) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) loc(#loc24)
+      %tmp3_mean_33 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc137)
+      %tmp3_mean_34 = arith.andi %r0_mask_17, %tmp3_mean_33 : tensor<1x2048xi1> loc(#loc137)
+      %tmp3_mean_35 = arith.select %tmp3_mean_34, %10#0, %tmp3_mean_13 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc138)
+      %tmp3_m2_36 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc139)
+      %tmp3_m2_37 = arith.andi %r0_mask_17, %tmp3_m2_36 : tensor<1x2048xi1> loc(#loc139)
+      %tmp3_m2_38 = arith.select %tmp3_m2_37, %10#1, %tmp3_m2_14 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc140)
+      %tmp3_weight_39 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc141)
+      %tmp3_weight_40 = arith.andi %r0_mask_17, %tmp3_weight_39 : tensor<1x2048xi1> loc(#loc141)
+      %tmp3_weight_41 = arith.select %tmp3_weight_40, %10#2, %tmp3_weight_15 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc142)
+      scf.yield %tmp3_mean_35, %tmp3_m2_38, %tmp3_weight_41 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc31)
+    } loc(#loc207)
+    %4:3 = tt.call @"torch._inductor.runtime.triton_helpers.welford__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_1_"(%tmp3_weight_10#0, %tmp3_weight_10#1, %tmp3_weight_10#2) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc32)
+    %tmp3 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc143)
+    %tmp7 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc144)
+    %tmp8 = tt.expand_dims %4#2 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc145)
+    %c0_i32_11 = arith.constant 0 : i32 loc(#loc36)
+    %c2048_i32_12 = arith.constant 2048 : i32 loc(#loc36)
+    %5 = arith.bitcast %c0_i32_11 : i32 to i32 loc(#loc36)
+    %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc36)
+    %7 = arith.bitcast %c2048_i32_12 : i32 to i32 loc(#loc36)
+    %8 = ub.poison : i32 loc(#loc36)
+    scf.for %r0_offset = %5 to %6 step %7  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc146)
+      %r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc146)
+      %r0_mask = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc147)
+      %r0_mask_14 = arith.cmpi slt, %r0_index_13, %r0_mask : tensor<1x2048xi32> loc(#loc147)
+      %tmp9 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc148)
+      %tmp9_15 = tt.addptr %tmp9, %r0_index_13 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc148)
+      %tmp9_16 = arith.constant 0.000000e+00 : f32 loc(#loc149)
+      %tmp9_17 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc149)
+      %tmp9_18 = arith.truncf %tmp9_17 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc149)
+      %tmp9_19 = tt.load %tmp9_15, %r0_mask_14, %tmp9_18 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc149)
+      %tmp9_20 = arith.extf %tmp9_19 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc150)
+      %tmp12 = arith.constant 4096 : i32 loc(#loc151)
+      %tmp12_21 = arith.constant 4096 : i32 loc(#loc151)
+      %tmp12_22 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc151)
+      %tmp12_23 = arith.muli %tmp12_22, %xindex_7 : tensor<1x1xi32> loc(#loc151)
+      %tmp12_24 = tt.broadcast %tmp12_23 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc152)
+      %tmp12_25 = arith.addi %r0_index_13, %tmp12_24 : tensor<1x2048xi32> loc(#loc152)
+      %tmp12_26 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc153)
+      %tmp12_27 = tt.addptr %tmp12_26, %tmp12_25 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc153)
+      %tmp12_28 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc154)
+      %tmp12_29 = arith.andi %r0_mask_14, %tmp12_28 : tensor<1x2048xi1> loc(#loc154)
+      %tmp12_30 = arith.constant 0.000000e+00 : f32 loc(#loc155)
+      %tmp12_31 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc155)
+      %tmp12_32 = arith.truncf %tmp12_31 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc155)
+      %tmp12_33 = tt.load %tmp12_27, %tmp12_29, %tmp12_32 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>> loc(#loc155)
+      %tmp12_34 = arith.extf %tmp12_33 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc156)
+      %tmp23 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc157)
+      %tmp23_35 = tt.addptr %tmp23, %r0_index_13 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc157)
+      %tmp23_36 = arith.constant 0.000000e+00 : f32 loc(#loc158)
+      %tmp23_37 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc158)
+      %tmp23_38 = arith.truncf %tmp23_37 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc158)
+      %tmp23_39 = tt.load %tmp23_35, %r0_mask_14, %tmp23_38 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc158)
+      %tmp23_40 = arith.extf %tmp23_39 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc159)
+      %tmp10 = arith.constant 1.000000e+00 : f32 loc(#loc160)
+      %tmp11 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc161)
+      %tmp11_41 = arith.addf %tmp9_20, %tmp11 : tensor<1x2048xf32> loc(#loc161)
+      %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc162)
+      %tmp14_42 = arith.subf %tmp12_34, %tmp14 : tensor<1x2048xf32> loc(#loc162)
+      %tmp15 = arith.constant 4.096000e+03 : f32 loc(#loc163)
+      %tmp16 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc164)
+      %tmp16_43 = arith.divf %tmp7, %tmp16 : tensor<1x1xf32> loc(#loc164)
+      %tmp17 = arith.constant 9.99999997E-7 : f32 loc(#loc165)
+      %tmp18 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc166)
+      %tmp18_44 = arith.addf %tmp16_43, %tmp18 : tensor<1x1xf32> loc(#loc166)
+      %tmp19 = tt.extern_elementwise %tmp18_44 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc167)
+      %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc168)
+      %tmp20_45 = arith.mulf %tmp14_42, %tmp20 : tensor<1x2048xf32> loc(#loc168)
+      %tmp22 = arith.mulf %tmp11_41, %tmp20_45 : tensor<1x2048xf32> loc(#loc169)
+      %tmp24 = arith.addf %tmp22, %tmp23_40 : tensor<1x2048xf32> loc(#loc170)
+      %c4096_i32 = arith.constant 4096 : i32 loc(#loc62)
+      %c4096_i32_46 = arith.constant 4096 : i32 loc(#loc62)
+      %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc62)
+      %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc62)
+      %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc63)
+      %11 = arith.addi %r0_index_13, %10 : tensor<1x2048xi32> loc(#loc63)
+      %12 = tt.splat %out_ptr2 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc64)
+      %13 = tt.addptr %12, %11 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc64)
+      %14 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc65)
+      %15 = arith.andi %r0_mask_14, %14 : tensor<1x2048xi1> loc(#loc65)
+      %16 = arith.truncf %tmp24 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc66)
+      tt.store %13, %16, %15 : tensor<1x2048x!tt.ptr<bf16>> loc(#loc66)
+    } loc(#loc36)
+    tt.return loc(#loc67)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() -> tensor<1x2048xf32> attributes {noinline = false} {
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc69)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc69)
+    tt.return %cst_0 : tensor<1x2048xf32> loc(#loc70)
+  ^bb1:  // no predecessors
+    %0 = ub.poison : tensor<1x2048xf32> loc(#loc71)
+    tt.return %0 : tensor<1x2048xf32> loc(#loc71)
+  } loc(#loc68)
+  tt.func private @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_u1__(%new_mean: tensor<1x2048xf32> loc("new_mean"(#loc171)), %mean: tensor<1x2048xf32> loc("mean"(#loc72)), %m2: tensor<1x2048xf32> loc("m2"(#loc72)), %weight: tensor<1x2048xf32> loc("weight"(#loc72)), %first_iteration: i1 loc("first_iteration"(#loc72))) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) attributes {noinline = false} {
+    %0:3 = scf.if %first_iteration -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) {
+      %new_weight = arith.constant 1.000000e+00 : f32 loc(#loc176)
+      %new_weight_0 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc202)
+      %new_m2 = tt.call @triton.language.standard.zeros_like__fp32S1_2048S__(%m2) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc203)
+      scf.yield %new_m2, %new_mean, %new_weight_0 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc203)
+    } else {
+      %delta = arith.subf %new_mean, %mean : tensor<1x2048xf32> loc(#loc178)
+      %new_weight = arith.constant 1 : i32 loc(#loc179)
+      %new_weight_0 = arith.constant 1.000000e+00 : f32 loc(#loc179)
+      %new_weight_1 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc179)
+      %new_weight_2 = arith.addf %weight, %new_weight_1 : tensor<1x2048xf32> loc(#loc204)
+      %new_mean_3 = arith.divf %delta, %new_weight_2 : tensor<1x2048xf32> loc(#loc180)
+      %new_mean_4 = arith.addf %mean, %new_mean_3 : tensor<1x2048xf32> loc(#loc205)
+      %new_m2 = arith.subf %new_mean, %new_mean_4 : tensor<1x2048xf32> loc(#loc182)
+      %new_m2_5 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32> loc(#loc183)
+      %new_m2_6 = arith.addf %m2, %new_m2_5 : tensor<1x2048xf32> loc(#loc206)
+      scf.yield %new_m2_6, %new_mean_4, %new_weight_2 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc184)
+    } loc(#loc73)
+    tt.return %0#1, %0#0, %0#2 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc83)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1x2048xf32> loc(#loc84)
+    %2 = ub.poison : tensor<1x2048xf32> loc(#loc84)
+    %3 = ub.poison : tensor<1x2048xf32> loc(#loc84)
+    tt.return %1, %2, %3 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc84)
+  } loc(#loc72)
+  tt.func private @triton.language.standard.zeros_like__fp32S1_2048S__(%input: tensor<1x2048xf32> loc("input"(#loc85))) -> tensor<1x2048xf32> attributes {noinline = false} {
+    %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc86)
+    tt.return %0 : tensor<1x2048xf32> loc(#loc87)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1x2048xf32> loc(#loc88)
+    tt.return %1 : tensor<1x2048xf32> loc(#loc88)
+  } loc(#loc85)
+  tt.func private @"torch._inductor.runtime.triton_helpers.welford__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_1_"(%mean: tensor<1x2048xf32> loc("mean"(#loc89)), %m2: tensor<1x2048xf32> loc("m2"(#loc89)), %weight: tensor<1x2048xf32> loc("weight"(#loc89))) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} {
+    %0:3 = "tt.reduce"(%mean, %m2, %weight) <{axis = 1 : i32}> ({
+    ^bb0(%arg3: f32 loc(unknown), %arg4: f32 loc(unknown), %arg5: f32 loc(unknown), %arg6: f32 loc(unknown), %arg7: f32 loc(unknown), %arg8: f32 loc(unknown)):
+      %4:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (f32, f32, f32, f32, f32, f32) -> (f32, f32, f32) loc(#loc90)
+      tt.reduce.return %4#0, %4#1, %4#2 : f32, f32, f32 loc(#loc90)
+    }) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc90)
+    tt.return %0#0, %0#1, %0#2 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc92)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1xf32> loc(#loc93)
+    %2 = ub.poison : tensor<1xf32> loc(#loc93)
+    %3 = ub.poison : tensor<1xf32> loc(#loc93)
+    tt.return %1, %2, %3 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc93)
+  } loc(#loc89)
+  tt.func private @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%mean_1: f32 loc("mean_1"(#loc94)), %m2_1: f32 loc("m2_1"(#loc94)), %weight_1: f32 loc("weight_1"(#loc94)), %mean_2: f32 loc("mean_2"(#loc94)), %m2_2: f32 loc("m2_2"(#loc94)), %weight_2: f32 loc("weight_2"(#loc94))) -> (f32, f32, f32) attributes {noinline = false} {
+    %delta = arith.subf %mean_2, %mean_1 : f32 loc(#loc195)
+    %new_weight = arith.addf %weight_1, %weight_2 : f32 loc(#loc196)
+    %w2_over_w = arith.constant 0.000000e+00 : f32 loc(#loc197)
+    %w2_over_w_0 = arith.cmpf oeq, %new_weight, %w2_over_w : f32 loc(#loc197)
+    %w2_over_w_1 = arith.divf %weight_2, %new_weight : f32 loc(#loc198)
+    %w2_over_w_2 = arith.constant 0.000000e+00 : f32 loc(#loc199)
+    %w2_over_w_3 = arith.constant 0.000000e+00 : f32 loc(#loc199)
+    %w2_over_w_4 = arith.select %w2_over_w_0, %w2_over_w_3, %w2_over_w_1 : f32 loc(#loc199)
+    %0 = arith.mulf %delta, %w2_over_w_4 : f32 loc(#loc100)
+    %1 = arith.addf %mean_1, %0 : f32 loc(#loc101)
+    %2 = arith.addf %m2_1, %m2_2 : f32 loc(#loc102)
+    %3 = arith.mulf %delta, %delta : f32 loc(#loc103)
+    %4 = arith.mulf %3, %weight_1 : f32 loc(#loc104)
+    %5 = arith.mulf %4, %w2_over_w_4 : f32 loc(#loc105)
+    %6 = arith.addf %2, %5 : f32 loc(#loc106)
+    tt.return %1, %6, %new_weight : f32, f32, f32 loc(#loc107)
+  ^bb1:  // no predecessors
+    %7 = ub.poison : f32 loc(#loc108)
+    %8 = ub.poison : f32 loc(#loc108)
+    %9 = ub.poison : f32 loc(#loc108)
+    tt.return %7, %8, %9 : f32, f32, f32 loc(#loc108)
+  } loc(#loc94)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":25:21)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":29:45)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":30:43)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":31:47)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:46)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:34)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:61)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:51)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:112)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":42:62)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":42:51)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":44:39)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":44:62)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":45:37)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":45:58)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":46:41)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":46:66)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":46:8)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":47:79)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":48:16)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":49:16)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":50:16)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":51:43)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":52:31)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":53:29)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:34)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:41)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:94)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:47)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:42)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:35)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:62)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:52)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:114)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:35)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:42)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:95)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":60:16)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":61:23)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":63:24)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":64:16)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":65:24)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":66:16)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":67:24)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":68:32)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":69:24)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":71:24)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":72:24)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:41)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:36)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:29)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:63)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:53)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":51:4)
+#loc68 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":120:0)
+#loc69 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:31)
+#loc70 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:11)
+#loc71 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:4)
+#loc73 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7)
+#loc74 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":218:46)
+#loc75 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31)
+#loc76 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24)
+#loc77 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30)
+#loc78 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34)
+#loc79 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26)
+#loc80 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39)
+#loc81 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31)
+#loc82 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22)
+#loc83 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:11)
+#loc84 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:4)
+#loc86 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:30)
+#loc87 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:11)
+#loc88 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:4)
+#loc90 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc92 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:11)
+#loc93 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:4)
+#loc95 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc96 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc97 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc98 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc99 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc100 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc101 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc102 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc103 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc104 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc105 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc106 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc107 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:11)
+#loc108 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:4)
+#loc115 = loc("xnumel"(#loc1))
+#loc116 = loc("r0_numel"(#loc2))
+#loc117 = loc("xoffset"(#loc3))
+#loc118 = loc("xoffset"(#loc4))
+#loc119 = loc("xindex"(#loc5))
+#loc120 = loc("xindex"(#loc6))
+#loc121 = loc("xindex"(#loc7))
+#loc122 = loc("xmask"(#loc8))
+#loc123 = loc("r0_base"(#loc9))
+#loc124 = loc("r0_base"(#loc10))
+#loc125 = loc("tmp3_mean"(#loc11))
+#loc126 = loc("tmp3_m2"(#loc12))
+#loc127 = loc("tmp3_weight"(#loc13))
+#loc128 = loc("tmp3_mean"(#loc14))
+#loc129 = loc("r0_index"(#loc15))
+#loc130 = loc("r0_mask"(#loc16))
+#loc131 = loc("tmp0"(#loc17))
+#loc132 = loc("tmp0"(#loc18))
+#loc133 = loc("tmp0"(#loc19))
+#loc134 = loc("tmp0"(#loc20))
+#loc135 = loc("tmp0"(#loc21))
+#loc136 = loc("tmp0"(#loc22))
+#loc137 = loc("tmp3_mean"(#loc25))
+#loc138 = loc("tmp3_mean"(#loc26))
+#loc139 = loc("tmp3_m2"(#loc27))
+#loc140 = loc("tmp3_m2"(#loc28))
+#loc141 = loc("tmp3_weight"(#loc29))
+#loc142 = loc("tmp3_weight"(#loc30))
+#loc143 = loc("tmp3"(#loc33))
+#loc144 = loc("tmp7"(#loc34))
+#loc145 = loc("tmp8"(#loc35))
+#loc146 = loc("r0_index"(#loc37))
+#loc147 = loc("r0_mask"(#loc38))
+#loc148 = loc("tmp9"(#loc39))
+#loc149 = loc("tmp9"(#loc40))
+#loc150 = loc("tmp9"(#loc41))
+#loc151 = loc("tmp12"(#loc42))
+#loc152 = loc("tmp12"(#loc43))
+#loc153 = loc("tmp12"(#loc44))
+#loc154 = loc("tmp12"(#loc45))
+#loc155 = loc("tmp12"(#loc46))
+#loc156 = loc("tmp12"(#loc47))
+#loc157 = loc("tmp23"(#loc48))
+#loc158 = loc("tmp23"(#loc49))
+#loc159 = loc("tmp23"(#loc50))
+#loc160 = loc("tmp10"(#loc51))
+#loc161 = loc("tmp11"(#loc52))
+#loc162 = loc("tmp14"(#loc53))
+#loc163 = loc("tmp15"(#loc54))
+#loc164 = loc("tmp16"(#loc55))
+#loc165 = loc("tmp17"(#loc56))
+#loc166 = loc("tmp18"(#loc57))
+#loc167 = loc("tmp19"(#loc58))
+#loc168 = loc("tmp20"(#loc59))
+#loc169 = loc("tmp22"(#loc60))
+#loc170 = loc("tmp24"(#loc61))
+#loc176 = loc("new_weight"(#loc74))
+#loc177 = loc("new_m2"(#loc75))
+#loc178 = loc("delta"(#loc76))
+#loc179 = loc("new_weight"(#loc77))
+#loc180 = loc("new_mean"(#loc78))
+#loc181 = loc("new_mean"(#loc79))
+#loc182 = loc("new_m2"(#loc80))
+#loc183 = loc("new_m2"(#loc81))
+#loc184 = loc("new_m2"(#loc82))
+#loc195 = loc("delta"(#loc95))
+#loc196 = loc("new_weight"(#loc96))
+#loc197 = loc("w2_over_w"(#loc97))
+#loc198 = loc("w2_over_w"(#loc98))
+#loc199 = loc("w2_over_w"(#loc99))
+#loc200 = loc("tmp3_m2"(#loc128))
+#loc202 = loc("new_weight"(#loc176))
+#loc203 = loc("new_m2"(#loc177))
+#loc204 = loc("new_weight"(#loc179))
+#loc205 = loc("new_mean"(#loc181))
+#loc206 = loc("new_m2"(#loc184))
+#loc207 = loc("tmp3_weight"(#loc200))
diff --git a/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.ttgir b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..e83df533d7c29132d181b171d0529d27df449610
--- /dev/null
+++ b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.ttgir
@@ -0,0 +1,261 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":18:0)
+#loc1 = loc(unknown)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":47:79)
+#loc70 = loc("in_ptr0"(#loc))
+#loc71 = loc("in_ptr1"(#loc))
+#loc72 = loc("in_ptr2"(#loc))
+#loc73 = loc("out_ptr2"(#loc))
+#loc74 = loc("xnumel"(#loc))
+#loc75 = loc("r0_numel"(#loc))
+#loc101 = loc(callsite(#loc1 at #loc30))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4096> : tensor<1x2048xi32, #blocked> loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked> loc(#loc1)
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
+    %c2048_i32 = arith.constant 2048 : i32 loc(#loc1)
+    %c256_i32 = arith.constant 256 : i32 loc(#loc1)
+    %cst_1 = arith.constant 0.000000e+00 : f32 loc(#loc1)
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<4.096000e+03> : tensor<1x1xf32, #blocked> loc(#loc1)
+    %cst_5 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc76)
+    %xmask = arith.cmpi slt, %xoffset, %c256_i32 : i32 loc(#loc77)
+    %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc78)
+    %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc78)
+    %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc79)
+    %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc130)
+    %tmp0_8 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc81)
+    %tmp0_9 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc131)
+    %tmp3_weight:3 = scf.for %tmp3_weight_10 = %c0_i32 to %c4096_i32 step %c2048_i32 iter_args(%arg7 = %cst_2, %arg8 = %cst_2, %arg9 = %cst_2) -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>)  : i32 {
+      %r0_index = tt.splat %tmp3_weight_10 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc84)
+      %r0_index_11 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32, #blocked> loc(#loc84)
+      %r0_mask = arith.cmpi slt, %r0_index_11, %cst : tensor<1x2048xi32, #blocked> loc(#loc85)
+      %tmp0_12 = arith.addi %r0_index_11, %tmp0_7 : tensor<1x2048xi32, #blocked> loc(#loc80)
+      %tmp0_13 = tt.addptr %tmp0_8, %tmp0_12 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc81)
+      %tmp0_14 = arith.andi %r0_mask, %tmp0_9 : tensor<1x2048xi1, #blocked> loc(#loc82)
+      %tmp0_15 = tt.load %tmp0_13, %tmp0_14, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc86)
+      %tmp0_16 = arith.extf %tmp0_15 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc87)
+      %2 = arith.cmpi eq, %tmp3_weight_10, %c0_i32 : i32 loc(#loc14)
+      %3:3 = scf.if %2 -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) {
+        scf.yield %cst_2, %tmp0_16, %cst_5 : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc155)
+      } else {
+        %delta = arith.subf %tmp0_16, %arg7 : tensor<1x2048xf32, #blocked> loc(#loc134)
+        %new_weight = arith.addf %arg9, %cst_5 : tensor<1x2048xf32, #blocked> loc(#loc156)
+        %new_mean = arith.divf %delta, %new_weight : tensor<1x2048xf32, #blocked> loc(#loc136)
+        %new_mean_18 = arith.addf %arg7, %new_mean : tensor<1x2048xf32, #blocked> loc(#loc157)
+        %new_m2 = arith.subf %tmp0_16, %new_mean_18 : tensor<1x2048xf32, #blocked> loc(#loc138)
+        %new_m2_19 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32, #blocked> loc(#loc139)
+        %new_m2_20 = arith.addf %arg8, %new_m2_19 : tensor<1x2048xf32, #blocked> loc(#loc158)
+        scf.yield %new_m2_20, %new_mean_18, %new_weight : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc141)
+      } loc(#loc88)
+      %tmp3_mean = arith.select %tmp0_14, %3#1, %arg7 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc97)
+      %tmp3_m2 = arith.select %tmp0_14, %3#0, %arg8 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc98)
+      %tmp3_weight_17 = arith.select %tmp0_14, %3#2, %arg9 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc99)
+      scf.yield %tmp3_mean, %tmp3_m2, %tmp3_weight_17 : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc28)
+    } loc(#loc154)
+    %0:3 = "tt.reduce"(%tmp3_weight#0, %tmp3_weight#1, %tmp3_weight#2) <{axis = 1 : i32}> ({
+    ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc30)), %arg7: f32 loc(callsite(#loc1 at #loc30)), %arg8: f32 loc(callsite(#loc1 at #loc30)), %arg9: f32 loc(callsite(#loc1 at #loc30)), %arg10: f32 loc(callsite(#loc1 at #loc30)), %arg11: f32 loc(callsite(#loc1 at #loc30))):
+      %delta = arith.subf %arg9, %arg6 : f32 loc(#loc142)
+      %new_weight = arith.addf %arg8, %arg11 : f32 loc(#loc143)
+      %w2_over_w = arith.cmpf oeq, %new_weight, %cst_1 : f32 loc(#loc144)
+      %w2_over_w_10 = arith.divf %arg11, %new_weight : f32 loc(#loc145)
+      %w2_over_w_11 = arith.select %w2_over_w, %cst_1, %w2_over_w_10 : f32 loc(#loc146)
+      %2 = arith.mulf %delta, %w2_over_w_11 : f32 loc(#loc147)
+      %3 = arith.addf %arg6, %2 : f32 loc(#loc148)
+      %4 = arith.addf %arg7, %arg10 : f32 loc(#loc149)
+      %5 = arith.mulf %delta, %delta : f32 loc(#loc150)
+      %6 = arith.mulf %5, %arg8 : f32 loc(#loc151)
+      %7 = arith.mulf %6, %w2_over_w_11 : f32 loc(#loc152)
+      %8 = arith.addf %4, %7 : f32 loc(#loc153)
+      tt.reduce.return %3, %8, %new_weight : f32, f32, f32 loc(#loc100)
+    }) : (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc100)
+    %tmp3 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc107)
+    %tmp7 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc108)
+    %tmp9 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc109)
+    %tmp23 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc110)
+    %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc111)
+    %tmp16 = arith.divf %tmp7, %cst_4 : tensor<1x1xf32, #blocked> loc(#loc112)
+    %tmp18 = arith.addf %tmp16, %cst_3 : tensor<1x1xf32, #blocked> loc(#loc113)
+    %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked>) -> tensor<1x1xf32, #blocked> loc(#loc114)
+    %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc115)
+    %1 = tt.splat %out_ptr2 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc52)
+    scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32, #blocked> loc(#loc116)
+      %r0_index_10 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32, #blocked> loc(#loc116)
+      %r0_mask = arith.cmpi slt, %r0_index_10, %cst : tensor<1x2048xi32, #blocked> loc(#loc117)
+      %tmp9_11 = tt.addptr %tmp9, %r0_index_10 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc109)
+      %tmp9_12 = tt.load %tmp9_11, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc118)
+      %tmp9_13 = arith.extf %tmp9_12 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc119)
+      %tmp12 = arith.addi %r0_index_10, %tmp0_7 : tensor<1x2048xi32, #blocked> loc(#loc120)
+      %tmp12_14 = tt.addptr %tmp0_8, %tmp12 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc121)
+      %tmp12_15 = arith.andi %r0_mask, %tmp0_9 : tensor<1x2048xi1, #blocked> loc(#loc122)
+      %tmp12_16 = tt.load %tmp12_14, %tmp12_15, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc123)
+      %tmp12_17 = arith.extf %tmp12_16 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc124)
+      %tmp23_18 = tt.addptr %tmp23, %r0_index_10 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc110)
+      %tmp23_19 = tt.load %tmp23_18, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc125)
+      %tmp23_20 = arith.extf %tmp23_19 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc126)
+      %tmp11 = arith.addf %tmp9_13, %cst_5 : tensor<1x2048xf32, #blocked> loc(#loc127)
+      %tmp14_21 = arith.subf %tmp12_17, %tmp14 : tensor<1x2048xf32, #blocked> loc(#loc111)
+      %tmp20_22 = arith.mulf %tmp14_21, %tmp20 : tensor<1x2048xf32, #blocked> loc(#loc115)
+      %tmp22 = arith.mulf %tmp11, %tmp20_22 : tensor<1x2048xf32, #blocked> loc(#loc128)
+      %tmp24 = arith.addf %tmp22, %tmp23_20 : tensor<1x2048xf32, #blocked> loc(#loc129)
+      %2 = tt.addptr %1, %tmp12 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc52)
+      %3 = arith.truncf %tmp24 : tensor<1x2048xf32, #blocked> to tensor<1x2048xbf16, #blocked> loc(#loc68)
+      tt.store %2, %3, %tmp12_15 : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc68)
+    } loc(#loc53)
+    tt.return loc(#loc69)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":25:21)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":26:37)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:46)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:41)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:34)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:61)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":32:43)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":33:31)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":34:29)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:51)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:112)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":42:62)
+#loc15 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":42:51)
+#loc17 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31)
+#loc18 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24)
+#loc19 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34)
+#loc21 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39)
+#loc23 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":44:62)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":45:58)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":46:66)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":46:8)
+#loc29 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc35 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc37 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc38 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc39 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc40 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc41 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc42 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":48:16)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":49:16)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:34)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:35)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":63:24)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":65:24)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":67:24)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":68:32)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":69:24)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:29)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":51:43)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":52:31)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":53:29)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:41)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:94)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:42)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:35)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:62)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:52)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:114)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:42)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:95)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":61:23)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":71:24)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":72:24)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:53)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":51:4)
+#loc76 = loc("xoffset"(#loc2))
+#loc77 = loc("xmask"(#loc3))
+#loc78 = loc("r0_base"(#loc4))
+#loc79 = loc("tmp0"(#loc5))
+#loc80 = loc("tmp0"(#loc6))
+#loc81 = loc("tmp0"(#loc7))
+#loc82 = loc("tmp0"(#loc8))
+#loc83 = loc("tmp3_mean"(#loc9))
+#loc84 = loc("r0_index"(#loc10))
+#loc85 = loc("r0_mask"(#loc11))
+#loc86 = loc("tmp0"(#loc12))
+#loc87 = loc("tmp0"(#loc13))
+#loc88 = loc(callsite(#loc15 at #loc16))
+#loc89 = loc("new_m2"(#loc17))
+#loc90 = loc("delta"(#loc18))
+#loc91 = loc("new_weight"(#loc19))
+#loc92 = loc("new_mean"(#loc20))
+#loc93 = loc("new_mean"(#loc21))
+#loc94 = loc("new_m2"(#loc22))
+#loc95 = loc("new_m2"(#loc23))
+#loc96 = loc("new_m2"(#loc24))
+#loc97 = loc("tmp3_mean"(#loc25))
+#loc98 = loc("tmp3_m2"(#loc26))
+#loc99 = loc("tmp3_weight"(#loc27))
+#loc100 = loc(callsite(#loc29 at #loc30))
+#loc102 = loc("delta"(#loc31))
+#loc103 = loc("new_weight"(#loc32))
+#loc104 = loc("w2_over_w"(#loc33))
+#loc105 = loc("w2_over_w"(#loc34))
+#loc106 = loc("w2_over_w"(#loc35))
+#loc107 = loc("tmp3"(#loc43))
+#loc108 = loc("tmp7"(#loc44))
+#loc109 = loc("tmp9"(#loc45))
+#loc110 = loc("tmp23"(#loc46))
+#loc111 = loc("tmp14"(#loc47))
+#loc112 = loc("tmp16"(#loc48))
+#loc113 = loc("tmp18"(#loc49))
+#loc114 = loc("tmp19"(#loc50))
+#loc115 = loc("tmp20"(#loc51))
+#loc116 = loc("r0_index"(#loc54))
+#loc117 = loc("r0_mask"(#loc55))
+#loc118 = loc("tmp9"(#loc56))
+#loc119 = loc("tmp9"(#loc57))
+#loc120 = loc("tmp12"(#loc58))
+#loc121 = loc("tmp12"(#loc59))
+#loc122 = loc("tmp12"(#loc60))
+#loc123 = loc("tmp12"(#loc61))
+#loc124 = loc("tmp12"(#loc62))
+#loc125 = loc("tmp23"(#loc63))
+#loc126 = loc("tmp23"(#loc64))
+#loc127 = loc("tmp11"(#loc65))
+#loc128 = loc("tmp22"(#loc66))
+#loc129 = loc("tmp24"(#loc67))
+#loc130 = loc(fused[#loc80, #loc79])
+#loc131 = loc(fused[#loc82, #loc77])
+#loc132 = loc("tmp3_m2"(#loc83))
+#loc133 = loc("new_m2"(#loc89))
+#loc134 = loc(callsite(#loc90 at #loc16))
+#loc135 = loc("new_weight"(#loc91))
+#loc136 = loc(callsite(#loc92 at #loc16))
+#loc137 = loc("new_mean"(#loc93))
+#loc138 = loc(callsite(#loc94 at #loc16))
+#loc139 = loc(callsite(#loc95 at #loc16))
+#loc140 = loc("new_m2"(#loc96))
+#loc141 = loc(callsite(#loc96 at #loc16))
+#loc142 = loc(callsite(#loc102 at #loc100))
+#loc143 = loc(callsite(#loc103 at #loc100))
+#loc144 = loc(callsite(#loc104 at #loc100))
+#loc145 = loc(callsite(#loc105 at #loc100))
+#loc146 = loc(callsite(#loc106 at #loc100))
+#loc147 = loc(callsite(#loc36 at #loc100))
+#loc148 = loc(callsite(#loc37 at #loc100))
+#loc149 = loc(callsite(#loc38 at #loc100))
+#loc150 = loc(callsite(#loc39 at #loc100))
+#loc151 = loc(callsite(#loc40 at #loc100))
+#loc152 = loc(callsite(#loc41 at #loc100))
+#loc153 = loc(callsite(#loc42 at #loc100))
+#loc154 = loc("tmp3_weight"(#loc132))
+#loc155 = loc(callsite(#loc133 at #loc16))
+#loc156 = loc(callsite(#loc135 at #loc16))
+#loc157 = loc(callsite(#loc137 at #loc16))
+#loc158 = loc(callsite(#loc140 at #loc16))
diff --git a/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.ttir b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..e722b4e9c4228271a37fc17a289b6f1fb69c5e8a
--- /dev/null
+++ b/triton/BJVG7OCL7JU5W736D67FIT6PQ3DMDC7PLUQRHMEEN5LS5TJIMXXA/triton_red_fused_add_mul_native_layer_norm_1.ttir
@@ -0,0 +1,270 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":18:0)
+#loc2 = loc(unknown)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":47:79)
+#loc72 = loc("in_ptr0"(#loc))
+#loc73 = loc("in_ptr1"(#loc))
+#loc74 = loc("in_ptr2"(#loc))
+#loc75 = loc("out_ptr2"(#loc))
+#loc76 = loc("xnumel"(#loc))
+#loc77 = loc("r0_numel"(#loc))
+#loc79 = loc(callsite(#loc2 at #loc3))
+module {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xmask = arith.constant 256 : i32 loc(#loc78)
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc79)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc2)
+    %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16> loc(#loc2)
+    %c2048_i32 = arith.constant 2048 : i32 loc(#loc2)
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc2)
+    %cst_2 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc2)
+    %cst_3 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc2)
+    %cst_4 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc2)
+    %cst_5 = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc2)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc2)
+    %xoffset = tt.get_program_id x : i32 loc(#loc80)
+    %xmask_6 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc78)
+    %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc81)
+    %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc82)
+    %tmp3_weight:3 = scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32 iter_args(%tmp3_mean = %cst_0, %tmp3_m2 = %cst_0, %tmp3_weight_8 = %cst_0) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc84)
+      %r0_index_9 = arith.addi %r0_index, %r0_base_7 : tensor<1x2048xi32> loc(#loc84)
+      %r0_mask = arith.cmpi slt, %r0_index_9, %cst_5 : tensor<1x2048xi32> loc(#loc85)
+      %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc86)
+      %tmp0_10 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32> loc(#loc135)
+      %tmp0_11 = arith.addi %r0_index_9, %tmp0_10 : tensor<1x2048xi32> loc(#loc87)
+      %tmp0_12 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc88)
+      %tmp0_13 = tt.addptr %tmp0_12, %tmp0_11 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc88)
+      %tmp0_14 = tt.splat %xmask_6 : i1 -> tensor<1x2048xi1> loc(#loc136)
+      %tmp0_15 = arith.andi %r0_mask, %tmp0_14 : tensor<1x2048xi1> loc(#loc89)
+      %tmp0_16 = tt.load %tmp0_13, %tmp0_15, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc90)
+      %tmp0_17 = arith.extf %tmp0_16 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc91)
+      %1 = arith.cmpi eq, %r0_offset, %c0_i32 : i32 loc(#loc16)
+      %2:3 = scf.if %1 -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) {
+        scf.yield %cst_0, %tmp0_17, %cst_4 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc161)
+      } else {
+        %delta = arith.subf %tmp0_17, %tmp3_mean : tensor<1x2048xf32> loc(#loc138)
+        %new_weight = arith.addf %tmp3_weight_8, %cst_4 : tensor<1x2048xf32> loc(#loc162)
+        %new_mean = arith.divf %delta, %new_weight : tensor<1x2048xf32> loc(#loc140)
+        %new_mean_21 = arith.addf %tmp3_mean, %new_mean : tensor<1x2048xf32> loc(#loc163)
+        %new_m2 = arith.subf %tmp0_17, %new_mean_21 : tensor<1x2048xf32> loc(#loc142)
+        %new_m2_22 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32> loc(#loc143)
+        %new_m2_23 = arith.addf %tmp3_m2, %new_m2_22 : tensor<1x2048xf32> loc(#loc164)
+        scf.yield %new_m2_23, %new_mean_21, %new_weight : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc145)
+      } loc(#loc92)
+      %tmp3_mean_18 = arith.select %tmp0_15, %2#1, %tmp3_mean : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc101)
+      %tmp3_m2_19 = arith.select %tmp0_15, %2#0, %tmp3_m2 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc102)
+      %tmp3_weight_20 = arith.select %tmp0_15, %2#2, %tmp3_weight_8 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc103)
+      scf.yield %tmp3_mean_18, %tmp3_m2_19, %tmp3_weight_20 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc30)
+    } loc(#loc160)
+    %0:3 = "tt.reduce"(%tmp3_weight#0, %tmp3_weight#1, %tmp3_weight#2) <{axis = 1 : i32}> ({
+    ^bb0(%arg6: f32 loc(callsite(#loc2 at #loc3)), %arg7: f32 loc(callsite(#loc2 at #loc3)), %arg8: f32 loc(callsite(#loc2 at #loc3)), %arg9: f32 loc(callsite(#loc2 at #loc3)), %arg10: f32 loc(callsite(#loc2 at #loc3)), %arg11: f32 loc(callsite(#loc2 at #loc3))):
+      %delta = arith.subf %arg9, %arg6 : f32 loc(#loc146)
+      %new_weight = arith.addf %arg8, %arg11 : f32 loc(#loc147)
+      %w2_over_w = arith.cmpf oeq, %new_weight, %cst : f32 loc(#loc148)
+      %w2_over_w_8 = arith.divf %arg11, %new_weight : f32 loc(#loc149)
+      %w2_over_w_9 = arith.select %w2_over_w, %cst, %w2_over_w_8 : f32 loc(#loc150)
+      %1 = arith.mulf %delta, %w2_over_w_9 : f32 loc(#loc151)
+      %2 = arith.addf %arg6, %1 : f32 loc(#loc152)
+      %3 = arith.addf %arg7, %arg10 : f32 loc(#loc153)
+      %4 = arith.mulf %delta, %delta : f32 loc(#loc154)
+      %5 = arith.mulf %4, %arg8 : f32 loc(#loc155)
+      %6 = arith.mulf %5, %w2_over_w_9 : f32 loc(#loc156)
+      %7 = arith.addf %3, %6 : f32 loc(#loc157)
+      tt.reduce.return %2, %7, %new_weight : f32, f32, f32 loc(#loc104)
+    }) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc104)
+    %tmp3 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc110)
+    %tmp7 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc111)
+    scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc112)
+      %r0_index_8 = arith.addi %r0_index, %r0_base_7 : tensor<1x2048xi32> loc(#loc112)
+      %r0_mask = arith.cmpi slt, %r0_index_8, %cst_5 : tensor<1x2048xi32> loc(#loc113)
+      %tmp9 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc114)
+      %tmp9_9 = tt.addptr %tmp9, %r0_index_8 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc114)
+      %tmp9_10 = tt.load %tmp9_9, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc115)
+      %tmp9_11 = arith.extf %tmp9_10 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc116)
+      %tmp12 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc117)
+      %tmp12_12 = tt.splat %tmp12 : i32 -> tensor<1x2048xi32> loc(#loc158)
+      %tmp12_13 = arith.addi %r0_index_8, %tmp12_12 : tensor<1x2048xi32> loc(#loc118)
+      %tmp12_14 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc119)
+      %tmp12_15 = tt.addptr %tmp12_14, %tmp12_13 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc119)
+      %tmp12_16 = tt.splat %xmask_6 : i1 -> tensor<1x2048xi1> loc(#loc159)
+      %tmp12_17 = arith.andi %r0_mask, %tmp12_16 : tensor<1x2048xi1> loc(#loc120)
+      %tmp12_18 = tt.load %tmp12_15, %tmp12_17, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>> loc(#loc121)
+      %tmp12_19 = arith.extf %tmp12_18 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc122)
+      %tmp23 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc123)
+      %tmp23_20 = tt.addptr %tmp23, %r0_index_8 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc123)
+      %tmp23_21 = tt.load %tmp23_20, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc124)
+      %tmp23_22 = arith.extf %tmp23_21 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc125)
+      %tmp11 = arith.addf %tmp9_11, %cst_4 : tensor<1x2048xf32> loc(#loc126)
+      %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc127)
+      %tmp14_23 = arith.subf %tmp12_19, %tmp14 : tensor<1x2048xf32> loc(#loc127)
+      %tmp16 = arith.divf %tmp7, %cst_3 : tensor<1x1xf32> loc(#loc128)
+      %tmp18 = arith.addf %tmp16, %cst_2 : tensor<1x1xf32> loc(#loc129)
+      %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc130)
+      %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc131)
+      %tmp20_24 = arith.mulf %tmp14_23, %tmp20 : tensor<1x2048xf32> loc(#loc131)
+      %tmp22 = arith.mulf %tmp11, %tmp20_24 : tensor<1x2048xf32> loc(#loc132)
+      %tmp24 = arith.addf %tmp22, %tmp23_22 : tensor<1x2048xf32> loc(#loc133)
+      %1 = tt.splat %out_ptr2 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc69)
+      %2 = tt.addptr %1, %tmp12_13 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc69)
+      %3 = arith.truncf %tmp24 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc70)
+      tt.store %2, %3, %tmp12_17 : tensor<1x2048x!tt.ptr<bf16>> loc(#loc70)
+    } loc(#loc46)
+    tt.return loc(#loc71)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":25:21)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":23:28)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":26:27)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":32:43)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":33:31)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":34:29)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:46)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:41)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:34)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:61)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:51)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:112)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":42:62)
+#loc17 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":42:51)
+#loc19 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24)
+#loc21 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34)
+#loc23 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31)
+#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":44:62)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":45:58)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":46:66)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":46:8)
+#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc35 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc37 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc38 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc39 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc40 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc41 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc42 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc43 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":48:16)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":49:16)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":51:43)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":52:31)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":53:29)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:34)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:41)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:94)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:47)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:42)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:35)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:62)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:52)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:114)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:35)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:42)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:95)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":61:23)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":63:24)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":65:24)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":67:24)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":68:32)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":69:24)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":71:24)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":72:24)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:29)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:53)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":51:4)
+#loc78 = loc("xmask"(#loc1))
+#loc80 = loc("xoffset"(#loc4))
+#loc81 = loc("r0_base"(#loc5))
+#loc82 = loc("r0_base"(#loc6))
+#loc83 = loc("tmp3_mean"(#loc7))
+#loc84 = loc("r0_index"(#loc8))
+#loc85 = loc("r0_mask"(#loc9))
+#loc86 = loc("tmp0"(#loc10))
+#loc87 = loc("tmp0"(#loc11))
+#loc88 = loc("tmp0"(#loc12))
+#loc89 = loc("tmp0"(#loc13))
+#loc90 = loc("tmp0"(#loc14))
+#loc91 = loc("tmp0"(#loc15))
+#loc92 = loc(callsite(#loc17 at #loc18))
+#loc93 = loc("new_m2"(#loc19))
+#loc94 = loc("delta"(#loc20))
+#loc95 = loc("new_weight"(#loc21))
+#loc96 = loc("new_mean"(#loc22))
+#loc97 = loc("new_mean"(#loc23))
+#loc98 = loc("new_m2"(#loc24))
+#loc99 = loc("new_m2"(#loc25))
+#loc100 = loc("new_m2"(#loc26))
+#loc101 = loc("tmp3_mean"(#loc27))
+#loc102 = loc("tmp3_m2"(#loc28))
+#loc103 = loc("tmp3_weight"(#loc29))
+#loc104 = loc(callsite(#loc31 at #loc3))
+#loc105 = loc("delta"(#loc32))
+#loc106 = loc("new_weight"(#loc33))
+#loc107 = loc("w2_over_w"(#loc34))
+#loc108 = loc("w2_over_w"(#loc35))
+#loc109 = loc("w2_over_w"(#loc36))
+#loc110 = loc("tmp3"(#loc44))
+#loc111 = loc("tmp7"(#loc45))
+#loc112 = loc("r0_index"(#loc47))
+#loc113 = loc("r0_mask"(#loc48))
+#loc114 = loc("tmp9"(#loc49))
+#loc115 = loc("tmp9"(#loc50))
+#loc116 = loc("tmp9"(#loc51))
+#loc117 = loc("tmp12"(#loc52))
+#loc118 = loc("tmp12"(#loc53))
+#loc119 = loc("tmp12"(#loc54))
+#loc120 = loc("tmp12"(#loc55))
+#loc121 = loc("tmp12"(#loc56))
+#loc122 = loc("tmp12"(#loc57))
+#loc123 = loc("tmp23"(#loc58))
+#loc124 = loc("tmp23"(#loc59))
+#loc125 = loc("tmp23"(#loc60))
+#loc126 = loc("tmp11"(#loc61))
+#loc127 = loc("tmp14"(#loc62))
+#loc128 = loc("tmp16"(#loc63))
+#loc129 = loc("tmp18"(#loc64))
+#loc130 = loc("tmp19"(#loc65))
+#loc131 = loc("tmp20"(#loc66))
+#loc132 = loc("tmp22"(#loc67))
+#loc133 = loc("tmp24"(#loc68))
+#loc134 = loc("tmp3_m2"(#loc83))
+#loc135 = loc(fused[#loc87, #loc86])
+#loc136 = loc(fused[#loc89, #loc78])
+#loc137 = loc("new_m2"(#loc93))
+#loc138 = loc(callsite(#loc94 at #loc18))
+#loc139 = loc("new_weight"(#loc95))
+#loc140 = loc(callsite(#loc96 at #loc18))
+#loc141 = loc("new_mean"(#loc97))
+#loc142 = loc(callsite(#loc98 at #loc18))
+#loc143 = loc(callsite(#loc99 at #loc18))
+#loc144 = loc("new_m2"(#loc100))
+#loc145 = loc(callsite(#loc100 at #loc18))
+#loc146 = loc(callsite(#loc105 at #loc104))
+#loc147 = loc(callsite(#loc106 at #loc104))
+#loc148 = loc(callsite(#loc107 at #loc104))
+#loc149 = loc(callsite(#loc108 at #loc104))
+#loc150 = loc(callsite(#loc109 at #loc104))
+#loc151 = loc(callsite(#loc37 at #loc104))
+#loc152 = loc(callsite(#loc38 at #loc104))
+#loc153 = loc(callsite(#loc39 at #loc104))
+#loc154 = loc(callsite(#loc40 at #loc104))
+#loc155 = loc(callsite(#loc41 at #loc104))
+#loc156 = loc(callsite(#loc42 at #loc104))
+#loc157 = loc(callsite(#loc43 at #loc104))
+#loc158 = loc(fused[#loc118, #loc117])
+#loc159 = loc(fused[#loc120, #loc78])
+#loc160 = loc("tmp3_weight"(#loc134))
+#loc161 = loc(callsite(#loc137 at #loc18))
+#loc162 = loc(callsite(#loc139 at #loc18))
+#loc163 = loc(callsite(#loc141 at #loc18))
+#loc164 = loc(callsite(#loc144 at #loc18))
diff --git a/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..ae5254f06ac9f186cb062a38646ab9e0bd960c61
--- /dev/null
+++ b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused__fused_rms_norm_cat_view_2.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.source", "triton_poi_fused__fused_rms_norm_cat_view_2.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.ttir", "triton_poi_fused__fused_rms_norm_cat_view_2.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir", "triton_poi_fused__fused_rms_norm_cat_view_2.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.llir", "triton_poi_fused__fused_rms_norm_cat_view_2.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.ptx", "triton_poi_fused__fused_rms_norm_cat_view_2.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.cubin", "triton_poi_fused__fused_rms_norm_cat_view_2.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.json"}}
\ No newline at end of file
diff --git a/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.cubin b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..a4544a8fecd0787b5b68e585fe88b0d99c133d5b
Binary files /dev/null and b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.cubin differ
diff --git a/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.json b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..a3bd8cf92daf3a29a1ddc54bfb67c7d6c8e4c0af
--- /dev/null
+++ b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.json
@@ -0,0 +1 @@
+{"hash": "0b08e56f903fce8944ebc8ac39910229dacc369441ab2a54d627e30cd636ae5e", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 4096, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__fused_rms_norm_cat_view_2"}
\ No newline at end of file
diff --git a/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.llir b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.llir
new file mode 100644
index 0000000000000000000000000000000000000000..6b266600e9d26ce4bd1e6bc1befefc354adba5ac
--- /dev/null
+++ b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.llir
@@ -0,0 +1,789 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused__fused_rms_norm_cat_view_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 {
+  %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !dbg !8
+  %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z(), !dbg !9
+  %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y(), !dbg !10
+  %15 = mul nuw i32 %13, %14, !dbg !11
+  %16 = add nuw i32 %15, %12, !dbg !12
+  %17 = shl i32 %16, 3, !dbg !13
+  %18 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !14
+  %19 = and i32 %18, 112, !dbg !14
+  %20 = lshr exact i32 %19, 4, !dbg !14
+  %21 = and i32 %18, 1, !dbg !14
+  %22 = shl nuw nsw i32 %21, 2, !dbg !14
+  %23 = or disjoint i32 %17, %20, !dbg !15
+  %24 = or disjoint i32 %17, %22, !dbg !15
+  %25 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !16
+  %26 = shl i32 %25, 7, !dbg !17
+  %27 = shl nuw nsw i32 %18, 3, !dbg !18
+  %28 = and i32 %27, 120, !dbg !18
+  %29 = lshr i32 %18, 1, !dbg !18
+  %30 = and i32 %29, 63, !dbg !18
+  %31 = or disjoint i32 %28, %26, !dbg !19
+  %32 = or disjoint i32 %30, %26, !dbg !19
+  %33 = icmp slt i32 %31, 128, !dbg !20
+  %34 = icmp slt i32 %32, 128, !dbg !20
+  %35 = sdiv i32 %23, 32, !dbg !21
+  %36 = sdiv i32 %24, 32, !dbg !21
+  %37 = mul i32 %35, 32, !dbg !22
+  %.decomposed = sub i32 %23, %37, !dbg !22
+  %38 = mul i32 %36, 32, !dbg !22
+  %.decomposed72 = sub i32 %24, %38, !dbg !22
+  %39 = icmp slt i32 %23, 8192, !dbg !23
+  %40 = icmp slt i32 %24, 8192, !dbg !23
+  %41 = shl nsw i32 %.decomposed, 7, !dbg !24
+  %42 = add i32 %41, %31, !dbg !25
+  %43 = mul i32 %35, 12288, !dbg !26
+  %44 = add i32 %42, %43, !dbg !27
+  %45 = sext i32 %44 to i64, !dbg !28
+  %46 = getelementptr bfloat, ptr addrspace(1) %0, i64 %45, !dbg !28
+  %47 = and i1 %33, %39, !dbg !29
+  %48 = and i1 %34, %40, !dbg !29
+  %49 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !30
+  %50 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %46, i64 %49, i1 %47) #5, !dbg !30
+  %51 = extractvalue { i32, i32, i32, i32 } %50, 0, !dbg !30
+  %52 = extractvalue { i32, i32, i32, i32 } %50, 1, !dbg !30
+  %53 = extractvalue { i32, i32, i32, i32 } %50, 2, !dbg !30
+  %54 = extractvalue { i32, i32, i32, i32 } %50, 3, !dbg !30
+  %extelt.offset = lshr i32 %51, 16, !dbg !30
+  %55 = trunc nuw i32 %extelt.offset to i16, !dbg !30
+  %extelt.offset1 = lshr i32 %52, 16, !dbg !30
+  %56 = trunc nuw i32 %extelt.offset1 to i16, !dbg !30
+  %extelt.offset2 = lshr i32 %53, 16, !dbg !30
+  %57 = trunc nuw i32 %extelt.offset2 to i16, !dbg !30
+  %extelt.offset3 = lshr i32 %54, 16, !dbg !30
+  %58 = trunc nuw i32 %extelt.offset3 to i16, !dbg !30
+  %59 = shl nuw nsw i32 %18, 4, !dbg !31
+  %60 = and i32 %59, 112, !dbg !31
+  %61 = and i32 %18, 8, !dbg !31
+  %62 = icmp eq i32 %61, 0, !dbg !31
+  %63 = lshr exact i32 %61, 1, !dbg !31
+  %64 = and i32 %18, 16, !dbg !31
+  %65 = icmp eq i32 %64, 0, !dbg !31
+  %66 = select i1 %65, i32 0, i32 136, !dbg !31
+  %67 = and i32 %18, 32, !dbg !31
+  %68 = lshr exact i32 %67, 4, !dbg !31
+  %69 = lshr i32 %18, 3, !dbg !31
+  %70 = and i32 %69, 8, !dbg !31
+  %71 = or disjoint i32 %63, %68, !dbg !31
+  %72 = or disjoint i32 %66, %60, !dbg !31
+  %73 = xor i32 %72, %70, !dbg !31
+  %74 = or disjoint i32 %71, %73, !dbg !31
+  %75 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %74, !dbg !31
+  %76 = trunc i32 %51 to i16, !dbg !31
+  %77 = insertelement <1 x i16> poison, i16 %76, i64 0, !dbg !31
+  store <1 x i16> %77, ptr addrspace(3) %75, align 2, !dbg !31
+  %78 = xor i32 %74, 288, !dbg !31
+  %79 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %78, !dbg !31
+  %80 = insertelement <1 x i16> poison, i16 %55, i64 0, !dbg !31
+  store <1 x i16> %80, ptr addrspace(3) %79, align 2, !dbg !31
+  %81 = xor i32 %74, 576, !dbg !31
+  %82 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %81, !dbg !31
+  %83 = trunc i32 %52 to i16, !dbg !31
+  %84 = insertelement <1 x i16> poison, i16 %83, i64 0, !dbg !31
+  store <1 x i16> %84, ptr addrspace(3) %82, align 2, !dbg !31
+  %85 = xor i32 %74, 864, !dbg !31
+  %86 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %85, !dbg !31
+  %87 = insertelement <1 x i16> poison, i16 %56, i64 0, !dbg !31
+  store <1 x i16> %87, ptr addrspace(3) %86, align 2, !dbg !31
+  %88 = xor i32 %74, 1028, !dbg !31
+  %89 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %88, !dbg !31
+  %90 = trunc i32 %53 to i16, !dbg !31
+  %91 = insertelement <1 x i16> poison, i16 %90, i64 0, !dbg !31
+  store <1 x i16> %91, ptr addrspace(3) %89, align 2, !dbg !31
+  %92 = xor i32 %74, 1316, !dbg !31
+  %93 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %92, !dbg !31
+  %94 = insertelement <1 x i16> poison, i16 %57, i64 0, !dbg !31
+  store <1 x i16> %94, ptr addrspace(3) %93, align 2, !dbg !31
+  %95 = xor i32 %74, 1604, !dbg !31
+  %96 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %95, !dbg !31
+  %97 = trunc i32 %54 to i16, !dbg !31
+  %98 = insertelement <1 x i16> poison, i16 %97, i64 0, !dbg !31
+  store <1 x i16> %98, ptr addrspace(3) %96, align 2, !dbg !31
+  %99 = xor i32 %74, 1892, !dbg !31
+  %100 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %99, !dbg !31
+  %101 = insertelement <1 x i16> poison, i16 %58, i64 0, !dbg !31
+  store <1 x i16> %101, ptr addrspace(3) %100, align 2, !dbg !31
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !31
+  %102 = and i32 %18, 6, !dbg !31
+  %103 = shl nuw nsw i32 %21, 3, !dbg !31
+  %104 = select i1 %62, i32 0, i32 1028, !dbg !31
+  %105 = mul nuw nsw i32 %102, 144, !dbg !31
+  %106 = xor i32 %105, %19, !dbg !31
+  %107 = or disjoint i32 %104, %106, !dbg !31
+  %108 = or disjoint i32 %107, %103, !dbg !31
+  %109 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %108, !dbg !31
+  %110 = load bfloat, ptr addrspace(3) %109, align 4, !dbg !31
+  %111 = getelementptr inbounds nuw i8, ptr addrspace(3) %109, i32 2, !dbg !31
+  %112 = load bfloat, ptr addrspace(3) %111, align 2, !dbg !31
+  %113 = xor i32 %108, 136, !dbg !31
+  %114 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %113, !dbg !31
+  %115 = load bfloat, ptr addrspace(3) %114, align 4, !dbg !31
+  %116 = getelementptr inbounds nuw i8, ptr addrspace(3) %114, i32 2, !dbg !31
+  %117 = load bfloat, ptr addrspace(3) %116, align 2, !dbg !31
+  %118 = xor i32 %108, 4, !dbg !31
+  %119 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %118, !dbg !31
+  %120 = load bfloat, ptr addrspace(3) %119, align 4, !dbg !31
+  %121 = getelementptr inbounds nuw i8, ptr addrspace(3) %119, i32 2, !dbg !31
+  %122 = load bfloat, ptr addrspace(3) %121, align 2, !dbg !31
+  %123 = xor i32 %108, 140, !dbg !31
+  %124 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %123, !dbg !31
+  %125 = load bfloat, ptr addrspace(3) %124, align 4, !dbg !31
+  %126 = getelementptr inbounds nuw i8, ptr addrspace(3) %124, i32 2, !dbg !31
+  %127 = load bfloat, ptr addrspace(3) %126, align 2, !dbg !31
+  %128 = fpext bfloat %110 to float, !dbg !31
+  %129 = fpext bfloat %115 to float, !dbg !31
+  %130 = fpext bfloat %112 to float, !dbg !31
+  %131 = fpext bfloat %117 to float, !dbg !31
+  %132 = fpext bfloat %120 to float, !dbg !31
+  %133 = fpext bfloat %125 to float, !dbg !31
+  %134 = fpext bfloat %122 to float, !dbg !31
+  %135 = fpext bfloat %127 to float, !dbg !31
+  %136 = sext i32 %24 to i64, !dbg !32
+  %137 = getelementptr float, ptr addrspace(1) %1, i64 %136, !dbg !32
+  %138 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !33
+  %139 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %137, i64 %138, i1 %48) #5, !dbg !33
+  %140 = extractvalue { i32, i32, i32, i32 } %139, 0, !dbg !33
+  %141 = extractvalue { i32, i32, i32, i32 } %139, 1, !dbg !33
+  %142 = extractvalue { i32, i32, i32, i32 } %139, 2, !dbg !33
+  %143 = extractvalue { i32, i32, i32, i32 } %139, 3, !dbg !33
+  %144 = bitcast i32 %140 to float, !dbg !33
+  %145 = bitcast i32 %141 to float, !dbg !33
+  %146 = bitcast i32 %142 to float, !dbg !33
+  %147 = bitcast i32 %143 to float, !dbg !33
+  %148 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !33
+  %149 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %137, i64 %148, i1 %48) #5, !dbg !33
+  %150 = extractvalue { i32, i32, i32, i32 } %149, 0, !dbg !33
+  %151 = extractvalue { i32, i32, i32, i32 } %149, 1, !dbg !33
+  %152 = extractvalue { i32, i32, i32, i32 } %149, 2, !dbg !33
+  %153 = extractvalue { i32, i32, i32, i32 } %149, 3, !dbg !33
+  %154 = bitcast i32 %150 to float, !dbg !33
+  %155 = bitcast i32 %151 to float, !dbg !33
+  %156 = bitcast i32 %152 to float, !dbg !33
+  %157 = bitcast i32 %153 to float, !dbg !33
+  %158 = tail call float @llvm.nvvm.div.full(float %144, float 1.280000e+02), !dbg !34
+  %159 = tail call float @llvm.nvvm.div.full(float %145, float 1.280000e+02), !dbg !34
+  %160 = tail call float @llvm.nvvm.div.full(float %146, float 1.280000e+02), !dbg !34
+  %161 = tail call float @llvm.nvvm.div.full(float %147, float 1.280000e+02), !dbg !34
+  %162 = tail call float @llvm.nvvm.div.full(float %154, float 1.280000e+02), !dbg !34
+  %163 = tail call float @llvm.nvvm.div.full(float %155, float 1.280000e+02), !dbg !34
+  %164 = tail call float @llvm.nvvm.div.full(float %156, float 1.280000e+02), !dbg !34
+  %165 = tail call float @llvm.nvvm.div.full(float %157, float 1.280000e+02), !dbg !34
+  %166 = fadd float %158, 0x3EB0C6F7A0000000, !dbg !35
+  %167 = fadd float %159, 0x3EB0C6F7A0000000, !dbg !35
+  %168 = fadd float %160, 0x3EB0C6F7A0000000, !dbg !35
+  %169 = fadd float %161, 0x3EB0C6F7A0000000, !dbg !35
+  %170 = fadd float %162, 0x3EB0C6F7A0000000, !dbg !35
+  %171 = fadd float %163, 0x3EB0C6F7A0000000, !dbg !35
+  %172 = fadd float %164, 0x3EB0C6F7A0000000, !dbg !35
+  %173 = fadd float %165, 0x3EB0C6F7A0000000, !dbg !35
+  %174 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36
+  %.not.i = icmp eq i32 %174, 0, !dbg !36
+  br i1 %.not.i, label %177, label %175, !dbg !36
+
+175:                                              ; preds = %11
+  %176 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %166), !dbg !36
+  br label %__nv_rsqrtf.exit, !dbg !36
+
+177:                                              ; preds = %11
+  %178 = tail call float @llvm.nvvm.rsqrt.approx.f(float %166), !dbg !36
+  br label %__nv_rsqrtf.exit, !dbg !36
+
+__nv_rsqrtf.exit:                                 ; preds = %175, %177
+  %.0.i = phi float [ %176, %175 ], [ %178, %177 ], !dbg !36
+  %179 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36
+  %.not.i27 = icmp eq i32 %179, 0, !dbg !36
+  br i1 %.not.i27, label %182, label %180, !dbg !36
+
+180:                                              ; preds = %__nv_rsqrtf.exit
+  %181 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %167), !dbg !36
+  br label %__nv_rsqrtf.exit29, !dbg !36
+
+182:                                              ; preds = %__nv_rsqrtf.exit
+  %183 = tail call float @llvm.nvvm.rsqrt.approx.f(float %167), !dbg !36
+  br label %__nv_rsqrtf.exit29, !dbg !36
+
+__nv_rsqrtf.exit29:                               ; preds = %180, %182
+  %.0.i28 = phi float [ %181, %180 ], [ %183, %182 ], !dbg !36
+  %184 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36
+  %.not.i30 = icmp eq i32 %184, 0, !dbg !36
+  br i1 %.not.i30, label %187, label %185, !dbg !36
+
+185:                                              ; preds = %__nv_rsqrtf.exit29
+  %186 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %168), !dbg !36
+  br label %__nv_rsqrtf.exit32, !dbg !36
+
+187:                                              ; preds = %__nv_rsqrtf.exit29
+  %188 = tail call float @llvm.nvvm.rsqrt.approx.f(float %168), !dbg !36
+  br label %__nv_rsqrtf.exit32, !dbg !36
+
+__nv_rsqrtf.exit32:                               ; preds = %185, %187
+  %.0.i31 = phi float [ %186, %185 ], [ %188, %187 ], !dbg !36
+  %189 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36
+  %.not.i33 = icmp eq i32 %189, 0, !dbg !36
+  br i1 %.not.i33, label %192, label %190, !dbg !36
+
+190:                                              ; preds = %__nv_rsqrtf.exit32
+  %191 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %169), !dbg !36
+  br label %__nv_rsqrtf.exit35, !dbg !36
+
+192:                                              ; preds = %__nv_rsqrtf.exit32
+  %193 = tail call float @llvm.nvvm.rsqrt.approx.f(float %169), !dbg !36
+  br label %__nv_rsqrtf.exit35, !dbg !36
+
+__nv_rsqrtf.exit35:                               ; preds = %190, %192
+  %.0.i34 = phi float [ %191, %190 ], [ %193, %192 ], !dbg !36
+  %194 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36
+  %.not.i36 = icmp eq i32 %194, 0, !dbg !36
+  br i1 %.not.i36, label %197, label %195, !dbg !36
+
+195:                                              ; preds = %__nv_rsqrtf.exit35
+  %196 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %170), !dbg !36
+  br label %__nv_rsqrtf.exit38, !dbg !36
+
+197:                                              ; preds = %__nv_rsqrtf.exit35
+  %198 = tail call float @llvm.nvvm.rsqrt.approx.f(float %170), !dbg !36
+  br label %__nv_rsqrtf.exit38, !dbg !36
+
+__nv_rsqrtf.exit38:                               ; preds = %195, %197
+  %.0.i37 = phi float [ %196, %195 ], [ %198, %197 ], !dbg !36
+  %199 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36
+  %.not.i39 = icmp eq i32 %199, 0, !dbg !36
+  br i1 %.not.i39, label %202, label %200, !dbg !36
+
+200:                                              ; preds = %__nv_rsqrtf.exit38
+  %201 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %171), !dbg !36
+  br label %__nv_rsqrtf.exit41, !dbg !36
+
+202:                                              ; preds = %__nv_rsqrtf.exit38
+  %203 = tail call float @llvm.nvvm.rsqrt.approx.f(float %171), !dbg !36
+  br label %__nv_rsqrtf.exit41, !dbg !36
+
+__nv_rsqrtf.exit41:                               ; preds = %200, %202
+  %.0.i40 = phi float [ %201, %200 ], [ %203, %202 ], !dbg !36
+  %204 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36
+  %.not.i42 = icmp eq i32 %204, 0, !dbg !36
+  br i1 %.not.i42, label %207, label %205, !dbg !36
+
+205:                                              ; preds = %__nv_rsqrtf.exit41
+  %206 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %172), !dbg !36
+  br label %__nv_rsqrtf.exit44, !dbg !36
+
+207:                                              ; preds = %__nv_rsqrtf.exit41
+  %208 = tail call float @llvm.nvvm.rsqrt.approx.f(float %172), !dbg !36
+  br label %__nv_rsqrtf.exit44, !dbg !36
+
+__nv_rsqrtf.exit44:                               ; preds = %205, %207
+  %.0.i43 = phi float [ %206, %205 ], [ %208, %207 ], !dbg !36
+  %209 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36
+  %.not.i45 = icmp eq i32 %209, 0, !dbg !36
+  br i1 %.not.i45, label %212, label %210, !dbg !36
+
+210:                                              ; preds = %__nv_rsqrtf.exit44
+  %211 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %173), !dbg !36
+  br label %__nv_rsqrtf.exit47, !dbg !36
+
+212:                                              ; preds = %__nv_rsqrtf.exit44
+  %213 = tail call float @llvm.nvvm.rsqrt.approx.f(float %173), !dbg !36
+  br label %__nv_rsqrtf.exit47, !dbg !36
+
+__nv_rsqrtf.exit47:                               ; preds = %210, %212
+  %.0.i46 = phi float [ %211, %210 ], [ %213, %212 ], !dbg !36
+  %214 = fmul float %.0.i, %128, !dbg !37
+  %215 = fmul float %.0.i28, %129, !dbg !37
+  %216 = fmul float %.0.i31, %130, !dbg !37
+  %217 = fmul float %.0.i34, %131, !dbg !37
+  %218 = fmul float %.0.i37, %132, !dbg !37
+  %219 = fmul float %.0.i40, %133, !dbg !37
+  %220 = fmul float %.0.i43, %134, !dbg !37
+  %221 = fmul float %.0.i46, %135, !dbg !37
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37
+  %222 = select i1 %62, i32 0, i32 2052, !dbg !37
+  %223 = mul nuw nsw i32 %102, 272, !dbg !37
+  %224 = xor i32 %223, %19, !dbg !37
+  %225 = or disjoint i32 %222, %224, !dbg !37
+  %226 = or disjoint i32 %225, %103, !dbg !37
+  %227 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %226, !dbg !37
+  store float %214, ptr addrspace(3) %227, align 4, !dbg !37
+  %228 = getelementptr inbounds nuw i8, ptr addrspace(3) %227, i32 128, !dbg !37
+  store float %216, ptr addrspace(3) %228, align 4, !dbg !37
+  %229 = xor i32 %226, 264, !dbg !37
+  %230 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %229, !dbg !37
+  store float %215, ptr addrspace(3) %230, align 4, !dbg !37
+  %231 = getelementptr inbounds nuw i8, ptr addrspace(3) %230, i32 128, !dbg !37
+  store float %217, ptr addrspace(3) %231, align 4, !dbg !37
+  %232 = xor i32 %226, 4, !dbg !37
+  %233 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %232, !dbg !37
+  store float %218, ptr addrspace(3) %233, align 4, !dbg !37
+  %234 = getelementptr inbounds nuw i8, ptr addrspace(3) %233, i32 128, !dbg !37
+  store float %220, ptr addrspace(3) %234, align 4, !dbg !37
+  %235 = xor i32 %226, 268, !dbg !37
+  %236 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %235, !dbg !37
+  store float %219, ptr addrspace(3) %236, align 4, !dbg !37
+  %237 = getelementptr inbounds nuw i8, ptr addrspace(3) %236, i32 128, !dbg !37
+  store float %221, ptr addrspace(3) %237, align 4, !dbg !37
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37
+  %238 = and i32 %59, 368, !dbg !37
+  %239 = and i32 %29, 12, !dbg !37
+  %240 = shl nuw nsw i32 %67, 2, !dbg !37
+  %241 = or disjoint i32 %238, %239, !dbg !37
+  %242 = xor i32 %241, %70, !dbg !37
+  %243 = or disjoint i32 %242, %240, !dbg !37
+  %244 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %243, !dbg !37
+  %245 = load float, ptr addrspace(3) %244, align 4, !dbg !37
+  %246 = xor i32 %243, 544, !dbg !37
+  %247 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %246, !dbg !37
+  %248 = load float, ptr addrspace(3) %247, align 4, !dbg !37
+  %249 = xor i32 %243, 1088, !dbg !37
+  %250 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %249, !dbg !37
+  %251 = load float, ptr addrspace(3) %250, align 4, !dbg !37
+  %252 = xor i32 %243, 1632, !dbg !37
+  %253 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %252, !dbg !37
+  %254 = load float, ptr addrspace(3) %253, align 4, !dbg !37
+  %255 = xor i32 %243, 2052, !dbg !37
+  %256 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %255, !dbg !37
+  %257 = load float, ptr addrspace(3) %256, align 4, !dbg !37
+  %258 = xor i32 %243, 2596, !dbg !37
+  %259 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %258, !dbg !37
+  %260 = load float, ptr addrspace(3) %259, align 4, !dbg !37
+  %261 = xor i32 %243, 3140, !dbg !37
+  %262 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %261, !dbg !37
+  %263 = load float, ptr addrspace(3) %262, align 4, !dbg !37
+  %264 = xor i32 %243, 3684, !dbg !37
+  %265 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %264, !dbg !37
+  %266 = load float, ptr addrspace(3) %265, align 4, !dbg !37
+  %267 = sext i32 %31 to i64, !dbg !38
+  %268 = getelementptr bfloat, ptr addrspace(1) %2, i64 %267, !dbg !38
+  %269 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !39
+  %270 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %268, i64 %269, i1 %47) #5, !dbg !39
+  %271 = add i32 %44, -3145728, !dbg !40
+  %272 = sext i32 %271 to i64, !dbg !41
+  %273 = getelementptr bfloat, ptr addrspace(1) %3, i64 %272, !dbg !41
+  %274 = add i32 %17, -8192, !dbg !42
+  %275 = icmp ult i32 %274, 65536, !dbg !42
+  %276 = and i1 %33, %275, !dbg !42
+  %277 = and i1 %34, %275, !dbg !42
+  %278 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !43
+  %279 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %273, i64 %278, i1 %276) #5, !dbg !43
+  %280 = extractvalue { i32, i32, i32, i32 } %279, 0, !dbg !43
+  %281 = extractvalue { i32, i32, i32, i32 } %279, 1, !dbg !43
+  %282 = extractvalue { i32, i32, i32, i32 } %279, 2, !dbg !43
+  %283 = extractvalue { i32, i32, i32, i32 } %279, 3, !dbg !43
+  %extelt.offset12 = lshr i32 %280, 16, !dbg !43
+  %284 = trunc nuw i32 %extelt.offset12 to i16, !dbg !43
+  %extelt.offset14 = lshr i32 %281, 16, !dbg !43
+  %285 = trunc nuw i32 %extelt.offset14 to i16, !dbg !43
+  %extelt.offset16 = lshr i32 %282, 16, !dbg !43
+  %286 = trunc nuw i32 %extelt.offset16 to i16, !dbg !43
+  %extelt.offset18 = lshr i32 %283, 16, !dbg !43
+  %287 = trunc nuw i32 %extelt.offset18 to i16, !dbg !43
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !44
+  %288 = trunc i32 %280 to i16, !dbg !44
+  %289 = insertelement <1 x i16> poison, i16 %288, i64 0, !dbg !44
+  store <1 x i16> %289, ptr addrspace(3) %75, align 2, !dbg !44
+  %290 = insertelement <1 x i16> poison, i16 %284, i64 0, !dbg !44
+  store <1 x i16> %290, ptr addrspace(3) %79, align 2, !dbg !44
+  %291 = trunc i32 %281 to i16, !dbg !44
+  %292 = insertelement <1 x i16> poison, i16 %291, i64 0, !dbg !44
+  store <1 x i16> %292, ptr addrspace(3) %82, align 2, !dbg !44
+  %293 = insertelement <1 x i16> poison, i16 %285, i64 0, !dbg !44
+  store <1 x i16> %293, ptr addrspace(3) %86, align 2, !dbg !44
+  %294 = trunc i32 %282 to i16, !dbg !44
+  %295 = insertelement <1 x i16> poison, i16 %294, i64 0, !dbg !44
+  store <1 x i16> %295, ptr addrspace(3) %89, align 2, !dbg !44
+  %296 = insertelement <1 x i16> poison, i16 %286, i64 0, !dbg !44
+  store <1 x i16> %296, ptr addrspace(3) %93, align 2, !dbg !44
+  %297 = trunc i32 %283 to i16, !dbg !44
+  %298 = insertelement <1 x i16> poison, i16 %297, i64 0, !dbg !44
+  store <1 x i16> %298, ptr addrspace(3) %96, align 2, !dbg !44
+  %299 = insertelement <1 x i16> poison, i16 %287, i64 0, !dbg !44
+  store <1 x i16> %299, ptr addrspace(3) %100, align 2, !dbg !44
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !44
+  %300 = load <2 x bfloat>, ptr addrspace(3) %109, align 4, !dbg !44
+  %301 = load <2 x bfloat>, ptr addrspace(3) %114, align 4, !dbg !44
+  %302 = load <2 x bfloat>, ptr addrspace(3) %119, align 4, !dbg !44
+  %303 = load <2 x bfloat>, ptr addrspace(3) %124, align 4, !dbg !44
+  %304 = shl nsw i32 %36, 5, !dbg !45
+  %305 = add nsw i32 %.decomposed72, -8192, !dbg !45
+  %306 = add i32 %305, %304, !dbg !46
+  %307 = sext i32 %306 to i64, !dbg !47
+  %308 = getelementptr float, ptr addrspace(1) %4, i64 %307, !dbg !47
+  %309 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !48
+  %310 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %308, i64 %309, i1 %277) #5, !dbg !48
+  %311 = extractvalue { i32, i32, i32, i32 } %310, 0, !dbg !48
+  %312 = extractvalue { i32, i32, i32, i32 } %310, 1, !dbg !48
+  %313 = extractvalue { i32, i32, i32, i32 } %310, 2, !dbg !48
+  %314 = extractvalue { i32, i32, i32, i32 } %310, 3, !dbg !48
+  %315 = bitcast i32 %311 to float, !dbg !48
+  %316 = bitcast i32 %312 to float, !dbg !48
+  %317 = bitcast i32 %313 to float, !dbg !48
+  %318 = bitcast i32 %314 to float, !dbg !48
+  %319 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !48
+  %320 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %308, i64 %319, i1 %277) #5, !dbg !48
+  %321 = extractvalue { i32, i32, i32, i32 } %320, 0, !dbg !48
+  %322 = extractvalue { i32, i32, i32, i32 } %320, 1, !dbg !48
+  %323 = extractvalue { i32, i32, i32, i32 } %320, 2, !dbg !48
+  %324 = extractvalue { i32, i32, i32, i32 } %320, 3, !dbg !48
+  %325 = bitcast i32 %321 to float, !dbg !48
+  %326 = bitcast i32 %322 to float, !dbg !48
+  %327 = bitcast i32 %323 to float, !dbg !48
+  %328 = bitcast i32 %324 to float, !dbg !48
+  %329 = tail call float @llvm.nvvm.div.full(float %315, float 1.280000e+02), !dbg !49
+  %330 = tail call float @llvm.nvvm.div.full(float %316, float 1.280000e+02), !dbg !49
+  %331 = tail call float @llvm.nvvm.div.full(float %317, float 1.280000e+02), !dbg !49
+  %332 = tail call float @llvm.nvvm.div.full(float %318, float 1.280000e+02), !dbg !49
+  %333 = tail call float @llvm.nvvm.div.full(float %325, float 1.280000e+02), !dbg !49
+  %334 = tail call float @llvm.nvvm.div.full(float %326, float 1.280000e+02), !dbg !49
+  %335 = tail call float @llvm.nvvm.div.full(float %327, float 1.280000e+02), !dbg !49
+  %336 = tail call float @llvm.nvvm.div.full(float %328, float 1.280000e+02), !dbg !49
+  %337 = fadd float %329, 0x3EB0C6F7A0000000, !dbg !50
+  %338 = fadd float %330, 0x3EB0C6F7A0000000, !dbg !50
+  %339 = fadd float %331, 0x3EB0C6F7A0000000, !dbg !50
+  %340 = fadd float %332, 0x3EB0C6F7A0000000, !dbg !50
+  %341 = fadd float %333, 0x3EB0C6F7A0000000, !dbg !50
+  %342 = fadd float %334, 0x3EB0C6F7A0000000, !dbg !50
+  %343 = fadd float %335, 0x3EB0C6F7A0000000, !dbg !50
+  %344 = fadd float %336, 0x3EB0C6F7A0000000, !dbg !50
+  %345 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51
+  %.not.i48 = icmp eq i32 %345, 0, !dbg !51
+  br i1 %.not.i48, label %348, label %346, !dbg !51
+
+346:                                              ; preds = %__nv_rsqrtf.exit47
+  %347 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %337), !dbg !51
+  br label %__nv_rsqrtf.exit50, !dbg !51
+
+348:                                              ; preds = %__nv_rsqrtf.exit47
+  %349 = tail call float @llvm.nvvm.rsqrt.approx.f(float %337), !dbg !51
+  br label %__nv_rsqrtf.exit50, !dbg !51
+
+__nv_rsqrtf.exit50:                               ; preds = %346, %348
+  %.0.i49 = phi float [ %347, %346 ], [ %349, %348 ], !dbg !51
+  %350 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51
+  %.not.i51 = icmp eq i32 %350, 0, !dbg !51
+  br i1 %.not.i51, label %353, label %351, !dbg !51
+
+351:                                              ; preds = %__nv_rsqrtf.exit50
+  %352 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %338), !dbg !51
+  br label %__nv_rsqrtf.exit53, !dbg !51
+
+353:                                              ; preds = %__nv_rsqrtf.exit50
+  %354 = tail call float @llvm.nvvm.rsqrt.approx.f(float %338), !dbg !51
+  br label %__nv_rsqrtf.exit53, !dbg !51
+
+__nv_rsqrtf.exit53:                               ; preds = %351, %353
+  %.0.i52 = phi float [ %352, %351 ], [ %354, %353 ], !dbg !51
+  %355 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51
+  %.not.i54 = icmp eq i32 %355, 0, !dbg !51
+  br i1 %.not.i54, label %358, label %356, !dbg !51
+
+356:                                              ; preds = %__nv_rsqrtf.exit53
+  %357 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %339), !dbg !51
+  br label %__nv_rsqrtf.exit56, !dbg !51
+
+358:                                              ; preds = %__nv_rsqrtf.exit53
+  %359 = tail call float @llvm.nvvm.rsqrt.approx.f(float %339), !dbg !51
+  br label %__nv_rsqrtf.exit56, !dbg !51
+
+__nv_rsqrtf.exit56:                               ; preds = %356, %358
+  %.0.i55 = phi float [ %357, %356 ], [ %359, %358 ], !dbg !51
+  %360 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51
+  %.not.i57 = icmp eq i32 %360, 0, !dbg !51
+  br i1 %.not.i57, label %363, label %361, !dbg !51
+
+361:                                              ; preds = %__nv_rsqrtf.exit56
+  %362 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %340), !dbg !51
+  br label %__nv_rsqrtf.exit59, !dbg !51
+
+363:                                              ; preds = %__nv_rsqrtf.exit56
+  %364 = tail call float @llvm.nvvm.rsqrt.approx.f(float %340), !dbg !51
+  br label %__nv_rsqrtf.exit59, !dbg !51
+
+__nv_rsqrtf.exit59:                               ; preds = %361, %363
+  %.0.i58 = phi float [ %362, %361 ], [ %364, %363 ], !dbg !51
+  %365 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51
+  %.not.i60 = icmp eq i32 %365, 0, !dbg !51
+  br i1 %.not.i60, label %368, label %366, !dbg !51
+
+366:                                              ; preds = %__nv_rsqrtf.exit59
+  %367 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %341), !dbg !51
+  br label %__nv_rsqrtf.exit62, !dbg !51
+
+368:                                              ; preds = %__nv_rsqrtf.exit59
+  %369 = tail call float @llvm.nvvm.rsqrt.approx.f(float %341), !dbg !51
+  br label %__nv_rsqrtf.exit62, !dbg !51
+
+__nv_rsqrtf.exit62:                               ; preds = %366, %368
+  %.0.i61 = phi float [ %367, %366 ], [ %369, %368 ], !dbg !51
+  %370 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51
+  %.not.i63 = icmp eq i32 %370, 0, !dbg !51
+  br i1 %.not.i63, label %373, label %371, !dbg !51
+
+371:                                              ; preds = %__nv_rsqrtf.exit62
+  %372 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %342), !dbg !51
+  br label %__nv_rsqrtf.exit65, !dbg !51
+
+373:                                              ; preds = %__nv_rsqrtf.exit62
+  %374 = tail call float @llvm.nvvm.rsqrt.approx.f(float %342), !dbg !51
+  br label %__nv_rsqrtf.exit65, !dbg !51
+
+__nv_rsqrtf.exit65:                               ; preds = %371, %373
+  %.0.i64 = phi float [ %372, %371 ], [ %374, %373 ], !dbg !51
+  %375 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51
+  %.not.i66 = icmp eq i32 %375, 0, !dbg !51
+  br i1 %.not.i66, label %378, label %376, !dbg !51
+
+376:                                              ; preds = %__nv_rsqrtf.exit65
+  %377 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %343), !dbg !51
+  br label %__nv_rsqrtf.exit68, !dbg !51
+
+378:                                              ; preds = %__nv_rsqrtf.exit65
+  %379 = tail call float @llvm.nvvm.rsqrt.approx.f(float %343), !dbg !51
+  br label %__nv_rsqrtf.exit68, !dbg !51
+
+__nv_rsqrtf.exit68:                               ; preds = %376, %378
+  %.0.i67 = phi float [ %377, %376 ], [ %379, %378 ], !dbg !51
+  %380 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51
+  %.not.i69 = icmp eq i32 %380, 0, !dbg !51
+  br i1 %.not.i69, label %383, label %381, !dbg !51
+
+381:                                              ; preds = %__nv_rsqrtf.exit68
+  %382 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %344), !dbg !51
+  br label %__nv_rsqrtf.exit71, !dbg !51
+
+383:                                              ; preds = %__nv_rsqrtf.exit68
+  %384 = tail call float @llvm.nvvm.rsqrt.approx.f(float %344), !dbg !51
+  br label %__nv_rsqrtf.exit71, !dbg !51
+
+__nv_rsqrtf.exit71:                               ; preds = %381, %383
+  %.0.i70 = phi float [ %382, %381 ], [ %384, %383 ], !dbg !51
+  %385 = extractelement <2 x bfloat> %303, i64 1, !dbg !44
+  %386 = fpext bfloat %385 to float, !dbg !44
+  %387 = extractelement <2 x bfloat> %302, i64 1, !dbg !44
+  %388 = fpext bfloat %387 to float, !dbg !44
+  %389 = extractelement <2 x bfloat> %303, i64 0, !dbg !44
+  %390 = fpext bfloat %389 to float, !dbg !44
+  %391 = extractelement <2 x bfloat> %302, i64 0, !dbg !44
+  %392 = fpext bfloat %391 to float, !dbg !44
+  %393 = extractelement <2 x bfloat> %301, i64 1, !dbg !44
+  %394 = fpext bfloat %393 to float, !dbg !44
+  %395 = extractelement <2 x bfloat> %300, i64 1, !dbg !44
+  %396 = fpext bfloat %395 to float, !dbg !44
+  %397 = extractelement <2 x bfloat> %301, i64 0, !dbg !44
+  %398 = fpext bfloat %397 to float, !dbg !44
+  %399 = extractelement <2 x bfloat> %300, i64 0, !dbg !44
+  %400 = fpext bfloat %399 to float, !dbg !44
+  %401 = extractvalue { i32, i32, i32, i32 } %270, 3, !dbg !39
+  %402 = bitcast i32 %401 to <2 x bfloat>, !dbg !39
+  %403 = extractvalue { i32, i32, i32, i32 } %270, 2, !dbg !39
+  %404 = bitcast i32 %403 to <2 x bfloat>, !dbg !39
+  %405 = extractvalue { i32, i32, i32, i32 } %270, 1, !dbg !39
+  %406 = bitcast i32 %405 to <2 x bfloat>, !dbg !39
+  %407 = extractvalue { i32, i32, i32, i32 } %270, 0, !dbg !39
+  %408 = bitcast i32 %407 to <2 x bfloat>, !dbg !39
+  %409 = icmp slt i32 %23, 73728, !dbg !52
+  %410 = fmul float %.0.i49, %400, !dbg !53
+  %411 = fmul float %.0.i52, %398, !dbg !53
+  %412 = fmul float %.0.i55, %396, !dbg !53
+  %413 = fmul float %.0.i58, %394, !dbg !53
+  %414 = fmul float %.0.i61, %392, !dbg !53
+  %415 = fmul float %.0.i64, %390, !dbg !53
+  %416 = fmul float %.0.i67, %388, !dbg !53
+  %417 = fmul float %.0.i70, %386, !dbg !53
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53
+  store float %410, ptr addrspace(3) %227, align 4, !dbg !53
+  store float %412, ptr addrspace(3) %228, align 4, !dbg !53
+  store float %411, ptr addrspace(3) %230, align 4, !dbg !53
+  store float %413, ptr addrspace(3) %231, align 4, !dbg !53
+  store float %414, ptr addrspace(3) %233, align 4, !dbg !53
+  store float %416, ptr addrspace(3) %234, align 4, !dbg !53
+  store float %415, ptr addrspace(3) %236, align 4, !dbg !53
+  store float %417, ptr addrspace(3) %237, align 4, !dbg !53
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53
+  %418 = load float, ptr addrspace(3) %244, align 4, !dbg !53
+  %419 = load float, ptr addrspace(3) %247, align 4, !dbg !53
+  %420 = load float, ptr addrspace(3) %250, align 4, !dbg !53
+  %421 = load float, ptr addrspace(3) %253, align 4, !dbg !53
+  %422 = load float, ptr addrspace(3) %256, align 4, !dbg !53
+  %423 = load float, ptr addrspace(3) %259, align 4, !dbg !53
+  %424 = load float, ptr addrspace(3) %262, align 4, !dbg !53
+  %425 = load float, ptr addrspace(3) %265, align 4, !dbg !53
+  %426 = getelementptr bfloat, ptr addrspace(1) %5, i64 %267, !dbg !54
+  %427 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !55
+  %428 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %426, i64 %427, i1 %276) #5, !dbg !55
+  %429 = extractvalue { i32, i32, i32, i32 } %428, 0, !dbg !55
+  %430 = bitcast i32 %429 to <2 x bfloat>, !dbg !55
+  %431 = extractvalue { i32, i32, i32, i32 } %428, 1, !dbg !55
+  %432 = bitcast i32 %431 to <2 x bfloat>, !dbg !55
+  %433 = extractvalue { i32, i32, i32, i32 } %428, 2, !dbg !55
+  %434 = bitcast i32 %433 to <2 x bfloat>, !dbg !55
+  %435 = extractvalue { i32, i32, i32, i32 } %428, 3, !dbg !55
+  %436 = bitcast i32 %435 to <2 x bfloat>, !dbg !55
+  %437 = shl i32 %23, 7, !dbg !56
+  %438 = add i32 %437, %31, !dbg !57
+  %439 = sext i32 %438 to i64, !dbg !58
+  %440 = getelementptr bfloat, ptr addrspace(1) %6, i64 %439, !dbg !58
+  %441 = and i1 %33, %409, !dbg !59
+  %442 = fpext <2 x bfloat> %408 to <2 x float>, !dbg !60
+  %443 = insertelement <2 x float> poison, float %245, i64 0, !dbg !61
+  %444 = insertelement <2 x float> %443, float %248, i64 1, !dbg !61
+  %445 = fmul <2 x float> %444, %442, !dbg !61
+  %446 = fpext <2 x bfloat> %430 to <2 x float>, !dbg !62
+  %447 = insertelement <2 x float> poison, float %418, i64 0, !dbg !63
+  %448 = insertelement <2 x float> %447, float %419, i64 1, !dbg !63
+  %449 = fmul <2 x float> %448, %446, !dbg !63
+  %450 = insertelement <2 x i1> poison, i1 %39, i64 0, !dbg !64
+  %451 = shufflevector <2 x i1> %450, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !64
+  %452 = select <2 x i1> %451, <2 x float> %445, <2 x float> %449, !dbg !64
+  %453 = fptrunc <2 x float> %452 to <2 x bfloat>, !dbg !65
+  %454 = fpext <2 x bfloat> %406 to <2 x float>, !dbg !60
+  %455 = insertelement <2 x float> poison, float %251, i64 0, !dbg !61
+  %456 = insertelement <2 x float> %455, float %254, i64 1, !dbg !61
+  %457 = fmul <2 x float> %456, %454, !dbg !61
+  %458 = fpext <2 x bfloat> %432 to <2 x float>, !dbg !62
+  %459 = insertelement <2 x float> poison, float %420, i64 0, !dbg !63
+  %460 = insertelement <2 x float> %459, float %421, i64 1, !dbg !63
+  %461 = fmul <2 x float> %460, %458, !dbg !63
+  %462 = select <2 x i1> %451, <2 x float> %457, <2 x float> %461, !dbg !64
+  %463 = fptrunc <2 x float> %462 to <2 x bfloat>, !dbg !65
+  %464 = fpext <2 x bfloat> %404 to <2 x float>, !dbg !60
+  %465 = insertelement <2 x float> poison, float %257, i64 0, !dbg !61
+  %466 = insertelement <2 x float> %465, float %260, i64 1, !dbg !61
+  %467 = fmul <2 x float> %466, %464, !dbg !61
+  %468 = fpext <2 x bfloat> %434 to <2 x float>, !dbg !62
+  %469 = insertelement <2 x float> poison, float %422, i64 0, !dbg !63
+  %470 = insertelement <2 x float> %469, float %423, i64 1, !dbg !63
+  %471 = fmul <2 x float> %470, %468, !dbg !63
+  %472 = select <2 x i1> %451, <2 x float> %467, <2 x float> %471, !dbg !64
+  %473 = fptrunc <2 x float> %472 to <2 x bfloat>, !dbg !65
+  %474 = fpext <2 x bfloat> %402 to <2 x float>, !dbg !60
+  %475 = insertelement <2 x float> poison, float %263, i64 0, !dbg !61
+  %476 = insertelement <2 x float> %475, float %266, i64 1, !dbg !61
+  %477 = fmul <2 x float> %476, %474, !dbg !61
+  %478 = fpext <2 x bfloat> %436 to <2 x float>, !dbg !62
+  %479 = insertelement <2 x float> poison, float %424, i64 0, !dbg !63
+  %480 = insertelement <2 x float> %479, float %425, i64 1, !dbg !63
+  %481 = fmul <2 x float> %480, %478, !dbg !63
+  %482 = select <2 x i1> %451, <2 x float> %477, <2 x float> %481, !dbg !64
+  %483 = fptrunc <2 x float> %482 to <2 x bfloat>, !dbg !65
+  %484 = bitcast <2 x bfloat> %453 to i32, !dbg !65
+  %485 = bitcast <2 x bfloat> %463 to i32, !dbg !65
+  %486 = bitcast <2 x bfloat> %473 to i32, !dbg !65
+  %487 = bitcast <2 x bfloat> %483 to i32, !dbg !65
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %484, i32 %485, i32 %486, i32 %487, ptr addrspace(1) %440, i1 %441) #5, !dbg !65
+  ret void, !dbg !66
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #3
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #3
+
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind }
+attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!5 = distinct !DISubprogram(name: "triton_poi_fused__fused_rms_norm_cat_view_2", linkageName: "triton_poi_fused__fused_rms_norm_cat_view_2", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 21, column: 29, scope: !5)
+!9 = !DILocation(line: 21, column: 48, scope: !5)
+!10 = !DILocation(line: 21, column: 69, scope: !5)
+!11 = !DILocation(line: 21, column: 53, scope: !5)
+!12 = !DILocation(line: 21, column: 34, scope: !5)
+!13 = !DILocation(line: 21, column: 75, scope: !5)
+!14 = !DILocation(line: 22, column: 44, scope: !5)
+!15 = !DILocation(line: 22, column: 23, scope: !5)
+!16 = !DILocation(line: 24, column: 28, scope: !5)
+!17 = !DILocation(line: 24, column: 33, scope: !5)
+!18 = !DILocation(line: 25, column: 44, scope: !5)
+!19 = !DILocation(line: 25, column: 23, scope: !5)
+!20 = !DILocation(line: 26, column: 21, scope: !5)
+!21 = !DILocation(line: 27, column: 19, scope: !5)
+!22 = !DILocation(line: 29, column: 19, scope: !5)
+!23 = !DILocation(line: 35, column: 18, scope: !5)
+!24 = !DILocation(line: 36, column: 39, scope: !5)
+!25 = !DILocation(line: 36, column: 35, scope: !5)
+!26 = !DILocation(line: 36, column: 51, scope: !5)
+!27 = !DILocation(line: 36, column: 44, scope: !5)
+!28 = !DILocation(line: 36, column: 30, scope: !5)
+!29 = !DILocation(line: 36, column: 64, scope: !5)
+!30 = !DILocation(line: 36, column: 57, scope: !5)
+!31 = !DILocation(line: 36, column: 123, scope: !5)
+!32 = !DILocation(line: 38, column: 30, scope: !5)
+!33 = !DILocation(line: 38, column: 80, scope: !5)
+!34 = !DILocation(line: 40, column: 19, scope: !5)
+!35 = !DILocation(line: 42, column: 19, scope: !5)
+!36 = !DILocation(line: 43, column: 28, scope: !5)
+!37 = !DILocation(line: 44, column: 19, scope: !5)
+!38 = !DILocation(line: 45, column: 31, scope: !5)
+!39 = !DILocation(line: 45, column: 71, scope: !5)
+!40 = !DILocation(line: 54, column: 45, scope: !5)
+!41 = !DILocation(line: 54, column: 31, scope: !5)
+!42 = !DILocation(line: 54, column: 83, scope: !5)
+!43 = !DILocation(line: 54, column: 67, scope: !5)
+!44 = !DILocation(line: 54, column: 134, scope: !5)
+!45 = !DILocation(line: 56, column: 56, scope: !5)
+!46 = !DILocation(line: 56, column: 52, scope: !5)
+!47 = !DILocation(line: 56, column: 31, scope: !5)
+!48 = !DILocation(line: 56, column: 90, scope: !5)
+!49 = !DILocation(line: 58, column: 21, scope: !5)
+!50 = !DILocation(line: 60, column: 20, scope: !5)
+!51 = !DILocation(line: 61, column: 28, scope: !5)
+!52 = !DILocation(line: 23, column: 21, scope: !5)
+!53 = !DILocation(line: 62, column: 20, scope: !5)
+!54 = !DILocation(line: 63, column: 31, scope: !5)
+!55 = !DILocation(line: 63, column: 71, scope: !5)
+!56 = !DILocation(line: 70, column: 34, scope: !5)
+!57 = !DILocation(line: 70, column: 30, scope: !5)
+!58 = !DILocation(line: 70, column: 25, scope: !5)
+!59 = !DILocation(line: 70, column: 54, scope: !5)
+!60 = !DILocation(line: 45, column: 137, scope: !5)
+!61 = !DILocation(line: 47, column: 20, scope: !5)
+!62 = !DILocation(line: 63, column: 138, scope: !5)
+!63 = !DILocation(line: 65, column: 20, scope: !5)
+!64 = !DILocation(line: 0, scope: !5)
+!65 = !DILocation(line: 70, column: 46, scope: !5)
+!66 = !DILocation(line: 70, column: 4, scope: !5)
diff --git a/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.ptx b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..30e6e10414788c65e78dc51c11522a91f063f5a1
--- /dev/null
+++ b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.ptx
@@ -0,0 +1,794 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused__fused_rms_norm_cat_view_2 // -- Begin function triton_poi_fused__fused_rms_norm_cat_view_2
+.extern .shared .align 16 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
+                                        // @triton_poi_fused__fused_rms_norm_cat_view_2
+.visible .entry triton_poi_fused__fused_rms_norm_cat_view_2(
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_1,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_4,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_5,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_6,
+	.param .u32 triton_poi_fused__fused_rms_norm_cat_view_2_param_7,
+	.param .u32 triton_poi_fused__fused_rms_norm_cat_view_2_param_8,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_9,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_10
+)
+.reqntid 128
+{
+	.reg .pred 	%p<12>;
+	.reg .b16 	%rs<33>;
+	.reg .b32 	%r<295>;
+	.reg .b64 	%rd<24>;
+	.loc	1 18 0                          // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:18:0
+
+// %bb.0:                               // %__nv_rsqrtf.exit
+	ld.param.b64 	%rd16, [triton_poi_fused__fused_rms_norm_cat_view_2_param_0];
+	ld.param.b64 	%rd17, [triton_poi_fused__fused_rms_norm_cat_view_2_param_1];
+$L__tmp0:
+	.loc	1 21 29                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:29
+	mov.u32 	%r38, %ctaid.y;
+	ld.param.b64 	%rd18, [triton_poi_fused__fused_rms_norm_cat_view_2_param_2];
+	.loc	1 21 48                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:48
+	mov.u32 	%r39, %ctaid.z;
+	ld.param.b64 	%rd19, [triton_poi_fused__fused_rms_norm_cat_view_2_param_3];
+	.loc	1 21 69                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:69
+	mov.u32 	%r40, %nctaid.y;
+	ld.param.b64 	%rd20, [triton_poi_fused__fused_rms_norm_cat_view_2_param_4];
+	.loc	1 21 34                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:34
+	mad.lo.s32 	%r41, %r39, %r40, %r38;
+	ld.param.b64 	%rd21, [triton_poi_fused__fused_rms_norm_cat_view_2_param_5];
+	.loc	1 21 75                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:75
+	shl.b32 	%r42, %r41, 3;
+	ld.param.b64 	%rd22, [triton_poi_fused__fused_rms_norm_cat_view_2_param_6];
+	.loc	1 22 44                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:44
+	mov.u32 	%r43, %tid.x;
+	and.b32 	%r44, %r43, 112;
+	bfe.u32 	%r45, %r43, 4, 3;
+	and.b32 	%r46, %r43, 1;
+	shl.b32 	%r47, %r46, 2;
+	.loc	1 22 23                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:23
+	or.b32 	%r48, %r42, %r45;
+	or.b32 	%r49, %r42, %r47;
+	.loc	1 24 28                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:24:28
+	mov.u32 	%r50, %ctaid.x;
+	.loc	1 24 33                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:24:33
+	shl.b32 	%r51, %r50, 7;
+	.loc	1 25 44                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:25:44
+	shl.b32 	%r52, %r43, 3;
+	and.b32 	%r53, %r52, 120;
+	shr.u32 	%r54, %r43, 1;
+	bfe.u32 	%r55, %r43, 1, 6;
+	.loc	1 25 23                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:25:23
+	or.b32 	%r56, %r53, %r51;
+	or.b32 	%r57, %r55, %r51;
+	.loc	1 26 21                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:26:21
+	setp.lt.s32 	%p6, %r56, 128;
+	setp.lt.s32 	%p7, %r57, 128;
+	.loc	1 27 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:27:19
+	bfe.s32 	%r58, %r41, 28, 1;
+	shr.u32 	%r59, %r58, 27;
+	add.s32 	%r60, %r48, %r59;
+	shr.u32 	%r61, %r60, 5;
+	.loc	1 29 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:29:19
+	and.b32 	%r62, %r60, 33554400;
+	sub.s32 	%r63, %r48, %r62;
+	.loc	1 35 18                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:35:18
+	setp.lt.s32 	%p8, %r48, 8192;
+	setp.lt.s32 	%p9, %r49, 8192;
+	.loc	1 36 39                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:39
+	shl.b32 	%r64, %r63, 7;
+	.loc	1 36 35                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:35
+	add.s32 	%r65, %r64, %r56;
+	.loc	1 36 44                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:44
+	mad.lo.s32 	%r66, %r61, 12288, %r65;
+	.loc	1 36 30                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:30
+	mad.wide.s32 	%rd1, %r66, 2, %rd16;
+	.loc	1 36 64                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:64
+	and.pred 	%p1, %p6, %p8;
+	and.pred 	%p2, %p7, %p9;
+	.loc	1 36 57                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:57
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r5, 0;
+	// begin inline asm
+	mov.u32 %r1, %r5;
+	mov.u32 %r2, %r5;
+	mov.u32 %r3, %r5;
+	mov.u32 %r4, %r5;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	shr.u32 	%r67, %r1, 16;
+	shr.u32 	%r68, %r2, 16;
+	shr.u32 	%r69, %r3, 16;
+	shr.u32 	%r70, %r4, 16;
+	.loc	1 36 123                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:123
+	shl.b32 	%r71, %r43, 4;
+	and.b32 	%r72, %r71, 112;
+	bfe.s32 	%r73, %r43, 3, 1;
+	and.b32 	%r74, %r43, 8;
+	shr.u32 	%r75, %r74, 1;
+	bfe.s32 	%r76, %r43, 4, 1;
+	and.b32 	%r77, %r76, 136;
+	and.b32 	%r78, %r43, 32;
+	shr.u32 	%r79, %r78, 4;
+	shr.u32 	%r80, %r43, 3;
+	and.b32 	%r81, %r80, 8;
+	or.b32 	%r82, %r75, %r79;
+	or.b32 	%r83, %r77, %r72;
+	xor.b32 	%r84, %r83, %r81;
+	or.b32 	%r85, %r82, %r84;
+	mov.b32 	%r86, global_smem;
+	add.s32 	%r87, %r86, %r85;
+	st.shared.b16 	[%r87], %r1;
+	xor.b32 	%r88, %r85, 32;
+	add.s32 	%r89, %r86, %r88;
+	st.shared.b16 	[%r89+256], %r67;
+	xor.b32 	%r90, %r85, 64;
+	add.s32 	%r91, %r86, %r90;
+	st.shared.b16 	[%r91+512], %r2;
+	xor.b32 	%r92, %r85, 96;
+	add.s32 	%r93, %r86, %r92;
+	st.shared.b16 	[%r93+768], %r68;
+	xor.b32 	%r94, %r85, 4;
+	add.s32 	%r95, %r86, %r94;
+	st.shared.b16 	[%r95+1024], %r3;
+	xor.b32 	%r96, %r85, 36;
+	add.s32 	%r97, %r86, %r96;
+	st.shared.b16 	[%r97+1280], %r69;
+	xor.b32 	%r98, %r85, 68;
+	add.s32 	%r99, %r86, %r98;
+	st.shared.b16 	[%r99+1536], %r4;
+	xor.b32 	%r100, %r85, 100;
+	add.s32 	%r101, %r86, %r100;
+	st.shared.b16 	[%r101+1792], %r70;
+	bar.sync 	0;
+	and.b32 	%r102, %r43, 6;
+	shl.b32 	%r103, %r46, 3;
+	and.b32 	%r104, %r73, 1028;
+	mul.lo.s32 	%r105, %r102, 144;
+	xor.b32 	%r106, %r105, %r44;
+	or.b32 	%r107, %r104, %r106;
+	or.b32 	%r108, %r107, %r103;
+	add.s32 	%r109, %r86, %r108;
+	ld.shared.v2.b16 	{%rs1, %rs2}, [%r109];
+	xor.b32 	%r110, %r108, 136;
+	add.s32 	%r111, %r86, %r110;
+	ld.shared.v2.b16 	{%rs3, %rs4}, [%r111];
+	xor.b32 	%r112, %r108, 4;
+	add.s32 	%r113, %r86, %r112;
+	ld.shared.v2.b16 	{%rs5, %rs6}, [%r113];
+	xor.b32 	%r114, %r108, 140;
+	add.s32 	%r115, %r86, %r114;
+	ld.shared.v2.b16 	{%rs7, %rs8}, [%r115];
+	cvt.f32.bf16 	%r116, %rs1;
+	cvt.f32.bf16 	%r117, %rs3;
+	cvt.f32.bf16 	%r118, %rs2;
+	cvt.f32.bf16 	%r119, %rs4;
+	cvt.f32.bf16 	%r120, %rs5;
+	cvt.f32.bf16 	%r121, %rs7;
+	cvt.f32.bf16 	%r122, %rs6;
+	cvt.f32.bf16 	%r123, %rs8;
+	.loc	1 38 30                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:38:30
+	mad.wide.s32 	%rd3, %r49, 4, %rd17;
+	.loc	1 38 80                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:38:80
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r6, %r5;
+	mov.u32 %r7, %r5;
+	mov.u32 %r8, %r5;
+	mov.u32 %r9, %r5;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd3 + 0 ], %rd4;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd5, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd5, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r10, %r5;
+	mov.u32 %r11, %r5;
+	mov.u32 %r12, %r5;
+	mov.u32 %r13, %r5;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd3 + 0 ], %rd5;
+	// end inline asm
+	mov.b32 	%r124, 0f43000000;
+	.loc	1 40 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:40:19
+	div.full.f32 	%r125, %r6, %r124;
+	div.full.f32 	%r126, %r7, %r124;
+	div.full.f32 	%r127, %r8, %r124;
+	div.full.f32 	%r128, %r9, %r124;
+	div.full.f32 	%r129, %r10, %r124;
+	div.full.f32 	%r130, %r11, %r124;
+	div.full.f32 	%r131, %r12, %r124;
+	div.full.f32 	%r132, %r13, %r124;
+	.loc	1 42 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:42:19
+	add.f32 	%r133, %r125, 0f358637BD;
+	add.f32 	%r134, %r126, 0f358637BD;
+	add.f32 	%r135, %r127, 0f358637BD;
+	add.f32 	%r136, %r128, 0f358637BD;
+	add.f32 	%r137, %r129, 0f358637BD;
+	add.f32 	%r138, %r130, 0f358637BD;
+	add.f32 	%r139, %r131, 0f358637BD;
+	add.f32 	%r140, %r132, 0f358637BD;
+	.loc	1 43 28                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:43:28
+	rsqrt.approx.ftz.f32 	%r141, %r133;
+	rsqrt.approx.ftz.f32 	%r142, %r134;
+	rsqrt.approx.ftz.f32 	%r143, %r135;
+	rsqrt.approx.ftz.f32 	%r144, %r136;
+	rsqrt.approx.ftz.f32 	%r145, %r137;
+	rsqrt.approx.ftz.f32 	%r146, %r138;
+	rsqrt.approx.ftz.f32 	%r147, %r139;
+	rsqrt.approx.ftz.f32 	%r148, %r140;
+	.loc	1 44 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:44:19
+	mul.f32 	%r149, %r141, %r116;
+	mul.f32 	%r150, %r142, %r117;
+	mul.f32 	%r151, %r143, %r118;
+	mul.f32 	%r152, %r144, %r119;
+	mul.f32 	%r153, %r145, %r120;
+	mul.f32 	%r154, %r146, %r121;
+	mul.f32 	%r155, %r147, %r122;
+	mul.f32 	%r156, %r148, %r123;
+	bar.sync 	0;
+	and.b32 	%r157, %r73, 2052;
+	mul.lo.s32 	%r158, %r102, 272;
+	xor.b32 	%r159, %r158, %r44;
+	or.b32 	%r160, %r157, %r159;
+	or.b32 	%r161, %r160, %r103;
+	add.s32 	%r162, %r86, %r161;
+	st.shared.b32 	[%r162], %r149;
+	st.shared.b32 	[%r162+128], %r151;
+	xor.b32 	%r163, %r161, 264;
+	add.s32 	%r164, %r86, %r163;
+	st.shared.b32 	[%r164], %r150;
+	st.shared.b32 	[%r164+128], %r152;
+	xor.b32 	%r165, %r161, 4;
+	add.s32 	%r166, %r86, %r165;
+	st.shared.b32 	[%r166], %r153;
+	st.shared.b32 	[%r166+128], %r155;
+	xor.b32 	%r167, %r161, 268;
+	add.s32 	%r168, %r86, %r167;
+	st.shared.b32 	[%r168], %r154;
+	st.shared.b32 	[%r168+128], %r156;
+	bar.sync 	0;
+	and.b32 	%r169, %r71, 368;
+	and.b32 	%r170, %r54, 12;
+	shl.b32 	%r171, %r78, 2;
+	or.b32 	%r172, %r169, %r170;
+	xor.b32 	%r173, %r172, %r81;
+	or.b32 	%r174, %r173, %r171;
+	add.s32 	%r175, %r86, %r174;
+	ld.shared.b32 	%r176, [%r175];
+	xor.b32 	%r177, %r174, 32;
+	add.s32 	%r178, %r86, %r177;
+	ld.shared.b32 	%r179, [%r178+512];
+	xor.b32 	%r180, %r174, 64;
+	add.s32 	%r181, %r86, %r180;
+	ld.shared.b32 	%r182, [%r181+1024];
+	xor.b32 	%r183, %r174, 96;
+	add.s32 	%r184, %r86, %r183;
+	ld.shared.b32 	%r185, [%r184+1536];
+	xor.b32 	%r186, %r174, 4;
+	add.s32 	%r187, %r86, %r186;
+	ld.shared.b32 	%r188, [%r187+2048];
+	xor.b32 	%r189, %r174, 36;
+	add.s32 	%r190, %r86, %r189;
+	ld.shared.b32 	%r191, [%r190+2560];
+	xor.b32 	%r192, %r174, 68;
+	add.s32 	%r193, %r86, %r192;
+	ld.shared.b32 	%r194, [%r193+3072];
+	xor.b32 	%r195, %r174, 100;
+	add.s32 	%r196, %r86, %r195;
+	ld.shared.b32 	%r197, [%r196+3584];
+	.loc	1 45 31                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:31
+	mul.wide.s32 	%rd23, %r56, 2;
+	add.s64 	%rd6, %rd18, %rd23;
+	.loc	1 45 71                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:71
+	// begin inline asm
+	mov.u64 %rd7, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd7, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r14, %r5;
+	mov.u32 %r15, %r5;
+	mov.u32 %r16, %r5;
+	mov.u32 %r17, %r5;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd6 + 0 ], %rd7;
+	// end inline asm
+	.loc	1 54 45                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:45
+	add.s32 	%r198, %r66, -3145728;
+	.loc	1 54 31                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:31
+	mad.wide.s32 	%rd8, %r198, 2, %rd19;
+	.loc	1 54 83                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:83
+	add.s32 	%r199, %r42, -8192;
+	setp.lt.u32 	%p10, %r199, 65536;
+	and.pred 	%p3, %p6, %p10;
+	and.pred 	%p4, %p7, %p10;
+	.loc	1 54 67                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:67
+	// begin inline asm
+	mov.u64 %rd9, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd9, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r18, %r5;
+	mov.u32 %r19, %r5;
+	mov.u32 %r20, %r5;
+	mov.u32 %r21, %r5;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd8 + 0 ], %rd9;
+	// end inline asm
+	shr.u32 	%r200, %r18, 16;
+	shr.u32 	%r201, %r19, 16;
+	shr.u32 	%r202, %r20, 16;
+	shr.u32 	%r203, %r21, 16;
+	.loc	1 54 134                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134
+	bar.sync 	0;
+	st.shared.b16 	[%r87], %r18;
+	st.shared.b16 	[%r89+256], %r200;
+	st.shared.b16 	[%r91+512], %r19;
+	st.shared.b16 	[%r93+768], %r201;
+	st.shared.b16 	[%r95+1024], %r20;
+	st.shared.b16 	[%r97+1280], %r202;
+	st.shared.b16 	[%r99+1536], %r21;
+	st.shared.b16 	[%r101+1792], %r203;
+	bar.sync 	0;
+	ld.shared.v2.b16 	{%rs9, %rs10}, [%r109];
+	ld.shared.v2.b16 	{%rs11, %rs12}, [%r111];
+	ld.shared.v2.b16 	{%rs13, %rs14}, [%r113];
+	ld.shared.v2.b16 	{%rs15, %rs16}, [%r115];
+	.loc	1 56 52                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:52
+	add.s32 	%r204, %r49, -8192;
+	.loc	1 56 31                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:31
+	mad.wide.s32 	%rd10, %r204, 4, %rd20;
+	.loc	1 56 90                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:90
+	// begin inline asm
+	mov.u64 %rd11, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd11, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r22, %r5;
+	mov.u32 %r23, %r5;
+	mov.u32 %r24, %r5;
+	mov.u32 %r25, %r5;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r22, %r23, %r24, %r25 }, [ %rd10 + 0 ], %rd11;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd12, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd12, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r26, %r5;
+	mov.u32 %r27, %r5;
+	mov.u32 %r28, %r5;
+	mov.u32 %r29, %r5;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd10 + 0 ], %rd12;
+	// end inline asm
+	.loc	1 58 21                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:58:21
+	div.full.f32 	%r205, %r22, %r124;
+	div.full.f32 	%r206, %r23, %r124;
+	div.full.f32 	%r207, %r24, %r124;
+	div.full.f32 	%r208, %r25, %r124;
+	div.full.f32 	%r209, %r26, %r124;
+	div.full.f32 	%r210, %r27, %r124;
+	div.full.f32 	%r211, %r28, %r124;
+	div.full.f32 	%r212, %r29, %r124;
+	.loc	1 60 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:60:20
+	add.f32 	%r213, %r205, 0f358637BD;
+	add.f32 	%r214, %r206, 0f358637BD;
+	add.f32 	%r215, %r207, 0f358637BD;
+	add.f32 	%r216, %r208, 0f358637BD;
+	add.f32 	%r217, %r209, 0f358637BD;
+	add.f32 	%r218, %r210, 0f358637BD;
+	add.f32 	%r219, %r211, 0f358637BD;
+	add.f32 	%r220, %r212, 0f358637BD;
+	.loc	1 61 28                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:61:28
+	rsqrt.approx.ftz.f32 	%r221, %r213;
+	rsqrt.approx.ftz.f32 	%r222, %r214;
+	rsqrt.approx.ftz.f32 	%r223, %r215;
+	rsqrt.approx.ftz.f32 	%r224, %r216;
+	rsqrt.approx.ftz.f32 	%r225, %r217;
+	rsqrt.approx.ftz.f32 	%r226, %r218;
+	rsqrt.approx.ftz.f32 	%r227, %r219;
+	rsqrt.approx.ftz.f32 	%r228, %r220;
+	.loc	1 54 134                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134
+	cvt.f32.bf16 	%r229, %rs16;
+	cvt.f32.bf16 	%r230, %rs14;
+	cvt.f32.bf16 	%r231, %rs15;
+	cvt.f32.bf16 	%r232, %rs13;
+	cvt.f32.bf16 	%r233, %rs12;
+	cvt.f32.bf16 	%r234, %rs10;
+	cvt.f32.bf16 	%r235, %rs11;
+	cvt.f32.bf16 	%r236, %rs9;
+	.loc	1 23 21                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:23:21
+	setp.lt.s32 	%p11, %r48, 73728;
+	.loc	1 62 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:62:20
+	mul.f32 	%r237, %r221, %r236;
+	mul.f32 	%r238, %r222, %r235;
+	mul.f32 	%r239, %r223, %r234;
+	mul.f32 	%r240, %r224, %r233;
+	mul.f32 	%r241, %r225, %r232;
+	mul.f32 	%r242, %r226, %r231;
+	mul.f32 	%r243, %r227, %r230;
+	mul.f32 	%r244, %r228, %r229;
+	bar.sync 	0;
+	st.shared.b32 	[%r162], %r237;
+	st.shared.b32 	[%r162+128], %r239;
+	st.shared.b32 	[%r164], %r238;
+	st.shared.b32 	[%r164+128], %r240;
+	st.shared.b32 	[%r166], %r241;
+	st.shared.b32 	[%r166+128], %r243;
+	st.shared.b32 	[%r168], %r242;
+	st.shared.b32 	[%r168+128], %r244;
+	bar.sync 	0;
+	ld.shared.b32 	%r245, [%r175];
+	ld.shared.b32 	%r246, [%r178+512];
+	ld.shared.b32 	%r247, [%r181+1024];
+	ld.shared.b32 	%r248, [%r184+1536];
+	ld.shared.b32 	%r249, [%r187+2048];
+	ld.shared.b32 	%r250, [%r190+2560];
+	ld.shared.b32 	%r251, [%r193+3072];
+	ld.shared.b32 	%r252, [%r196+3584];
+	.loc	1 63 31                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:31
+	add.s64 	%rd13, %rd21, %rd23;
+	.loc	1 63 71                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:71
+	// begin inline asm
+	mov.u64 %rd14, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd14, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r30, %r5;
+	mov.u32 %r31, %r5;
+	mov.u32 %r32, %r5;
+	mov.u32 %r33, %r5;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r30, %r31, %r32, %r33 }, [ %rd13 + 0 ], %rd14;
+	// end inline asm
+	.loc	1 70 34                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:34
+	shl.b32 	%r253, %r48, 7;
+	.loc	1 70 30                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:30
+	add.s32 	%r254, %r253, %r56;
+	.loc	1 70 25                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:25
+	mad.wide.s32 	%rd15, %r254, 2, %rd22;
+	.loc	1 70 54                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:54
+	and.pred 	%p5, %p6, %p11;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs17, %rs18}, %r14;
+	cvt.f32.bf16 	%r255, %rs17;
+	cvt.f32.bf16 	%r256, %rs18;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r257, %r179, %r256;
+	mul.f32 	%r258, %r176, %r255;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs19, %rs20}, %r30;
+	cvt.f32.bf16 	%r259, %rs19;
+	cvt.f32.bf16 	%r260, %rs20;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r261, %r246, %r260;
+	mul.f32 	%r262, %r245, %r259;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r263, %r258, %r262, %p8;
+	selp.f32 	%r264, %r257, %r261, %p8;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r34, %r264, %r263;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs21, %rs22}, %r15;
+	cvt.f32.bf16 	%r265, %rs21;
+	cvt.f32.bf16 	%r266, %rs22;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r267, %r185, %r266;
+	mul.f32 	%r268, %r182, %r265;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs23, %rs24}, %r31;
+	cvt.f32.bf16 	%r269, %rs23;
+	cvt.f32.bf16 	%r270, %rs24;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r271, %r248, %r270;
+	mul.f32 	%r272, %r247, %r269;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r273, %r268, %r272, %p8;
+	selp.f32 	%r274, %r267, %r271, %p8;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r35, %r274, %r273;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs25, %rs26}, %r16;
+	cvt.f32.bf16 	%r275, %rs25;
+	cvt.f32.bf16 	%r276, %rs26;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r277, %r191, %r276;
+	mul.f32 	%r278, %r188, %r275;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs27, %rs28}, %r32;
+	cvt.f32.bf16 	%r279, %rs27;
+	cvt.f32.bf16 	%r280, %rs28;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r281, %r250, %r280;
+	mul.f32 	%r282, %r249, %r279;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r283, %r278, %r282, %p8;
+	selp.f32 	%r284, %r277, %r281, %p8;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r36, %r284, %r283;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs29, %rs30}, %r17;
+	cvt.f32.bf16 	%r285, %rs29;
+	cvt.f32.bf16 	%r286, %rs30;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r287, %r197, %r286;
+	mul.f32 	%r288, %r194, %r285;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs31, %rs32}, %r33;
+	cvt.f32.bf16 	%r289, %rs31;
+	cvt.f32.bf16 	%r290, %rs32;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r291, %r252, %r290;
+	mul.f32 	%r292, %r251, %r289;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r293, %r288, %r292, %p8;
+	selp.f32 	%r294, %r287, %r291, %p8;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r37, %r294, %r293;
+	// begin inline asm
+	@%p5 st.global.v4.b32 [ %rd15 + 0 ], { %r34, %r35, %r36, %r37 };
+	// end inline asm
+	.loc	1 70 4                          // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 50
+.b8 104
+.b8 105
+.b8 106
+.b8 51
+.b8 104
+.b8 109
+.b8 108
+.b8 111
+.b8 117
+.b8 109
+.b8 120
+.b8 100
+.b8 109
+.b8 104
+.b8 117
+.b8 101
+.b8 122
+.b8 115
+.b8 121
+.b8 104
+.b8 107
+.b8 109
+.b8 110
+.b8 113
+.b8 103
+.b8 110
+.b8 102
+.b8 97
+.b8 53
+.b8 105
+.b8 118
+.b8 114
+.b8 101
+.b8 50
+.b8 55
+.b8 117
+.b8 111
+.b8 115
+.b8 121
+.b8 109
+.b8 97
+.b8 109
+.b8 51
+.b8 100
+.b8 114
+.b8 55
+.b8 97
+.b8 53
+.b8 120
+.b8 98
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 50
+.b8 104
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.source b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.source
new file mode 100644
index 0000000000000000000000000000000000000000..fb33074b29fb0dac06ff56ab95cbae14db960018
--- /dev/null
+++ b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.source
@@ -0,0 +1,415 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0)
+#loc99 = loc("in_ptr0"(#loc))
+#loc100 = loc("in_ptr1"(#loc))
+#loc101 = loc("in_ptr2"(#loc))
+#loc102 = loc("in_ptr3"(#loc))
+#loc103 = loc("in_ptr4"(#loc))
+#loc104 = loc("in_ptr5"(#loc))
+#loc105 = loc("out_ptr0"(#loc))
+#loc106 = loc("ynumel"(#loc))
+#loc107 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %ynumel_0 = arith.constant 73728 : i32 loc(#loc108)
+    %xnumel_1 = arith.constant 128 : i32 loc(#loc109)
+    %yoffset = tt.get_program_id y : i32 loc(#loc110)
+    %yoffset_2 = tt.get_program_id z : i32 loc(#loc111)
+    %yoffset_3 = tt.get_num_programs y : i32 loc(#loc112)
+    %yoffset_4 = arith.muli %yoffset_2, %yoffset_3 : i32 loc(#loc113)
+    %yoffset_5 = arith.addi %yoffset, %yoffset_4 : i32 loc(#loc114)
+    %yoffset_6 = arith.constant 8 : i32 loc(#loc115)
+    %yoffset_7 = arith.constant 8 : i32 loc(#loc115)
+    %yoffset_8 = arith.muli %yoffset_5, %yoffset_7 : i32 loc(#loc115)
+    %yindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc116)
+    %yindex_9 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc117)
+    %yindex_10 = tt.splat %yoffset_8 : i32 -> tensor<8x1xi32> loc(#loc118)
+    %yindex_11 = arith.addi %yindex_10, %yindex_9 : tensor<8x1xi32> loc(#loc118)
+    %ymask = arith.constant dense<73728> : tensor<8x1xi32> loc(#loc119)
+    %ymask_12 = arith.cmpi slt, %yindex_11, %ymask : tensor<8x1xi32> loc(#loc119)
+    %xoffset = tt.get_program_id x : i32 loc(#loc120)
+    %xoffset_13 = arith.constant 128 : i32 loc(#loc121)
+    %xoffset_14 = arith.constant 128 : i32 loc(#loc121)
+    %xoffset_15 = arith.muli %xoffset, %xoffset_14 : i32 loc(#loc121)
+    %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc122)
+    %xindex_16 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc123)
+    %xindex_17 = tt.splat %xoffset_15 : i32 -> tensor<1x128xi32> loc(#loc124)
+    %xindex_18 = arith.addi %xindex_17, %xindex_16 : tensor<1x128xi32> loc(#loc124)
+    %xmask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc125)
+    %xmask_19 = arith.cmpi slt, %xindex_18, %xmask : tensor<1x128xi32> loc(#loc125)
+    %y1 = arith.constant 32 : i32 loc(#loc126)
+    %y1_20 = arith.constant 32 : i32 loc(#loc126)
+    %y1_21 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc126)
+    %y1_22 = arith.divsi %yindex_11, %y1_21 : tensor<8x1xi32> loc(#loc126)
+    %y0 = arith.constant 32 : i32 loc(#loc127)
+    %y0_23 = arith.constant 32 : i32 loc(#loc127)
+    %y0_24 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc127)
+    %y0_25 = arith.remsi %yindex_11, %y0_24 : tensor<8x1xi32> loc(#loc127)
+    %tmp1 = arith.constant 0 : i64 loc(#loc128)
+    %tmp1_26 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc128)
+    %tmp2 = arith.extsi %y1_22 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc129)
+    %tmp2_27 = arith.constant dense<0> : tensor<8x1xi64> loc(#loc129)
+    %tmp2_28 = arith.cmpi sge, %tmp2, %tmp2_27 : tensor<8x1xi64> loc(#loc129)
+    %tmp3 = arith.constant 256 : i64 loc(#loc130)
+    %tmp3_29 = arith.constant dense<256> : tensor<1x1xi64> loc(#loc130)
+    %tmp4 = arith.extsi %y1_22 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc131)
+    %tmp4_30 = arith.constant dense<256> : tensor<8x1xi64> loc(#loc131)
+    %tmp4_31 = arith.cmpi slt, %tmp4, %tmp4_30 : tensor<8x1xi64> loc(#loc131)
+    %tmp5 = arith.constant 128 : i32 loc(#loc132)
+    %tmp5_32 = arith.constant 128 : i32 loc(#loc132)
+    %tmp5_33 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc132)
+    %tmp5_34 = arith.muli %tmp5_33, %y0_25 : tensor<8x1xi32> loc(#loc132)
+    %tmp5_35 = tt.broadcast %xindex_18 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc133)
+    %tmp5_36 = tt.broadcast %tmp5_34 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc133)
+    %tmp5_37 = arith.addi %tmp5_35, %tmp5_36 : tensor<8x128xi32> loc(#loc133)
+    %tmp5_38 = arith.constant 12288 : i32 loc(#loc134)
+    %tmp5_39 = arith.constant 12288 : i32 loc(#loc134)
+    %tmp5_40 = arith.constant dense<12288> : tensor<8x1xi32> loc(#loc134)
+    %tmp5_41 = arith.muli %tmp5_40, %y1_22 : tensor<8x1xi32> loc(#loc134)
+    %tmp5_42 = tt.broadcast %tmp5_41 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc135)
+    %tmp5_43 = arith.addi %tmp5_37, %tmp5_42 : tensor<8x128xi32> loc(#loc135)
+    %tmp5_44 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc136)
+    %tmp5_45 = tt.addptr %tmp5_44, %tmp5_43 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc136)
+    %tmp5_46 = tt.broadcast %tmp4_31 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc137)
+    %tmp5_47 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc137)
+    %tmp5_48 = arith.andi %tmp5_46, %tmp5_47 : tensor<8x128xi1> loc(#loc137)
+    %tmp5_49 = tt.broadcast %ymask_12 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc138)
+    %tmp5_50 = arith.andi %tmp5_48, %tmp5_49 : tensor<8x128xi1> loc(#loc138)
+    %tmp5_51 = arith.constant 0.000000e+00 : f32 loc(#loc139)
+    %tmp5_52 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc139)
+    %tmp5_53 = arith.truncf %tmp5_52 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc139)
+    %tmp5_54 = tt.load %tmp5_45, %tmp5_50, %tmp5_53 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc139)
+    %tmp5_55 = arith.extf %tmp5_54 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc140)
+    %tmp7 = arith.constant 32 : i32 loc(#loc141)
+    %tmp7_56 = arith.constant 32 : i32 loc(#loc141)
+    %tmp7_57 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc141)
+    %tmp7_58 = arith.muli %tmp7_57, %y1_22 : tensor<8x1xi32> loc(#loc141)
+    %tmp7_59 = arith.addi %y0_25, %tmp7_58 : tensor<8x1xi32> loc(#loc142)
+    %tmp7_60 = tt.broadcast %tmp7_59 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc143)
+    %tmp7_61 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<8x128x!tt.ptr<f32>> loc(#loc144)
+    %tmp7_62 = tt.addptr %tmp7_61, %tmp7_60 : tensor<8x128x!tt.ptr<f32>>, tensor<8x128xi32> loc(#loc144)
+    %tmp7_63 = tt.broadcast %tmp4_31 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc145)
+    %tmp7_64 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc145)
+    %tmp7_65 = arith.andi %tmp7_63, %tmp7_64 : tensor<8x128xi1> loc(#loc145)
+    %tmp7_66 = tt.broadcast %ymask_12 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc146)
+    %tmp7_67 = arith.andi %tmp7_65, %tmp7_66 : tensor<8x128xi1> loc(#loc146)
+    %tmp7_68 = arith.constant 0.000000e+00 : f32 loc(#loc147)
+    %tmp7_69 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc147)
+    %tmp7_70 = tt.load %tmp7_62, %tmp7_67, %tmp7_69 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<f32>> loc(#loc147)
+    %tmp8 = arith.constant 1.280000e+02 : f32 loc(#loc148)
+    %tmp9 = arith.constant dense<1.280000e+02> : tensor<8x128xf32> loc(#loc149)
+    %tmp9_71 = arith.divf %tmp7_70, %tmp9 : tensor<8x128xf32> loc(#loc149)
+    %tmp10 = arith.constant 9.99999997E-7 : f32 loc(#loc150)
+    %tmp11 = arith.constant dense<9.99999997E-7> : tensor<8x128xf32> loc(#loc151)
+    %tmp11_72 = arith.addf %tmp9_71, %tmp11 : tensor<8x128xf32> loc(#loc151)
+    %tmp12 = tt.extern_elementwise %tmp11_72 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x128xf32>) -> tensor<8x128xf32> loc(#loc152)
+    %tmp13 = arith.mulf %tmp5_55, %tmp12 : tensor<8x128xf32> loc(#loc153)
+    %tmp14 = tt.broadcast %xindex_18 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc154)
+    %tmp14_73 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc155)
+    %tmp14_74 = tt.addptr %tmp14_73, %tmp14 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc155)
+    %tmp14_75 = tt.broadcast %tmp4_31 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc156)
+    %tmp14_76 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc156)
+    %tmp14_77 = arith.andi %tmp14_75, %tmp14_76 : tensor<8x128xi1> loc(#loc156)
+    %tmp14_78 = tt.broadcast %ymask_12 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc157)
+    %tmp14_79 = arith.andi %tmp14_77, %tmp14_78 : tensor<8x128xi1> loc(#loc157)
+    %tmp14_80 = arith.constant 0.000000e+00 : f32 loc(#loc158)
+    %tmp14_81 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc158)
+    %tmp14_82 = arith.truncf %tmp14_81 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc158)
+    %tmp14_83 = tt.load %tmp14_74, %tmp14_79, %tmp14_82 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc158)
+    %tmp14_84 = arith.extf %tmp14_83 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc159)
+    %tmp16 = arith.mulf %tmp13, %tmp14_84 : tensor<8x128xf32> loc(#loc160)
+    %tmp18 = arith.constant 0.000000e+00 : f32 loc(#loc161)
+    %tmp18_85 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc161)
+    %tmp19 = tt.broadcast %tmp4_31 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc162)
+    %tmp19_86 = arith.select %tmp19, %tmp16, %tmp18_85 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc162)
+    %tmp20 = arith.extsi %y1_22 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc163)
+    %tmp20_87 = arith.constant dense<256> : tensor<8x1xi64> loc(#loc163)
+    %tmp20_88 = arith.cmpi sge, %tmp20, %tmp20_87 : tensor<8x1xi64> loc(#loc163)
+    %tmp21 = arith.constant 2304 : i64 loc(#loc164)
+    %tmp21_89 = arith.constant dense<2304> : tensor<1x1xi64> loc(#loc164)
+    %tmp22 = arith.extsi %y1_22 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc165)
+    %tmp22_90 = arith.constant dense<2304> : tensor<8x1xi64> loc(#loc165)
+    %tmp22_91 = arith.cmpi slt, %tmp22, %tmp22_90 : tensor<8x1xi64> loc(#loc165)
+    %tmp23 = arith.constant 128 : i32 loc(#loc166)
+    %tmp23_92 = arith.constant 128 : i32 loc(#loc166)
+    %tmp23_93 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc166)
+    %tmp23_94 = arith.muli %tmp23_93, %y0_25 : tensor<8x1xi32> loc(#loc166)
+    %tmp23_95 = tt.broadcast %xindex_18 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc167)
+    %tmp23_96 = tt.broadcast %tmp23_94 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc167)
+    %tmp23_97 = arith.addi %tmp23_95, %tmp23_96 : tensor<8x128xi32> loc(#loc167)
+    %tmp23_98 = arith.constant -256 : i32 loc(#loc168)
+    %tmp23_99 = arith.constant -256 : i32 loc(#loc168)
+    %tmp23_100 = arith.constant dense<-256> : tensor<8x1xi32> loc(#loc168)
+    %tmp23_101 = arith.addi %tmp23_100, %y1_22 : tensor<8x1xi32> loc(#loc168)
+    %tmp23_102 = arith.constant 12288 : i32 loc(#loc169)
+    %tmp23_103 = arith.constant 12288 : i32 loc(#loc169)
+    %tmp23_104 = arith.constant dense<12288> : tensor<8x1xi32> loc(#loc169)
+    %tmp23_105 = arith.muli %tmp23_104, %tmp23_101 : tensor<8x1xi32> loc(#loc169)
+    %tmp23_106 = tt.broadcast %tmp23_105 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc170)
+    %tmp23_107 = arith.addi %tmp23_97, %tmp23_106 : tensor<8x128xi32> loc(#loc170)
+    %tmp23_108 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc171)
+    %tmp23_109 = tt.addptr %tmp23_108, %tmp23_107 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc171)
+    %tmp23_110 = tt.broadcast %tmp20_88 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc172)
+    %tmp23_111 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc172)
+    %tmp23_112 = arith.andi %tmp23_110, %tmp23_111 : tensor<8x128xi1> loc(#loc172)
+    %tmp23_113 = tt.broadcast %ymask_12 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc173)
+    %tmp23_114 = arith.andi %tmp23_112, %tmp23_113 : tensor<8x128xi1> loc(#loc173)
+    %tmp23_115 = arith.constant 0.000000e+00 : f32 loc(#loc174)
+    %tmp23_116 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc174)
+    %tmp23_117 = arith.truncf %tmp23_116 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc174)
+    %tmp23_118 = tt.load %tmp23_109, %tmp23_114, %tmp23_117 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc174)
+    %tmp23_119 = arith.extf %tmp23_118 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc175)
+    %tmp25 = arith.constant -256 : i32 loc(#loc176)
+    %tmp25_120 = arith.constant -256 : i32 loc(#loc176)
+    %tmp25_121 = arith.constant dense<-256> : tensor<8x1xi32> loc(#loc176)
+    %tmp25_122 = arith.addi %tmp25_121, %y1_22 : tensor<8x1xi32> loc(#loc176)
+    %tmp25_123 = arith.constant 32 : i32 loc(#loc177)
+    %tmp25_124 = arith.constant 32 : i32 loc(#loc177)
+    %tmp25_125 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc177)
+    %tmp25_126 = arith.muli %tmp25_125, %tmp25_122 : tensor<8x1xi32> loc(#loc177)
+    %tmp25_127 = arith.addi %y0_25, %tmp25_126 : tensor<8x1xi32> loc(#loc178)
+    %tmp25_128 = tt.broadcast %tmp25_127 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc179)
+    %tmp25_129 = tt.splat %in_ptr4 : !tt.ptr<f32> -> tensor<8x128x!tt.ptr<f32>> loc(#loc180)
+    %tmp25_130 = tt.addptr %tmp25_129, %tmp25_128 : tensor<8x128x!tt.ptr<f32>>, tensor<8x128xi32> loc(#loc180)
+    %tmp25_131 = tt.broadcast %tmp20_88 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc181)
+    %tmp25_132 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc181)
+    %tmp25_133 = arith.andi %tmp25_131, %tmp25_132 : tensor<8x128xi1> loc(#loc181)
+    %tmp25_134 = tt.broadcast %ymask_12 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc182)
+    %tmp25_135 = arith.andi %tmp25_133, %tmp25_134 : tensor<8x128xi1> loc(#loc182)
+    %tmp25_136 = arith.constant 0.000000e+00 : f32 loc(#loc183)
+    %tmp25_137 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc183)
+    %tmp25_138 = tt.load %tmp25_130, %tmp25_135, %tmp25_137 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<f32>> loc(#loc183)
+    %tmp26 = arith.constant 1.280000e+02 : f32 loc(#loc184)
+    %tmp27 = arith.constant dense<1.280000e+02> : tensor<8x128xf32> loc(#loc185)
+    %tmp27_139 = arith.divf %tmp25_138, %tmp27 : tensor<8x128xf32> loc(#loc185)
+    %tmp28 = arith.constant 9.99999997E-7 : f32 loc(#loc186)
+    %tmp29 = arith.constant dense<9.99999997E-7> : tensor<8x128xf32> loc(#loc187)
+    %tmp29_140 = arith.addf %tmp27_139, %tmp29 : tensor<8x128xf32> loc(#loc187)
+    %tmp30 = tt.extern_elementwise %tmp29_140 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x128xf32>) -> tensor<8x128xf32> loc(#loc188)
+    %tmp31 = arith.mulf %tmp23_119, %tmp30 : tensor<8x128xf32> loc(#loc189)
+    %tmp32 = tt.broadcast %xindex_18 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc190)
+    %tmp32_141 = tt.splat %in_ptr5 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc191)
+    %tmp32_142 = tt.addptr %tmp32_141, %tmp32 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc191)
+    %tmp32_143 = tt.broadcast %tmp20_88 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc192)
+    %tmp32_144 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc192)
+    %tmp32_145 = arith.andi %tmp32_143, %tmp32_144 : tensor<8x128xi1> loc(#loc192)
+    %tmp32_146 = tt.broadcast %ymask_12 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc193)
+    %tmp32_147 = arith.andi %tmp32_145, %tmp32_146 : tensor<8x128xi1> loc(#loc193)
+    %tmp32_148 = arith.constant 0.000000e+00 : f32 loc(#loc194)
+    %tmp32_149 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc194)
+    %tmp32_150 = arith.truncf %tmp32_149 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc194)
+    %tmp32_151 = tt.load %tmp32_142, %tmp32_147, %tmp32_150 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc194)
+    %tmp32_152 = arith.extf %tmp32_151 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc195)
+    %tmp34 = arith.mulf %tmp31, %tmp32_152 : tensor<8x128xf32> loc(#loc196)
+    %tmp36 = arith.constant 0.000000e+00 : f32 loc(#loc197)
+    %tmp36_153 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc197)
+    %tmp37 = tt.broadcast %tmp20_88 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc198)
+    %tmp37_154 = arith.select %tmp37, %tmp34, %tmp36_153 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc198)
+    %tmp38 = tt.broadcast %tmp4_31 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc199)
+    %tmp38_155 = arith.select %tmp38, %tmp19_86, %tmp37_154 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc199)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc93)
+    %c128_i32_156 = arith.constant 128 : i32 loc(#loc93)
+    %cst = arith.constant dense<128> : tensor<8x1xi32> loc(#loc93)
+    %0 = arith.muli %cst, %yindex_11 : tensor<8x1xi32> loc(#loc93)
+    %1 = tt.broadcast %xindex_18 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc94)
+    %2 = tt.broadcast %0 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc94)
+    %3 = arith.addi %1, %2 : tensor<8x128xi32> loc(#loc94)
+    %4 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc95)
+    %5 = tt.addptr %4, %3 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc95)
+    %6 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc96)
+    %7 = tt.broadcast %ymask_12 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc96)
+    %8 = arith.andi %6, %7 : tensor<8x128xi1> loc(#loc96)
+    %9 = arith.truncf %tmp38_155 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc97)
+    tt.store %5, %9, %8 : tensor<8x128x!tt.ptr<bf16>> loc(#loc97)
+    tt.return loc(#loc98)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":20:13)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:36)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:36)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":32:30)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":33:19)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":34:32)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:60)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:87)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:95)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":39:11)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":41:12)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:51)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:78)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:86)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":49:38)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":52:34)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":53:19)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:40)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:36)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:65)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:70)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:98)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:106)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":57:12)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":59:12)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:51)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:79)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:87)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":67:38)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4)
+#loc108 = loc("ynumel"(#loc1))
+#loc109 = loc("xnumel"(#loc2))
+#loc110 = loc("yoffset"(#loc3))
+#loc111 = loc("yoffset"(#loc4))
+#loc112 = loc("yoffset"(#loc5))
+#loc113 = loc("yoffset"(#loc6))
+#loc114 = loc("yoffset"(#loc7))
+#loc115 = loc("yoffset"(#loc8))
+#loc116 = loc("yindex"(#loc9))
+#loc117 = loc("yindex"(#loc10))
+#loc118 = loc("yindex"(#loc11))
+#loc119 = loc("ymask"(#loc12))
+#loc120 = loc("xoffset"(#loc13))
+#loc121 = loc("xoffset"(#loc14))
+#loc122 = loc("xindex"(#loc15))
+#loc123 = loc("xindex"(#loc16))
+#loc124 = loc("xindex"(#loc17))
+#loc125 = loc("xmask"(#loc18))
+#loc126 = loc("y1"(#loc19))
+#loc127 = loc("y0"(#loc20))
+#loc128 = loc("tmp1"(#loc21))
+#loc129 = loc("tmp2"(#loc22))
+#loc130 = loc("tmp3"(#loc23))
+#loc131 = loc("tmp4"(#loc24))
+#loc132 = loc("tmp5"(#loc25))
+#loc133 = loc("tmp5"(#loc26))
+#loc134 = loc("tmp5"(#loc27))
+#loc135 = loc("tmp5"(#loc28))
+#loc136 = loc("tmp5"(#loc29))
+#loc137 = loc("tmp5"(#loc30))
+#loc138 = loc("tmp5"(#loc31))
+#loc139 = loc("tmp5"(#loc32))
+#loc140 = loc("tmp5"(#loc33))
+#loc141 = loc("tmp7"(#loc34))
+#loc142 = loc("tmp7"(#loc35))
+#loc143 = loc("tmp7"(#loc36))
+#loc144 = loc("tmp7"(#loc37))
+#loc145 = loc("tmp7"(#loc38))
+#loc146 = loc("tmp7"(#loc39))
+#loc147 = loc("tmp7"(#loc40))
+#loc148 = loc("tmp8"(#loc41))
+#loc149 = loc("tmp9"(#loc42))
+#loc150 = loc("tmp10"(#loc43))
+#loc151 = loc("tmp11"(#loc44))
+#loc152 = loc("tmp12"(#loc45))
+#loc153 = loc("tmp13"(#loc46))
+#loc154 = loc("tmp14"(#loc47))
+#loc155 = loc("tmp14"(#loc48))
+#loc156 = loc("tmp14"(#loc49))
+#loc157 = loc("tmp14"(#loc50))
+#loc158 = loc("tmp14"(#loc51))
+#loc159 = loc("tmp14"(#loc52))
+#loc160 = loc("tmp16"(#loc53))
+#loc161 = loc("tmp18"(#loc54))
+#loc162 = loc("tmp19"(#loc55))
+#loc163 = loc("tmp20"(#loc56))
+#loc164 = loc("tmp21"(#loc57))
+#loc165 = loc("tmp22"(#loc58))
+#loc166 = loc("tmp23"(#loc59))
+#loc167 = loc("tmp23"(#loc60))
+#loc168 = loc("tmp23"(#loc61))
+#loc169 = loc("tmp23"(#loc62))
+#loc170 = loc("tmp23"(#loc63))
+#loc171 = loc("tmp23"(#loc64))
+#loc172 = loc("tmp23"(#loc65))
+#loc173 = loc("tmp23"(#loc66))
+#loc174 = loc("tmp23"(#loc67))
+#loc175 = loc("tmp23"(#loc68))
+#loc176 = loc("tmp25"(#loc69))
+#loc177 = loc("tmp25"(#loc70))
+#loc178 = loc("tmp25"(#loc71))
+#loc179 = loc("tmp25"(#loc72))
+#loc180 = loc("tmp25"(#loc73))
+#loc181 = loc("tmp25"(#loc74))
+#loc182 = loc("tmp25"(#loc75))
+#loc183 = loc("tmp25"(#loc76))
+#loc184 = loc("tmp26"(#loc77))
+#loc185 = loc("tmp27"(#loc78))
+#loc186 = loc("tmp28"(#loc79))
+#loc187 = loc("tmp29"(#loc80))
+#loc188 = loc("tmp30"(#loc81))
+#loc189 = loc("tmp31"(#loc82))
+#loc190 = loc("tmp32"(#loc83))
+#loc191 = loc("tmp32"(#loc84))
+#loc192 = loc("tmp32"(#loc85))
+#loc193 = loc("tmp32"(#loc86))
+#loc194 = loc("tmp32"(#loc87))
+#loc195 = loc("tmp32"(#loc88))
+#loc196 = loc("tmp34"(#loc89))
+#loc197 = loc("tmp36"(#loc90))
+#loc198 = loc("tmp37"(#loc91))
+#loc199 = loc("tmp38"(#loc92))
diff --git a/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..f39978e3de144f8a83850e861421a5c781fa263a
--- /dev/null
+++ b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir
@@ -0,0 +1,288 @@
+#blocked = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0)
+#loc70 = loc("in_ptr0"(#loc))
+#loc71 = loc("in_ptr1"(#loc))
+#loc72 = loc("in_ptr2"(#loc))
+#loc73 = loc("in_ptr3"(#loc))
+#loc74 = loc("in_ptr4"(#loc))
+#loc75 = loc("in_ptr5"(#loc))
+#loc76 = loc("out_ptr0"(#loc))
+#loc77 = loc("ynumel"(#loc))
+#loc78 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<-256> : tensor<8x1xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<-256> : tensor<8x1xi32, #blocked1> loc(#loc1)
+    %cst_1 = arith.constant dense<12288> : tensor<8x1xi32, #blocked1> loc(#loc1)
+    %cst_2 = arith.constant dense<128> : tensor<8x1xi32, #blocked1> loc(#loc1)
+    %cst_3 = arith.constant dense<256> : tensor<8x1xi64, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<256> : tensor<8x1xi64, #blocked1> loc(#loc1)
+    %cst_5 = arith.constant dense<32> : tensor<8x1xi32, #blocked> loc(#loc1)
+    %cst_6 = arith.constant dense<32> : tensor<8x1xi32, #blocked1> loc(#loc1)
+    %cst_7 = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1)
+    %cst_8 = arith.constant dense<128> : tensor<1x128xi32, #blocked1> loc(#loc1)
+    %cst_9 = arith.constant dense<73728> : tensor<8x1xi32, #blocked> loc(#loc1)
+    %cst_10 = arith.constant dense<73728> : tensor<8x1xi32, #blocked1> loc(#loc1)
+    %c8_i32 = arith.constant 8 : i32 loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %cst_11 = arith.constant dense<0.000000e+00> : tensor<8x128xbf16, #blocked1> loc(#loc1)
+    %cst_12 = arith.constant dense<0.000000e+00> : tensor<8x128xf32, #blocked> loc(#loc1)
+    %cst_13 = arith.constant dense<9.99999997E-7> : tensor<8x128xf32, #blocked> loc(#loc1)
+    %cst_14 = arith.constant dense<1.280000e+02> : tensor<8x128xf32, #blocked> loc(#loc1)
+    %cst_15 = arith.constant dense<0.000000e+00> : tensor<8x128xf32, #blocked1> loc(#loc1)
+    %yoffset = tt.get_program_id y : i32 loc(#loc79)
+    %yoffset_16 = tt.get_program_id z : i32 loc(#loc80)
+    %yoffset_17 = tt.get_num_programs y : i32 loc(#loc81)
+    %yoffset_18 = arith.muli %yoffset_16, %yoffset_17 : i32 loc(#loc82)
+    %yoffset_19 = arith.addi %yoffset, %yoffset_18 : i32 loc(#loc83)
+    %yoffset_20 = arith.muli %yoffset_19, %c8_i32 : i32 loc(#loc84)
+    %yindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc85)
+    %yindex_21 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc85)
+    %yindex_22 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xi32, #blocked1> loc(#loc85)
+    %yindex_23 = tt.expand_dims %yindex_21 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc85)
+    %yindex_24 = tt.splat %yoffset_20 : i32 -> tensor<8x1xi32, #blocked1> loc(#loc86)
+    %yindex_25 = tt.splat %yoffset_20 : i32 -> tensor<8x1xi32, #blocked> loc(#loc86)
+    %yindex_26 = arith.addi %yindex_24, %yindex_22 : tensor<8x1xi32, #blocked1> loc(#loc86)
+    %yindex_27 = arith.addi %yindex_25, %yindex_23 : tensor<8x1xi32, #blocked> loc(#loc86)
+    %ymask = arith.cmpi slt, %yindex_26, %cst_10 : tensor<8x1xi32, #blocked1> loc(#loc87)
+    %ymask_28 = arith.cmpi slt, %yindex_27, %cst_9 : tensor<8x1xi32, #blocked> loc(#loc87)
+    %xoffset = tt.get_program_id x : i32 loc(#loc88)
+    %xoffset_29 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc89)
+    %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc90)
+    %xindex_30 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc90)
+    %xindex_31 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x128xi32, #blocked1> loc(#loc90)
+    %xindex_32 = tt.expand_dims %xindex_30 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc90)
+    %xindex_33 = tt.splat %xoffset_29 : i32 -> tensor<1x128xi32, #blocked1> loc(#loc91)
+    %xindex_34 = tt.splat %xoffset_29 : i32 -> tensor<1x128xi32, #blocked> loc(#loc91)
+    %xindex_35 = arith.addi %xindex_33, %xindex_31 : tensor<1x128xi32, #blocked1> loc(#loc91)
+    %xindex_36 = arith.addi %xindex_34, %xindex_32 : tensor<1x128xi32, #blocked> loc(#loc91)
+    %xmask = arith.cmpi slt, %xindex_35, %cst_8 : tensor<1x128xi32, #blocked1> loc(#loc92)
+    %xmask_37 = arith.cmpi slt, %xindex_36, %cst_7 : tensor<1x128xi32, #blocked> loc(#loc92)
+    %y1 = arith.divsi %yindex_26, %cst_6 : tensor<8x1xi32, #blocked1> loc(#loc93)
+    %y1_38 = arith.divsi %yindex_27, %cst_5 : tensor<8x1xi32, #blocked> loc(#loc93)
+    %y0 = arith.remsi %yindex_26, %cst_6 : tensor<8x1xi32, #blocked1> loc(#loc94)
+    %y0_39 = arith.remsi %yindex_27, %cst_5 : tensor<8x1xi32, #blocked> loc(#loc94)
+    %tmp4 = arith.extsi %y1 : tensor<8x1xi32, #blocked1> to tensor<8x1xi64, #blocked1> loc(#loc95)
+    %tmp4_40 = arith.extsi %y1_38 : tensor<8x1xi32, #blocked> to tensor<8x1xi64, #blocked> loc(#loc95)
+    %tmp4_41 = arith.cmpi slt, %tmp4, %cst_4 : tensor<8x1xi64, #blocked1> loc(#loc95)
+    %tmp4_42 = arith.cmpi slt, %tmp4_40, %cst_3 : tensor<8x1xi64, #blocked> loc(#loc95)
+    %tmp5 = arith.muli %y0, %cst_2 : tensor<8x1xi32, #blocked1> loc(#loc96)
+    %tmp5_43 = tt.broadcast %xindex_35 : tensor<1x128xi32, #blocked1> -> tensor<8x128xi32, #blocked1> loc(#loc97)
+    %tmp5_44 = tt.broadcast %tmp5 : tensor<8x1xi32, #blocked1> -> tensor<8x128xi32, #blocked1> loc(#loc97)
+    %tmp5_45 = arith.addi %tmp5_43, %tmp5_44 : tensor<8x128xi32, #blocked1> loc(#loc97)
+    %tmp5_46 = arith.muli %y1, %cst_1 : tensor<8x1xi32, #blocked1> loc(#loc98)
+    %tmp5_47 = tt.broadcast %tmp5_46 : tensor<8x1xi32, #blocked1> -> tensor<8x128xi32, #blocked1> loc(#loc99)
+    %tmp5_48 = arith.addi %tmp5_45, %tmp5_47 : tensor<8x128xi32, #blocked1> loc(#loc99)
+    %tmp5_49 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>, #blocked1> loc(#loc100)
+    %tmp5_50 = tt.addptr %tmp5_49, %tmp5_48 : tensor<8x128x!tt.ptr<bf16>, #blocked1>, tensor<8x128xi32, #blocked1> loc(#loc100)
+    %tmp5_51 = tt.broadcast %tmp4_41 : tensor<8x1xi1, #blocked1> -> tensor<8x128xi1, #blocked1> loc(#loc101)
+    %tmp5_52 = tt.broadcast %tmp4_42 : tensor<8x1xi1, #blocked> -> tensor<8x128xi1, #blocked> loc(#loc101)
+    %tmp5_53 = tt.broadcast %xmask : tensor<1x128xi1, #blocked1> -> tensor<8x128xi1, #blocked1> loc(#loc101)
+    %tmp5_54 = tt.broadcast %xmask_37 : tensor<1x128xi1, #blocked> -> tensor<8x128xi1, #blocked> loc(#loc101)
+    %tmp5_55 = arith.andi %tmp5_51, %tmp5_53 : tensor<8x128xi1, #blocked1> loc(#loc101)
+    %tmp5_56 = arith.andi %tmp5_52, %tmp5_54 : tensor<8x128xi1, #blocked> loc(#loc101)
+    %tmp5_57 = tt.broadcast %ymask : tensor<8x1xi1, #blocked1> -> tensor<8x128xi1, #blocked1> loc(#loc102)
+    %tmp5_58 = tt.broadcast %ymask_28 : tensor<8x1xi1, #blocked> -> tensor<8x128xi1, #blocked> loc(#loc102)
+    %tmp5_59 = arith.andi %tmp5_55, %tmp5_57 : tensor<8x128xi1, #blocked1> loc(#loc102)
+    %tmp5_60 = arith.andi %tmp5_56, %tmp5_58 : tensor<8x128xi1, #blocked> loc(#loc102)
+    %tmp5_61 = tt.load %tmp5_50, %tmp5_59, %cst_11 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>, #blocked1> loc(#loc103)
+    %tmp5_62 = ttg.convert_layout %tmp5_61 : tensor<8x128xbf16, #blocked1> -> tensor<8x128xbf16, #blocked> loc(#loc104)
+    %tmp5_63 = arith.extf %tmp5_62 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc104)
+    %tmp7 = arith.muli %y1_38, %cst_5 : tensor<8x1xi32, #blocked> loc(#loc105)
+    %tmp7_64 = arith.addi %y0_39, %tmp7 : tensor<8x1xi32, #blocked> loc(#loc106)
+    %tmp7_65 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<8x1x!tt.ptr<f32>, #blocked> loc(#loc107)
+    %tmp7_66 = tt.addptr %tmp7_65, %tmp7_64 : tensor<8x1x!tt.ptr<f32>, #blocked>, tensor<8x1xi32, #blocked> loc(#loc107)
+    %tmp7_67 = tt.broadcast %tmp7_66 : tensor<8x1x!tt.ptr<f32>, #blocked> -> tensor<8x128x!tt.ptr<f32>, #blocked> loc(#loc107)
+    %tmp7_68 = tt.load %tmp7_67, %tmp5_60, %cst_12 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<f32>, #blocked> loc(#loc108)
+    %tmp9 = arith.divf %tmp7_68, %cst_14 : tensor<8x128xf32, #blocked> loc(#loc109)
+    %tmp11 = arith.addf %tmp9, %cst_13 : tensor<8x128xf32, #blocked> loc(#loc110)
+    %tmp12 = tt.extern_elementwise %tmp11 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x128xf32, #blocked>) -> tensor<8x128xf32, #blocked> loc(#loc111)
+    %tmp13 = arith.mulf %tmp5_63, %tmp12 : tensor<8x128xf32, #blocked> loc(#loc112)
+    %tmp13_69 = ttg.convert_layout %tmp13 : tensor<8x128xf32, #blocked> -> tensor<8x128xf32, #blocked1> loc(#loc112)
+    %tmp14 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>, #blocked1> loc(#loc113)
+    %tmp14_70 = tt.addptr %tmp14, %xindex_35 : tensor<1x128x!tt.ptr<bf16>, #blocked1>, tensor<1x128xi32, #blocked1> loc(#loc113)
+    %tmp14_71 = tt.broadcast %tmp14_70 : tensor<1x128x!tt.ptr<bf16>, #blocked1> -> tensor<8x128x!tt.ptr<bf16>, #blocked1> loc(#loc113)
+    %tmp14_72 = tt.load %tmp14_71, %tmp5_59, %cst_11 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>, #blocked1> loc(#loc114)
+    %tmp14_73 = arith.extf %tmp14_72 : tensor<8x128xbf16, #blocked1> to tensor<8x128xf32, #blocked1> loc(#loc115)
+    %tmp16 = arith.mulf %tmp13_69, %tmp14_73 : tensor<8x128xf32, #blocked1> loc(#loc116)
+    %tmp20 = arith.cmpi sge, %tmp4, %cst_4 : tensor<8x1xi64, #blocked1> loc(#loc117)
+    %tmp20_74 = arith.cmpi sge, %tmp4_40, %cst_3 : tensor<8x1xi64, #blocked> loc(#loc117)
+    %tmp23 = arith.addi %y1, %cst_0 : tensor<8x1xi32, #blocked1> loc(#loc118)
+    %tmp23_75 = arith.addi %y1_38, %cst : tensor<8x1xi32, #blocked> loc(#loc118)
+    %tmp23_76 = arith.muli %tmp23, %cst_1 : tensor<8x1xi32, #blocked1> loc(#loc119)
+    %tmp23_77 = tt.broadcast %tmp23_76 : tensor<8x1xi32, #blocked1> -> tensor<8x128xi32, #blocked1> loc(#loc120)
+    %tmp23_78 = arith.addi %tmp5_45, %tmp23_77 : tensor<8x128xi32, #blocked1> loc(#loc120)
+    %tmp23_79 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>, #blocked1> loc(#loc121)
+    %tmp23_80 = tt.addptr %tmp23_79, %tmp23_78 : tensor<8x128x!tt.ptr<bf16>, #blocked1>, tensor<8x128xi32, #blocked1> loc(#loc121)
+    %tmp23_81 = tt.broadcast %tmp20 : tensor<8x1xi1, #blocked1> -> tensor<8x128xi1, #blocked1> loc(#loc122)
+    %tmp23_82 = tt.broadcast %tmp20_74 : tensor<8x1xi1, #blocked> -> tensor<8x128xi1, #blocked> loc(#loc122)
+    %tmp23_83 = arith.andi %tmp23_81, %tmp5_53 : tensor<8x128xi1, #blocked1> loc(#loc122)
+    %tmp23_84 = arith.andi %tmp23_82, %tmp5_54 : tensor<8x128xi1, #blocked> loc(#loc122)
+    %tmp23_85 = arith.andi %tmp23_83, %tmp5_57 : tensor<8x128xi1, #blocked1> loc(#loc123)
+    %tmp23_86 = arith.andi %tmp23_84, %tmp5_58 : tensor<8x128xi1, #blocked> loc(#loc123)
+    %tmp23_87 = tt.load %tmp23_80, %tmp23_85, %cst_11 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>, #blocked1> loc(#loc124)
+    %tmp23_88 = ttg.convert_layout %tmp23_87 : tensor<8x128xbf16, #blocked1> -> tensor<8x128xbf16, #blocked> loc(#loc125)
+    %tmp23_89 = arith.extf %tmp23_88 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc125)
+    %tmp25 = arith.muli %tmp23_75, %cst_5 : tensor<8x1xi32, #blocked> loc(#loc126)
+    %tmp25_90 = arith.addi %y0_39, %tmp25 : tensor<8x1xi32, #blocked> loc(#loc127)
+    %tmp25_91 = tt.splat %in_ptr4 : !tt.ptr<f32> -> tensor<8x1x!tt.ptr<f32>, #blocked> loc(#loc128)
+    %tmp25_92 = tt.addptr %tmp25_91, %tmp25_90 : tensor<8x1x!tt.ptr<f32>, #blocked>, tensor<8x1xi32, #blocked> loc(#loc128)
+    %tmp25_93 = tt.broadcast %tmp25_92 : tensor<8x1x!tt.ptr<f32>, #blocked> -> tensor<8x128x!tt.ptr<f32>, #blocked> loc(#loc128)
+    %tmp25_94 = tt.load %tmp25_93, %tmp23_86, %cst_12 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<f32>, #blocked> loc(#loc129)
+    %tmp27 = arith.divf %tmp25_94, %cst_14 : tensor<8x128xf32, #blocked> loc(#loc130)
+    %tmp29 = arith.addf %tmp27, %cst_13 : tensor<8x128xf32, #blocked> loc(#loc131)
+    %tmp30 = tt.extern_elementwise %tmp29 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x128xf32, #blocked>) -> tensor<8x128xf32, #blocked> loc(#loc132)
+    %tmp31 = arith.mulf %tmp23_89, %tmp30 : tensor<8x128xf32, #blocked> loc(#loc133)
+    %tmp31_95 = ttg.convert_layout %tmp31 : tensor<8x128xf32, #blocked> -> tensor<8x128xf32, #blocked1> loc(#loc133)
+    %tmp32 = tt.splat %in_ptr5 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>, #blocked1> loc(#loc134)
+    %tmp32_96 = tt.addptr %tmp32, %xindex_35 : tensor<1x128x!tt.ptr<bf16>, #blocked1>, tensor<1x128xi32, #blocked1> loc(#loc134)
+    %tmp32_97 = tt.broadcast %tmp32_96 : tensor<1x128x!tt.ptr<bf16>, #blocked1> -> tensor<8x128x!tt.ptr<bf16>, #blocked1> loc(#loc134)
+    %tmp32_98 = tt.load %tmp32_97, %tmp23_85, %cst_11 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>, #blocked1> loc(#loc135)
+    %tmp32_99 = arith.extf %tmp32_98 : tensor<8x128xbf16, #blocked1> to tensor<8x128xf32, #blocked1> loc(#loc136)
+    %tmp34 = arith.mulf %tmp31_95, %tmp32_99 : tensor<8x128xf32, #blocked1> loc(#loc137)
+    %tmp37 = arith.select %tmp23_81, %tmp34, %cst_15 : tensor<8x128xi1, #blocked1>, tensor<8x128xf32, #blocked1> loc(#loc138)
+    %tmp38 = arith.select %tmp5_51, %tmp16, %tmp37 : tensor<8x128xi1, #blocked1>, tensor<8x128xf32, #blocked1> loc(#loc141)
+    %0 = arith.muli %yindex_26, %cst_2 : tensor<8x1xi32, #blocked1> loc(#loc64)
+    %1 = tt.broadcast %0 : tensor<8x1xi32, #blocked1> -> tensor<8x128xi32, #blocked1> loc(#loc65)
+    %2 = arith.addi %tmp5_43, %1 : tensor<8x128xi32, #blocked1> loc(#loc65)
+    %3 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>, #blocked1> loc(#loc66)
+    %4 = tt.addptr %3, %2 : tensor<8x128x!tt.ptr<bf16>, #blocked1>, tensor<8x128xi32, #blocked1> loc(#loc66)
+    %5 = arith.andi %tmp5_53, %tmp5_57 : tensor<8x128xi1, #blocked1> loc(#loc67)
+    %6 = arith.truncf %tmp38 : tensor<8x128xf32, #blocked1> to tensor<8x128xbf16, #blocked1> loc(#loc68)
+    tt.store %4, %6, %5 : tensor<8x128x!tt.ptr<bf16>, #blocked1> loc(#loc68)
+    tt.return loc(#loc69)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4)
+#loc79 = loc("yoffset"(#loc2))
+#loc80 = loc("yoffset"(#loc3))
+#loc81 = loc("yoffset"(#loc4))
+#loc82 = loc("yoffset"(#loc5))
+#loc83 = loc("yoffset"(#loc6))
+#loc84 = loc("yoffset"(#loc7))
+#loc85 = loc("yindex"(#loc8))
+#loc86 = loc("yindex"(#loc9))
+#loc87 = loc("ymask"(#loc10))
+#loc88 = loc("xoffset"(#loc11))
+#loc89 = loc("xoffset"(#loc12))
+#loc90 = loc("xindex"(#loc13))
+#loc91 = loc("xindex"(#loc14))
+#loc92 = loc("xmask"(#loc15))
+#loc93 = loc("y1"(#loc16))
+#loc94 = loc("y0"(#loc17))
+#loc95 = loc("tmp4"(#loc18))
+#loc96 = loc("tmp5"(#loc19))
+#loc97 = loc("tmp5"(#loc20))
+#loc98 = loc("tmp5"(#loc21))
+#loc99 = loc("tmp5"(#loc22))
+#loc100 = loc("tmp5"(#loc23))
+#loc101 = loc("tmp5"(#loc24))
+#loc102 = loc("tmp5"(#loc25))
+#loc103 = loc("tmp5"(#loc26))
+#loc104 = loc("tmp5"(#loc27))
+#loc105 = loc("tmp7"(#loc28))
+#loc106 = loc("tmp7"(#loc29))
+#loc107 = loc("tmp7"(#loc30))
+#loc108 = loc("tmp7"(#loc31))
+#loc109 = loc("tmp9"(#loc32))
+#loc110 = loc("tmp11"(#loc33))
+#loc111 = loc("tmp12"(#loc34))
+#loc112 = loc("tmp13"(#loc35))
+#loc113 = loc("tmp14"(#loc36))
+#loc114 = loc("tmp14"(#loc37))
+#loc115 = loc("tmp14"(#loc38))
+#loc116 = loc("tmp16"(#loc39))
+#loc117 = loc("tmp20"(#loc40))
+#loc118 = loc("tmp23"(#loc41))
+#loc119 = loc("tmp23"(#loc42))
+#loc120 = loc("tmp23"(#loc43))
+#loc121 = loc("tmp23"(#loc44))
+#loc122 = loc("tmp23"(#loc45))
+#loc123 = loc("tmp23"(#loc46))
+#loc124 = loc("tmp23"(#loc47))
+#loc125 = loc("tmp23"(#loc48))
+#loc126 = loc("tmp25"(#loc49))
+#loc127 = loc("tmp25"(#loc50))
+#loc128 = loc("tmp25"(#loc51))
+#loc129 = loc("tmp25"(#loc52))
+#loc130 = loc("tmp27"(#loc53))
+#loc131 = loc("tmp29"(#loc54))
+#loc132 = loc("tmp30"(#loc55))
+#loc133 = loc("tmp31"(#loc56))
+#loc134 = loc("tmp32"(#loc57))
+#loc135 = loc("tmp32"(#loc58))
+#loc136 = loc("tmp32"(#loc59))
+#loc137 = loc("tmp34"(#loc60))
+#loc138 = loc("tmp37"(#loc61))
+#loc139 = loc("tmp38"(#loc62))
+#loc140 = loc("tmp19"(#loc63))
+#loc141 = loc(fused[#loc139, #loc140])
diff --git a/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.ttir b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..7e341e428627154d59d283ecfe6fae7ca5b98f82
--- /dev/null
+++ b/triton/BMEOK34QH7HISRHLZCWDTEICFHNMYNUUIGVSUVGWE7RQZVRWVZPA/triton_poi_fused__fused_rms_norm_cat_view_2.ttir
@@ -0,0 +1,256 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0)
+#loc72 = loc("in_ptr0"(#loc))
+#loc73 = loc("in_ptr1"(#loc))
+#loc74 = loc("in_ptr2"(#loc))
+#loc75 = loc("in_ptr3"(#loc))
+#loc76 = loc("in_ptr4"(#loc))
+#loc77 = loc("in_ptr5"(#loc))
+#loc78 = loc("out_ptr0"(#loc))
+#loc79 = loc("ynumel"(#loc))
+#loc80 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<8x128xbf16> loc(#loc1)
+    %cst_0 = arith.constant dense<-256> : tensor<8x1xi32> loc(#loc1)
+    %cst_1 = arith.constant dense<9.99999997E-7> : tensor<8x128xf32> loc(#loc1)
+    %cst_2 = arith.constant dense<1.280000e+02> : tensor<8x128xf32> loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<12288> : tensor<8x1xi32> loc(#loc1)
+    %cst_5 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc1)
+    %cst_6 = arith.constant dense<256> : tensor<8x1xi64> loc(#loc1)
+    %cst_7 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc1)
+    %xmask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc81)
+    %ymask = arith.constant dense<73728> : tensor<8x1xi32> loc(#loc82)
+    %c8_i32 = arith.constant 8 : i32 loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %yoffset = tt.get_program_id y : i32 loc(#loc83)
+    %yoffset_8 = tt.get_program_id z : i32 loc(#loc84)
+    %yoffset_9 = tt.get_num_programs y : i32 loc(#loc85)
+    %yoffset_10 = arith.muli %yoffset_8, %yoffset_9 : i32 loc(#loc86)
+    %yoffset_11 = arith.addi %yoffset, %yoffset_10 : i32 loc(#loc87)
+    %yoffset_12 = arith.muli %yoffset_11, %c8_i32 : i32 loc(#loc88)
+    %yindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc89)
+    %yindex_13 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc90)
+    %yindex_14 = tt.splat %yoffset_12 : i32 -> tensor<8x1xi32> loc(#loc91)
+    %yindex_15 = arith.addi %yindex_14, %yindex_13 : tensor<8x1xi32> loc(#loc91)
+    %ymask_16 = arith.cmpi slt, %yindex_15, %ymask : tensor<8x1xi32> loc(#loc82)
+    %xoffset = tt.get_program_id x : i32 loc(#loc92)
+    %xoffset_17 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc93)
+    %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc94)
+    %xindex_18 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc95)
+    %xindex_19 = tt.splat %xoffset_17 : i32 -> tensor<1x128xi32> loc(#loc96)
+    %xindex_20 = arith.addi %xindex_19, %xindex_18 : tensor<1x128xi32> loc(#loc96)
+    %xmask_21 = arith.cmpi slt, %xindex_20, %xmask : tensor<1x128xi32> loc(#loc81)
+    %y1 = arith.divsi %yindex_15, %cst_7 : tensor<8x1xi32> loc(#loc97)
+    %y0 = arith.remsi %yindex_15, %cst_7 : tensor<8x1xi32> loc(#loc98)
+    %tmp4 = arith.extsi %y1 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc99)
+    %tmp4_22 = arith.cmpi slt, %tmp4, %cst_6 : tensor<8x1xi64> loc(#loc99)
+    %tmp5 = arith.muli %y0, %cst_5 : tensor<8x1xi32> loc(#loc100)
+    %tmp5_23 = tt.broadcast %xindex_20 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc101)
+    %tmp5_24 = tt.broadcast %tmp5 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc101)
+    %tmp5_25 = arith.addi %tmp5_23, %tmp5_24 : tensor<8x128xi32> loc(#loc101)
+    %tmp5_26 = arith.muli %y1, %cst_4 : tensor<8x1xi32> loc(#loc102)
+    %tmp5_27 = tt.broadcast %tmp5_26 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc103)
+    %tmp5_28 = arith.addi %tmp5_25, %tmp5_27 : tensor<8x128xi32> loc(#loc103)
+    %tmp5_29 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc104)
+    %tmp5_30 = tt.addptr %tmp5_29, %tmp5_28 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc104)
+    %tmp5_31 = tt.broadcast %tmp4_22 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc105)
+    %tmp5_32 = tt.broadcast %xmask_21 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc105)
+    %tmp5_33 = arith.andi %tmp5_31, %tmp5_32 : tensor<8x128xi1> loc(#loc105)
+    %tmp5_34 = tt.broadcast %ymask_16 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc106)
+    %tmp5_35 = arith.andi %tmp5_33, %tmp5_34 : tensor<8x128xi1> loc(#loc106)
+    %tmp5_36 = tt.load %tmp5_30, %tmp5_35, %cst evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc107)
+    %tmp5_37 = arith.extf %tmp5_36 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc108)
+    %tmp7 = arith.muli %y1, %cst_7 : tensor<8x1xi32> loc(#loc109)
+    %tmp7_38 = arith.addi %y0, %tmp7 : tensor<8x1xi32> loc(#loc110)
+    %tmp7_39 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<8x1x!tt.ptr<f32>> loc(#loc111)
+    %tmp7_40 = tt.addptr %tmp7_39, %tmp7_38 : tensor<8x1x!tt.ptr<f32>>, tensor<8x1xi32> loc(#loc111)
+    %tmp7_41 = tt.broadcast %tmp7_40 : tensor<8x1x!tt.ptr<f32>> -> tensor<8x128x!tt.ptr<f32>> loc(#loc111)
+    %tmp7_42 = tt.load %tmp7_41, %tmp5_35, %cst_3 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<f32>> loc(#loc112)
+    %tmp9 = arith.divf %tmp7_42, %cst_2 : tensor<8x128xf32> loc(#loc113)
+    %tmp11 = arith.addf %tmp9, %cst_1 : tensor<8x128xf32> loc(#loc114)
+    %tmp12 = tt.extern_elementwise %tmp11 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x128xf32>) -> tensor<8x128xf32> loc(#loc115)
+    %tmp13 = arith.mulf %tmp5_37, %tmp12 : tensor<8x128xf32> loc(#loc116)
+    %tmp14 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>> loc(#loc117)
+    %tmp14_43 = tt.addptr %tmp14, %xindex_20 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc117)
+    %tmp14_44 = tt.broadcast %tmp14_43 : tensor<1x128x!tt.ptr<bf16>> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc117)
+    %tmp14_45 = tt.load %tmp14_44, %tmp5_35, %cst evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc118)
+    %tmp14_46 = arith.extf %tmp14_45 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc119)
+    %tmp16 = arith.mulf %tmp13, %tmp14_46 : tensor<8x128xf32> loc(#loc120)
+    %tmp19 = arith.select %tmp5_31, %tmp16, %cst_3 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc121)
+    %tmp20 = arith.cmpi sge, %tmp4, %cst_6 : tensor<8x1xi64> loc(#loc122)
+    %tmp23 = arith.addi %y1, %cst_0 : tensor<8x1xi32> loc(#loc123)
+    %tmp23_47 = arith.muli %tmp23, %cst_4 : tensor<8x1xi32> loc(#loc124)
+    %tmp23_48 = tt.broadcast %tmp23_47 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc125)
+    %tmp23_49 = arith.addi %tmp5_25, %tmp23_48 : tensor<8x128xi32> loc(#loc125)
+    %tmp23_50 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc126)
+    %tmp23_51 = tt.addptr %tmp23_50, %tmp23_49 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc126)
+    %tmp23_52 = tt.broadcast %tmp20 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc127)
+    %tmp23_53 = arith.andi %tmp23_52, %tmp5_32 : tensor<8x128xi1> loc(#loc127)
+    %tmp23_54 = arith.andi %tmp23_53, %tmp5_34 : tensor<8x128xi1> loc(#loc128)
+    %tmp23_55 = tt.load %tmp23_51, %tmp23_54, %cst evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc129)
+    %tmp23_56 = arith.extf %tmp23_55 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc130)
+    %tmp25 = arith.muli %tmp23, %cst_7 : tensor<8x1xi32> loc(#loc131)
+    %tmp25_57 = arith.addi %y0, %tmp25 : tensor<8x1xi32> loc(#loc132)
+    %tmp25_58 = tt.splat %in_ptr4 : !tt.ptr<f32> -> tensor<8x1x!tt.ptr<f32>> loc(#loc133)
+    %tmp25_59 = tt.addptr %tmp25_58, %tmp25_57 : tensor<8x1x!tt.ptr<f32>>, tensor<8x1xi32> loc(#loc133)
+    %tmp25_60 = tt.broadcast %tmp25_59 : tensor<8x1x!tt.ptr<f32>> -> tensor<8x128x!tt.ptr<f32>> loc(#loc133)
+    %tmp25_61 = tt.load %tmp25_60, %tmp23_54, %cst_3 evictionPolicy = evict_last : tensor<8x128x!tt.ptr<f32>> loc(#loc134)
+    %tmp27 = arith.divf %tmp25_61, %cst_2 : tensor<8x128xf32> loc(#loc135)
+    %tmp29 = arith.addf %tmp27, %cst_1 : tensor<8x128xf32> loc(#loc136)
+    %tmp30 = tt.extern_elementwise %tmp29 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x128xf32>) -> tensor<8x128xf32> loc(#loc137)
+    %tmp31 = arith.mulf %tmp23_56, %tmp30 : tensor<8x128xf32> loc(#loc138)
+    %tmp32 = tt.splat %in_ptr5 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>> loc(#loc139)
+    %tmp32_62 = tt.addptr %tmp32, %xindex_20 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc139)
+    %tmp32_63 = tt.broadcast %tmp32_62 : tensor<1x128x!tt.ptr<bf16>> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc139)
+    %tmp32_64 = tt.load %tmp32_63, %tmp23_54, %cst evictionPolicy = evict_last : tensor<8x128x!tt.ptr<bf16>> loc(#loc140)
+    %tmp32_65 = arith.extf %tmp32_64 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc141)
+    %tmp34 = arith.mulf %tmp31, %tmp32_65 : tensor<8x128xf32> loc(#loc142)
+    %tmp37 = arith.select %tmp23_52, %tmp34, %cst_3 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc143)
+    %tmp38 = arith.select %tmp5_31, %tmp19, %tmp37 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc144)
+    %0 = arith.muli %yindex_15, %cst_5 : tensor<8x1xi32> loc(#loc66)
+    %1 = tt.broadcast %0 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc67)
+    %2 = arith.addi %tmp5_23, %1 : tensor<8x128xi32> loc(#loc67)
+    %3 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc68)
+    %4 = tt.addptr %3, %2 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc68)
+    %5 = arith.andi %tmp5_32, %tmp5_34 : tensor<8x128xi1> loc(#loc69)
+    %6 = arith.truncf %tmp38 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc70)
+    tt.store %4, %6, %5 : tensor<8x128x!tt.ptr<bf16>> loc(#loc70)
+    tt.return loc(#loc71)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:36)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:36)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4)
+#loc81 = loc("xmask"(#loc2))
+#loc82 = loc("ymask"(#loc3))
+#loc83 = loc("yoffset"(#loc4))
+#loc84 = loc("yoffset"(#loc5))
+#loc85 = loc("yoffset"(#loc6))
+#loc86 = loc("yoffset"(#loc7))
+#loc87 = loc("yoffset"(#loc8))
+#loc88 = loc("yoffset"(#loc9))
+#loc89 = loc("yindex"(#loc10))
+#loc90 = loc("yindex"(#loc11))
+#loc91 = loc("yindex"(#loc12))
+#loc92 = loc("xoffset"(#loc13))
+#loc93 = loc("xoffset"(#loc14))
+#loc94 = loc("xindex"(#loc15))
+#loc95 = loc("xindex"(#loc16))
+#loc96 = loc("xindex"(#loc17))
+#loc97 = loc("y1"(#loc18))
+#loc98 = loc("y0"(#loc19))
+#loc99 = loc("tmp4"(#loc20))
+#loc100 = loc("tmp5"(#loc21))
+#loc101 = loc("tmp5"(#loc22))
+#loc102 = loc("tmp5"(#loc23))
+#loc103 = loc("tmp5"(#loc24))
+#loc104 = loc("tmp5"(#loc25))
+#loc105 = loc("tmp5"(#loc26))
+#loc106 = loc("tmp5"(#loc27))
+#loc107 = loc("tmp5"(#loc28))
+#loc108 = loc("tmp5"(#loc29))
+#loc109 = loc("tmp7"(#loc30))
+#loc110 = loc("tmp7"(#loc31))
+#loc111 = loc("tmp7"(#loc32))
+#loc112 = loc("tmp7"(#loc33))
+#loc113 = loc("tmp9"(#loc34))
+#loc114 = loc("tmp11"(#loc35))
+#loc115 = loc("tmp12"(#loc36))
+#loc116 = loc("tmp13"(#loc37))
+#loc117 = loc("tmp14"(#loc38))
+#loc118 = loc("tmp14"(#loc39))
+#loc119 = loc("tmp14"(#loc40))
+#loc120 = loc("tmp16"(#loc41))
+#loc121 = loc("tmp19"(#loc42))
+#loc122 = loc("tmp20"(#loc43))
+#loc123 = loc("tmp23"(#loc44))
+#loc124 = loc("tmp23"(#loc45))
+#loc125 = loc("tmp23"(#loc46))
+#loc126 = loc("tmp23"(#loc47))
+#loc127 = loc("tmp23"(#loc48))
+#loc128 = loc("tmp23"(#loc49))
+#loc129 = loc("tmp23"(#loc50))
+#loc130 = loc("tmp23"(#loc51))
+#loc131 = loc("tmp25"(#loc52))
+#loc132 = loc("tmp25"(#loc53))
+#loc133 = loc("tmp25"(#loc54))
+#loc134 = loc("tmp25"(#loc55))
+#loc135 = loc("tmp27"(#loc56))
+#loc136 = loc("tmp29"(#loc57))
+#loc137 = loc("tmp30"(#loc58))
+#loc138 = loc("tmp31"(#loc59))
+#loc139 = loc("tmp32"(#loc60))
+#loc140 = loc("tmp32"(#loc61))
+#loc141 = loc("tmp32"(#loc62))
+#loc142 = loc("tmp34"(#loc63))
+#loc143 = loc("tmp37"(#loc64))
+#loc144 = loc("tmp38"(#loc65))
diff --git a/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..d0f9bf343483e2f266965205e64ea8913a226f37
--- /dev/null
+++ b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json"}}
\ No newline at end of file
diff --git a/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..20b59d0575e6e00352c9a4de76a8d172ad6bc435
Binary files /dev/null and b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin differ
diff --git a/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..022927831500d71d0905d1826f5e5e31120ce875
--- /dev/null
+++ b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
@@ -0,0 +1 @@
+{"hash": "16de0d161145db7e150b8deccb0e64916988ec2fc9f2a88c668ea14096c730dd", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 16384, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0"}
\ No newline at end of file
diff --git a/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..734b3218e9de1a9cf17afd0592ff8759783ac44a
--- /dev/null
+++ b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir
@@ -0,0 +1,1426 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 {
+  %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8
+  %13 = shl i32 %12, 6, !dbg !9
+  %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %15 = and i32 %14, 504, !dbg !10
+  %16 = lshr exact i32 %15, 3, !dbg !10
+  %17 = or disjoint i32 %16, %13, !dbg !11
+  %18 = and i32 %14, 7, !dbg !12
+  %19 = shl nuw nsw i32 %18, 3, !dbg !12
+  %20 = sdiv i32 %17, 32, !dbg !13
+  %21 = shl i32 %17, 7
+  %22 = shl i32 %20, 15
+  %23 = add i32 %22, %21
+  %24 = add i32 %23, 4096
+  %25 = zext nneg i32 %19 to i64, !dbg !14
+  %26 = or disjoint i32 %24, %19, !dbg !15
+  %27 = sext i32 %26 to i64, !dbg !16
+  %28 = getelementptr bfloat, ptr addrspace(1) %2, i64 %27, !dbg !16
+  %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17
+  %30 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %28, i64 %29, i1 true) #6, !dbg !17
+  %31 = extractvalue { i32, i32, i32, i32 } %30, 0, !dbg !17
+  %32 = bitcast i32 %31 to <2 x bfloat>, !dbg !17
+  %33 = extractvalue { i32, i32, i32, i32 } %30, 1, !dbg !17
+  %34 = bitcast i32 %33 to <2 x bfloat>, !dbg !17
+  %35 = extractvalue { i32, i32, i32, i32 } %30, 2, !dbg !17
+  %36 = bitcast i32 %35 to <2 x bfloat>, !dbg !17
+  %37 = extractvalue { i32, i32, i32, i32 } %30, 3, !dbg !17
+  %38 = bitcast i32 %37 to <2 x bfloat>, !dbg !17
+  %39 = extractelement <2 x bfloat> %32, i64 0, !dbg !17
+  %40 = extractelement <2 x bfloat> %32, i64 1, !dbg !17
+  %41 = extractelement <2 x bfloat> %34, i64 0, !dbg !17
+  %42 = extractelement <2 x bfloat> %34, i64 1, !dbg !17
+  %43 = extractelement <2 x bfloat> %36, i64 0, !dbg !17
+  %44 = extractelement <2 x bfloat> %36, i64 1, !dbg !17
+  %45 = extractelement <2 x bfloat> %38, i64 0, !dbg !17
+  %46 = extractelement <2 x bfloat> %38, i64 1, !dbg !17
+  %47 = fpext bfloat %39 to float, !dbg !18
+  %48 = fpext bfloat %40 to float, !dbg !18
+  %49 = fpext bfloat %41 to float, !dbg !18
+  %50 = fpext bfloat %42 to float, !dbg !18
+  %51 = fpext bfloat %43 to float, !dbg !18
+  %52 = fpext bfloat %44 to float, !dbg !18
+  %53 = fpext bfloat %45 to float, !dbg !18
+  %54 = fpext bfloat %46 to float, !dbg !18
+  %55 = or disjoint i32 %23, %19, !dbg !19
+  %56 = sext i32 %55 to i64, !dbg !20
+  %57 = getelementptr bfloat, ptr addrspace(1) %2, i64 %56, !dbg !20
+  %58 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !21
+  %59 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %57, i64 %58, i1 true) #6, !dbg !21
+  %60 = extractvalue { i32, i32, i32, i32 } %59, 0, !dbg !21
+  %61 = bitcast i32 %60 to <2 x bfloat>, !dbg !21
+  %62 = extractvalue { i32, i32, i32, i32 } %59, 1, !dbg !21
+  %63 = bitcast i32 %62 to <2 x bfloat>, !dbg !21
+  %64 = extractvalue { i32, i32, i32, i32 } %59, 2, !dbg !21
+  %65 = bitcast i32 %64 to <2 x bfloat>, !dbg !21
+  %66 = extractvalue { i32, i32, i32, i32 } %59, 3, !dbg !21
+  %67 = bitcast i32 %66 to <2 x bfloat>, !dbg !21
+  %68 = extractelement <2 x bfloat> %61, i64 0, !dbg !21
+  %69 = extractelement <2 x bfloat> %61, i64 1, !dbg !21
+  %70 = extractelement <2 x bfloat> %63, i64 0, !dbg !21
+  %71 = extractelement <2 x bfloat> %63, i64 1, !dbg !21
+  %72 = extractelement <2 x bfloat> %65, i64 0, !dbg !21
+  %73 = extractelement <2 x bfloat> %65, i64 1, !dbg !21
+  %74 = extractelement <2 x bfloat> %67, i64 0, !dbg !21
+  %75 = extractelement <2 x bfloat> %67, i64 1, !dbg !21
+  %76 = fpext bfloat %68 to float, !dbg !22
+  %77 = fpext bfloat %69 to float, !dbg !22
+  %78 = fpext bfloat %70 to float, !dbg !22
+  %79 = fpext bfloat %71 to float, !dbg !22
+  %80 = fpext bfloat %72 to float, !dbg !22
+  %81 = fpext bfloat %73 to float, !dbg !22
+  %82 = fpext bfloat %74 to float, !dbg !22
+  %83 = fpext bfloat %75 to float, !dbg !22
+  %84 = fmul float %47, %47, !dbg !23
+  %85 = fmul float %48, %48, !dbg !23
+  %86 = fmul float %49, %49, !dbg !23
+  %87 = fmul float %50, %50, !dbg !23
+  %88 = fmul float %51, %51, !dbg !23
+  %89 = fmul float %52, %52, !dbg !23
+  %90 = fmul float %53, %53, !dbg !23
+  %91 = fmul float %54, %54, !dbg !23
+  %92 = fmul float %76, %76, !dbg !24
+  %93 = fmul float %77, %77, !dbg !24
+  %94 = fmul float %78, %78, !dbg !24
+  %95 = fmul float %79, %79, !dbg !24
+  %96 = fmul float %80, %80, !dbg !24
+  %97 = fmul float %81, %81, !dbg !24
+  %98 = fmul float %82, %82, !dbg !24
+  %99 = fmul float %83, %83, !dbg !24
+  %100 = or disjoint i32 %19, 64, !dbg !25
+  %101 = or disjoint i32 %24, %100, !dbg !15
+  %102 = sext i32 %101 to i64, !dbg !16
+  %103 = getelementptr bfloat, ptr addrspace(1) %2, i64 %102, !dbg !16
+  %104 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17
+  %105 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %103, i64 %104, i1 true) #6, !dbg !17
+  %106 = extractvalue { i32, i32, i32, i32 } %105, 0, !dbg !17
+  %107 = bitcast i32 %106 to <2 x bfloat>, !dbg !17
+  %108 = extractvalue { i32, i32, i32, i32 } %105, 1, !dbg !17
+  %109 = bitcast i32 %108 to <2 x bfloat>, !dbg !17
+  %110 = extractvalue { i32, i32, i32, i32 } %105, 2, !dbg !17
+  %111 = bitcast i32 %110 to <2 x bfloat>, !dbg !17
+  %112 = extractvalue { i32, i32, i32, i32 } %105, 3, !dbg !17
+  %113 = bitcast i32 %112 to <2 x bfloat>, !dbg !17
+  %114 = extractelement <2 x bfloat> %107, i64 0, !dbg !17
+  %115 = extractelement <2 x bfloat> %107, i64 1, !dbg !17
+  %116 = extractelement <2 x bfloat> %109, i64 0, !dbg !17
+  %117 = extractelement <2 x bfloat> %109, i64 1, !dbg !17
+  %118 = extractelement <2 x bfloat> %111, i64 0, !dbg !17
+  %119 = extractelement <2 x bfloat> %111, i64 1, !dbg !17
+  %120 = extractelement <2 x bfloat> %113, i64 0, !dbg !17
+  %121 = extractelement <2 x bfloat> %113, i64 1, !dbg !17
+  %122 = fpext bfloat %114 to float, !dbg !18
+  %123 = fpext bfloat %115 to float, !dbg !18
+  %124 = fpext bfloat %116 to float, !dbg !18
+  %125 = fpext bfloat %117 to float, !dbg !18
+  %126 = fpext bfloat %118 to float, !dbg !18
+  %127 = fpext bfloat %119 to float, !dbg !18
+  %128 = fpext bfloat %120 to float, !dbg !18
+  %129 = fpext bfloat %121 to float, !dbg !18
+  %130 = or disjoint i32 %23, %100, !dbg !19
+  %131 = sext i32 %130 to i64, !dbg !20
+  %132 = getelementptr bfloat, ptr addrspace(1) %2, i64 %131, !dbg !20
+  %133 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !21
+  %134 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %132, i64 %133, i1 true) #6, !dbg !21
+  %135 = extractvalue { i32, i32, i32, i32 } %134, 0, !dbg !21
+  %136 = bitcast i32 %135 to <2 x bfloat>, !dbg !21
+  %137 = extractvalue { i32, i32, i32, i32 } %134, 1, !dbg !21
+  %138 = bitcast i32 %137 to <2 x bfloat>, !dbg !21
+  %139 = extractvalue { i32, i32, i32, i32 } %134, 2, !dbg !21
+  %140 = bitcast i32 %139 to <2 x bfloat>, !dbg !21
+  %141 = extractvalue { i32, i32, i32, i32 } %134, 3, !dbg !21
+  %142 = bitcast i32 %141 to <2 x bfloat>, !dbg !21
+  %143 = extractelement <2 x bfloat> %136, i64 0, !dbg !21
+  %144 = extractelement <2 x bfloat> %136, i64 1, !dbg !21
+  %145 = extractelement <2 x bfloat> %138, i64 0, !dbg !21
+  %146 = extractelement <2 x bfloat> %138, i64 1, !dbg !21
+  %147 = extractelement <2 x bfloat> %140, i64 0, !dbg !21
+  %148 = extractelement <2 x bfloat> %140, i64 1, !dbg !21
+  %149 = extractelement <2 x bfloat> %142, i64 0, !dbg !21
+  %150 = extractelement <2 x bfloat> %142, i64 1, !dbg !21
+  %151 = fpext bfloat %143 to float, !dbg !22
+  %152 = fpext bfloat %144 to float, !dbg !22
+  %153 = fpext bfloat %145 to float, !dbg !22
+  %154 = fpext bfloat %146 to float, !dbg !22
+  %155 = fpext bfloat %147 to float, !dbg !22
+  %156 = fpext bfloat %148 to float, !dbg !22
+  %157 = fpext bfloat %149 to float, !dbg !22
+  %158 = fpext bfloat %150 to float, !dbg !22
+  %159 = fmul float %122, %122, !dbg !23
+  %160 = fmul float %123, %123, !dbg !23
+  %161 = fmul float %124, %124, !dbg !23
+  %162 = fmul float %125, %125, !dbg !23
+  %163 = fmul float %126, %126, !dbg !23
+  %164 = fmul float %127, %127, !dbg !23
+  %165 = fmul float %128, %128, !dbg !23
+  %166 = fmul float %129, %129, !dbg !23
+  %167 = fadd float %84, %159, !dbg !26
+  %168 = fadd float %85, %160, !dbg !26
+  %169 = fadd float %86, %161, !dbg !26
+  %170 = fadd float %87, %162, !dbg !26
+  %171 = fadd float %88, %163, !dbg !26
+  %172 = fadd float %89, %164, !dbg !26
+  %173 = fadd float %90, %165, !dbg !26
+  %174 = fadd float %91, %166, !dbg !26
+  %175 = fmul float %151, %151, !dbg !24
+  %176 = fmul float %152, %152, !dbg !24
+  %177 = fmul float %153, %153, !dbg !24
+  %178 = fmul float %154, %154, !dbg !24
+  %179 = fmul float %155, %155, !dbg !24
+  %180 = fmul float %156, %156, !dbg !24
+  %181 = fmul float %157, %157, !dbg !24
+  %182 = fmul float %158, %158, !dbg !24
+  %183 = fadd float %92, %175, !dbg !27
+  %184 = fadd float %93, %176, !dbg !27
+  %185 = fadd float %94, %177, !dbg !27
+  %186 = fadd float %95, %178, !dbg !27
+  %187 = fadd float %96, %179, !dbg !27
+  %188 = fadd float %97, %180, !dbg !27
+  %189 = fadd float %98, %181, !dbg !27
+  %190 = fadd float %99, %182, !dbg !27
+  %191 = and i32 %14, 63, !dbg !10
+  %192 = or disjoint i32 %13, %191, !dbg !11
+  %193 = lshr i32 %14, 6, !dbg !12
+  %194 = and i32 %193, 6, !dbg !12
+  %195 = sdiv i32 %192, 32, !dbg !13
+  %196 = fadd float %167, %168, !dbg !28
+  %197 = fadd float %169, %196, !dbg !28
+  %198 = fadd float %170, %197, !dbg !28
+  %199 = fadd float %171, %198, !dbg !28
+  %200 = fadd float %172, %199, !dbg !28
+  %201 = fadd float %173, %200, !dbg !28
+  %202 = fadd float %174, %201, !dbg !28
+  %203 = bitcast float %202 to i32, !dbg !31
+  %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 4, i32 31), !dbg !31
+  %205 = bitcast i32 %204 to float, !dbg !31
+  %206 = fadd float %202, %205, !dbg !28
+  %207 = bitcast float %206 to i32, !dbg !31
+  %208 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %207, i32 2, i32 31), !dbg !31
+  %209 = bitcast i32 %208 to float, !dbg !31
+  %210 = fadd float %206, %209, !dbg !28
+  %211 = bitcast float %210 to i32, !dbg !31
+  %212 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %211, i32 1, i32 31), !dbg !31
+  %213 = bitcast i32 %212 to float, !dbg !31
+  %214 = fadd float %210, %213, !dbg !28
+  %215 = fadd float %183, %184, !dbg !34
+  %216 = fadd float %185, %215, !dbg !34
+  %217 = fadd float %186, %216, !dbg !34
+  %218 = fadd float %187, %217, !dbg !34
+  %219 = fadd float %188, %218, !dbg !34
+  %220 = fadd float %189, %219, !dbg !34
+  %221 = fadd float %190, %220, !dbg !34
+  %222 = bitcast float %221 to i32, !dbg !35
+  %223 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %222, i32 4, i32 31), !dbg !35
+  %224 = bitcast i32 %223 to float, !dbg !35
+  %225 = fadd float %221, %224, !dbg !34
+  %226 = bitcast float %225 to i32, !dbg !35
+  %227 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %226, i32 2, i32 31), !dbg !35
+  %228 = bitcast i32 %227 to float, !dbg !35
+  %229 = fadd float %225, %228, !dbg !34
+  %230 = bitcast float %229 to i32, !dbg !35
+  %231 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %230, i32 1, i32 31), !dbg !35
+  %232 = bitcast i32 %231 to float, !dbg !35
+  %233 = fadd float %229, %232, !dbg !34
+  %234 = shl i32 %20, 7, !dbg !37
+  %235 = tail call float @llvm.nvvm.div.full(float %233, float 1.280000e+02), !dbg !38
+  %236 = fadd float %235, 0x3EB0C6F7A0000000, !dbg !39
+  %237 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40
+  %.not.i = icmp eq i32 %237, 0, !dbg !40
+  br i1 %.not.i, label %240, label %238, !dbg !40
+
+238:                                              ; preds = %11
+  %239 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %236), !dbg !40
+  br label %__nv_rsqrtf.exit, !dbg !40
+
+240:                                              ; preds = %11
+  %241 = tail call float @llvm.nvvm.rsqrt.approx.f(float %236), !dbg !40
+  br label %__nv_rsqrtf.exit, !dbg !40
+
+__nv_rsqrtf.exit:                                 ; preds = %238, %240
+  %.0.i = phi float [ %239, %238 ], [ %241, %240 ], !dbg !40
+  %242 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40
+  %243 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40
+  %244 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40
+  %245 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40
+  %246 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40
+  %247 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40
+  %248 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40
+  %.not.i34 = icmp eq i32 %248, 0, !dbg !40
+  br i1 %.not.i34, label %251, label %249, !dbg !40
+
+249:                                              ; preds = %__nv_rsqrtf.exit
+  %250 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %236), !dbg !40
+  br label %__nv_rsqrtf.exit36, !dbg !40
+
+251:                                              ; preds = %__nv_rsqrtf.exit
+  %252 = tail call float @llvm.nvvm.rsqrt.approx.f(float %236), !dbg !40
+  br label %__nv_rsqrtf.exit36, !dbg !40
+
+__nv_rsqrtf.exit36:                               ; preds = %249, %251
+  %.0.i35 = phi float [ %250, %249 ], [ %252, %251 ], !dbg !40
+  %253 = lshr exact i32 %15, 1, !dbg !41
+  %254 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %253, !dbg !41
+  store float %.0.i, ptr addrspace(3) %254, align 4, !dbg !41
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !41
+  %255 = shl nuw nsw i32 %191, 2, !dbg !41
+  %256 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %255, !dbg !41
+  %257 = load float, ptr addrspace(3) %256, align 4, !dbg !41
+  %258 = tail call float @llvm.nvvm.div.full(float %214, float 1.280000e+02), !dbg !42
+  %259 = fadd float %258, 0x3EB0C6F7A0000000, !dbg !43
+  %260 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
+  %.not.i37 = icmp eq i32 %260, 0, !dbg !44
+  br i1 %.not.i37, label %263, label %261, !dbg !44
+
+261:                                              ; preds = %__nv_rsqrtf.exit36
+  %262 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %259), !dbg !44
+  br label %__nv_rsqrtf.exit39, !dbg !44
+
+263:                                              ; preds = %__nv_rsqrtf.exit36
+  %264 = tail call float @llvm.nvvm.rsqrt.approx.f(float %259), !dbg !44
+  br label %__nv_rsqrtf.exit39, !dbg !44
+
+__nv_rsqrtf.exit39:                               ; preds = %261, %263
+  %.0.i38 = phi float [ %262, %261 ], [ %264, %263 ], !dbg !44
+  %265 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
+  %266 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
+  %267 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
+  %268 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
+  %269 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
+  %270 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
+  %271 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
+  %.not.i58 = icmp eq i32 %271, 0, !dbg !44
+  br i1 %.not.i58, label %274, label %272, !dbg !44
+
+272:                                              ; preds = %__nv_rsqrtf.exit39
+  %273 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %259), !dbg !44
+  br label %__nv_rsqrtf.exit60, !dbg !44
+
+274:                                              ; preds = %__nv_rsqrtf.exit39
+  %275 = tail call float @llvm.nvvm.rsqrt.approx.f(float %259), !dbg !44
+  br label %__nv_rsqrtf.exit60, !dbg !44
+
+__nv_rsqrtf.exit60:                               ; preds = %272, %274
+  %.0.i59 = phi float [ %273, %272 ], [ %275, %274 ], !dbg !44
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45
+  store float %.0.i38, ptr addrspace(3) %254, align 4, !dbg !45
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45
+  %276 = load float, ptr addrspace(3) %256, align 4, !dbg !45
+  %277 = shl i32 %17, 7, !dbg !46
+  %278 = and i32 %193, 1
+  %279 = or disjoint i32 %19, %234
+  %280 = and i32 %14, 224
+  %281 = shl nuw nsw i32 %280, 6
+  %282 = shl nuw nsw i32 %14, 2
+  %283 = and i32 %282, 124
+  %284 = lshr exact i32 %280, 3
+  %285 = lshr i32 %14, 1
+  %286 = and i32 %285, 128
+  %287 = or disjoint i32 %281, %283
+  %288 = xor i32 %287, %284
+  %289 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %286
+  %290 = getelementptr inbounds nuw i8, ptr addrspace(3) %289, i32 %288
+  %291 = getelementptr inbounds nuw i8, ptr addrspace(3) %290, i32 256
+  %292 = getelementptr inbounds nuw i8, ptr addrspace(3) %290, i32 512
+  %293 = getelementptr inbounds nuw i8, ptr addrspace(3) %290, i32 768
+  %294 = getelementptr inbounds nuw i8, ptr addrspace(3) %290, i32 1024
+  %295 = getelementptr inbounds nuw i8, ptr addrspace(3) %290, i32 1280
+  %296 = getelementptr inbounds nuw i8, ptr addrspace(3) %290, i32 1536
+  %297 = getelementptr inbounds nuw i8, ptr addrspace(3) %290, i32 1792
+  %298 = and i32 %14, 28
+  %299 = shl nuw nsw i32 %298, 9
+  %300 = shl nuw nsw i32 %14, 5
+  %301 = and i32 %300, 96
+  %302 = and i32 %282, 1920
+  %303 = or disjoint i32 %299, %301
+  %304 = or disjoint i32 %303, %302
+  %305 = or disjoint i32 %304, %298
+  %306 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %305
+  %307 = xor i32 %305, 4
+  %308 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %307
+  %309 = xor i32 %305, 8
+  %310 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %309
+  %311 = xor i32 %305, 12
+  %312 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %311
+  %313 = xor i32 %305, 16
+  %314 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %313
+  %315 = xor i32 %305, 20
+  %316 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %315
+  %317 = xor i32 %305, 24
+  %318 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %317
+  %319 = xor i32 %305, 28
+  %320 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %319
+  %321 = icmp eq i32 %278, 0
+  %322 = shl i32 %192, 7
+  %323 = shl i32 %195, 15
+  %324 = add i32 %323, %322
+  %325 = icmp ne i32 %278, 0
+  %326 = add i32 %324, 4097
+  %327 = add i32 %324, 4096
+  %328 = shl nuw nsw i32 %298, 8
+  %329 = shl nuw nsw i32 %14, 1
+  %330 = and i32 %329, 768
+  %331 = lshr i32 %14, 5
+  %332 = and i32 %331, 2
+  %333 = or disjoint i32 %330, %332
+  %334 = or disjoint i32 %333, %328
+  %335 = or disjoint i32 %334, %255
+  %336 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %335
+  %337 = xor i32 %335, 16
+  %338 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %337
+  %339 = xor i32 %335, 32
+  %340 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %339
+  %341 = xor i32 %335, 48
+  %342 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %341
+  %343 = xor i32 %335, 64
+  %344 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %343
+  %345 = xor i32 %335, 80
+  %346 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %345
+  %347 = xor i32 %335, 96
+  %348 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %347
+  %349 = xor i32 %335, 112
+  %350 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %349
+  %351 = shl nuw nsw i32 %280, 5
+  %352 = shl nuw nsw i32 %18, 4
+  %353 = or disjoint i32 %351, %352
+  %354 = xor i32 %353, %253
+  %355 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %354
+  %356 = getelementptr inbounds nuw i8, ptr addrspace(3) %355, i32 256
+  %357 = getelementptr inbounds nuw i8, ptr addrspace(3) %355, i32 512
+  %358 = getelementptr inbounds nuw i8, ptr addrspace(3) %355, i32 768
+  %359 = zext nneg i32 %194 to i64, !dbg !47
+  %360 = sext i32 %234 to i64, !dbg !47
+  %361 = sext i32 %277 to i64, !dbg !47
+  %362 = or disjoint i32 %279, 4, !dbg !47
+  %invariant.op = sext i32 %362 to i64, !dbg !47
+  br label %363, !dbg !47
+
+363:                                              ; preds = %__nv_rsqrtf.exit60, %363
+  %364 = phi i1 [ true, %__nv_rsqrtf.exit60 ], [ false, %363 ]
+  %indvars.iv = phi i64 [ 0, %__nv_rsqrtf.exit60 ], [ 64, %363 ]
+  %365 = or disjoint i64 %indvars.iv, %25, !dbg !48
+  %366 = or disjoint i64 %indvars.iv, %359, !dbg !48
+  %367 = or disjoint i64 %366, 48, !dbg !48
+  %368 = or disjoint i64 %366, 8, !dbg !49
+  %369 = or disjoint i64 %366, 16, !dbg !49
+  %370 = or disjoint i64 %366, 24, !dbg !49
+  %371 = or disjoint i64 %366, 32, !dbg !49
+  %372 = or disjoint i64 %366, 40, !dbg !49
+  %373 = or disjoint i64 %366, 56, !dbg !49
+  %374 = trunc nuw nsw i64 %365 to i32, !dbg !50
+  %375 = or disjoint i32 %23, %374, !dbg !50
+  %376 = sext i32 %375 to i64, !dbg !51
+  %377 = getelementptr bfloat, ptr addrspace(1) %2, i64 %376, !dbg !51
+  %378 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !52
+  %379 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %377, i64 %378, i1 true) #6, !dbg !52
+  %380 = extractvalue { i32, i32, i32, i32 } %379, 0, !dbg !52
+  %381 = bitcast i32 %380 to <2 x bfloat>, !dbg !52
+  %382 = extractvalue { i32, i32, i32, i32 } %379, 1, !dbg !52
+  %383 = bitcast i32 %382 to <2 x bfloat>, !dbg !52
+  %384 = extractvalue { i32, i32, i32, i32 } %379, 2, !dbg !52
+  %385 = bitcast i32 %384 to <2 x bfloat>, !dbg !52
+  %386 = extractvalue { i32, i32, i32, i32 } %379, 3, !dbg !52
+  %387 = bitcast i32 %386 to <2 x bfloat>, !dbg !52
+  %388 = extractelement <2 x bfloat> %381, i64 0, !dbg !52
+  %389 = extractelement <2 x bfloat> %381, i64 1, !dbg !52
+  %390 = extractelement <2 x bfloat> %383, i64 0, !dbg !52
+  %391 = extractelement <2 x bfloat> %383, i64 1, !dbg !52
+  %392 = extractelement <2 x bfloat> %385, i64 0, !dbg !52
+  %393 = extractelement <2 x bfloat> %385, i64 1, !dbg !52
+  %394 = extractelement <2 x bfloat> %387, i64 0, !dbg !52
+  %395 = extractelement <2 x bfloat> %387, i64 1, !dbg !52
+  %396 = fpext bfloat %388 to float, !dbg !53
+  %397 = fpext bfloat %389 to float, !dbg !53
+  %398 = fpext bfloat %390 to float, !dbg !53
+  %399 = fpext bfloat %391 to float, !dbg !53
+  %400 = fpext bfloat %392 to float, !dbg !53
+  %401 = fpext bfloat %393 to float, !dbg !53
+  %402 = fpext bfloat %394 to float, !dbg !53
+  %403 = fpext bfloat %395 to float, !dbg !53
+  %404 = getelementptr bfloat, ptr addrspace(1) %3, i64 %365, !dbg !54
+  %405 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !55
+  %406 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %404, i64 %405, i1 true) #6, !dbg !55
+  %407 = extractvalue { i32, i32, i32, i32 } %406, 0, !dbg !55
+  %408 = bitcast i32 %407 to <2 x bfloat>, !dbg !55
+  %409 = extractvalue { i32, i32, i32, i32 } %406, 1, !dbg !55
+  %410 = bitcast i32 %409 to <2 x bfloat>, !dbg !55
+  %411 = extractvalue { i32, i32, i32, i32 } %406, 2, !dbg !55
+  %412 = bitcast i32 %411 to <2 x bfloat>, !dbg !55
+  %413 = extractvalue { i32, i32, i32, i32 } %406, 3, !dbg !55
+  %414 = bitcast i32 %413 to <2 x bfloat>, !dbg !55
+  %415 = extractelement <2 x bfloat> %408, i64 0, !dbg !55
+  %416 = extractelement <2 x bfloat> %408, i64 1, !dbg !55
+  %417 = extractelement <2 x bfloat> %410, i64 0, !dbg !55
+  %418 = extractelement <2 x bfloat> %410, i64 1, !dbg !55
+  %419 = extractelement <2 x bfloat> %412, i64 0, !dbg !55
+  %420 = extractelement <2 x bfloat> %412, i64 1, !dbg !55
+  %421 = extractelement <2 x bfloat> %414, i64 0, !dbg !55
+  %422 = extractelement <2 x bfloat> %414, i64 1, !dbg !55
+  %423 = fpext bfloat %415 to float, !dbg !56
+  %424 = fpext bfloat %416 to float, !dbg !56
+  %425 = fpext bfloat %417 to float, !dbg !56
+  %426 = fpext bfloat %418 to float, !dbg !56
+  %427 = fpext bfloat %419 to float, !dbg !56
+  %428 = fpext bfloat %420 to float, !dbg !56
+  %429 = fpext bfloat %421 to float, !dbg !56
+  %430 = fpext bfloat %422 to float, !dbg !56
+  %431 = or disjoint i64 %365, %360, !dbg !57
+  %.reass = or disjoint i64 %indvars.iv, %invariant.op
+  %432 = getelementptr float, ptr addrspace(1) %4, i64 %431, !dbg !58
+  %433 = getelementptr float, ptr addrspace(1) %4, i64 %.reass, !dbg !58
+  %434 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !59
+  %435 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %432, i64 %434, i1 true) #6, !dbg !59
+  %436 = extractvalue { i32, i32, i32, i32 } %435, 0, !dbg !59
+  %437 = extractvalue { i32, i32, i32, i32 } %435, 1, !dbg !59
+  %438 = extractvalue { i32, i32, i32, i32 } %435, 2, !dbg !59
+  %439 = extractvalue { i32, i32, i32, i32 } %435, 3, !dbg !59
+  %440 = bitcast i32 %436 to float, !dbg !59
+  %441 = bitcast i32 %437 to float, !dbg !59
+  %442 = bitcast i32 %438 to float, !dbg !59
+  %443 = bitcast i32 %439 to float, !dbg !59
+  %444 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !59
+  %445 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %433, i64 %444, i1 true) #6, !dbg !59
+  %446 = extractvalue { i32, i32, i32, i32 } %445, 0, !dbg !59
+  %447 = extractvalue { i32, i32, i32, i32 } %445, 1, !dbg !59
+  %448 = extractvalue { i32, i32, i32, i32 } %445, 2, !dbg !59
+  %449 = extractvalue { i32, i32, i32, i32 } %445, 3, !dbg !59
+  %450 = bitcast i32 %446 to float, !dbg !59
+  %451 = bitcast i32 %447 to float, !dbg !59
+  %452 = bitcast i32 %448 to float, !dbg !59
+  %453 = bitcast i32 %449 to float, !dbg !59
+  %454 = getelementptr float, ptr addrspace(1) %5, i64 %431, !dbg !60
+  %455 = getelementptr float, ptr addrspace(1) %5, i64 %.reass, !dbg !60
+  %456 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !61
+  %457 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %454, i64 %456, i1 true) #6, !dbg !61
+  %458 = extractvalue { i32, i32, i32, i32 } %457, 0, !dbg !61
+  %459 = extractvalue { i32, i32, i32, i32 } %457, 1, !dbg !61
+  %460 = extractvalue { i32, i32, i32, i32 } %457, 2, !dbg !61
+  %461 = extractvalue { i32, i32, i32, i32 } %457, 3, !dbg !61
+  %462 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !61
+  %463 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %455, i64 %462, i1 true) #6, !dbg !61
+  %464 = extractvalue { i32, i32, i32, i32 } %463, 0, !dbg !61
+  %465 = extractvalue { i32, i32, i32, i32 } %463, 1, !dbg !61
+  %466 = extractvalue { i32, i32, i32, i32 } %463, 2, !dbg !61
+  %467 = extractvalue { i32, i32, i32, i32 } %463, 3, !dbg !61
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61
+  %468 = insertelement <1 x i32> poison, i32 %458, i64 0, !dbg !61
+  store <1 x i32> %468, ptr addrspace(3) %290, align 4, !dbg !61
+  %469 = insertelement <1 x i32> poison, i32 %459, i64 0, !dbg !61
+  store <1 x i32> %469, ptr addrspace(3) %291, align 4, !dbg !61
+  %470 = insertelement <1 x i32> poison, i32 %460, i64 0, !dbg !61
+  store <1 x i32> %470, ptr addrspace(3) %292, align 4, !dbg !61
+  %471 = insertelement <1 x i32> poison, i32 %461, i64 0, !dbg !61
+  store <1 x i32> %471, ptr addrspace(3) %293, align 4, !dbg !61
+  %472 = insertelement <1 x i32> poison, i32 %464, i64 0, !dbg !61
+  store <1 x i32> %472, ptr addrspace(3) %294, align 4, !dbg !61
+  %473 = insertelement <1 x i32> poison, i32 %465, i64 0, !dbg !61
+  store <1 x i32> %473, ptr addrspace(3) %295, align 4, !dbg !61
+  %474 = insertelement <1 x i32> poison, i32 %466, i64 0, !dbg !61
+  store <1 x i32> %474, ptr addrspace(3) %296, align 4, !dbg !61
+  %475 = insertelement <1 x i32> poison, i32 %467, i64 0, !dbg !61
+  store <1 x i32> %475, ptr addrspace(3) %297, align 4, !dbg !61
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61
+  %476 = load float, ptr addrspace(3) %306, align 4, !dbg !61
+  %477 = load float, ptr addrspace(3) %308, align 4, !dbg !61
+  %478 = load float, ptr addrspace(3) %310, align 4, !dbg !61
+  %479 = load float, ptr addrspace(3) %312, align 4, !dbg !61
+  %480 = load float, ptr addrspace(3) %314, align 4, !dbg !61
+  %481 = load float, ptr addrspace(3) %316, align 4, !dbg !61
+  %482 = load float, ptr addrspace(3) %318, align 4, !dbg !61
+  %483 = load float, ptr addrspace(3) %320, align 4, !dbg !61
+  %484 = or disjoint i32 %24, %374, !dbg !62
+  %485 = sext i32 %484 to i64, !dbg !63
+  %486 = getelementptr bfloat, ptr addrspace(1) %2, i64 %485, !dbg !63
+  %487 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !64
+  %488 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %486, i64 %487, i1 true) #6, !dbg !64
+  %489 = extractvalue { i32, i32, i32, i32 } %488, 0, !dbg !64
+  %490 = bitcast i32 %489 to <2 x bfloat>, !dbg !64
+  %491 = extractvalue { i32, i32, i32, i32 } %488, 1, !dbg !64
+  %492 = bitcast i32 %491 to <2 x bfloat>, !dbg !64
+  %493 = extractvalue { i32, i32, i32, i32 } %488, 2, !dbg !64
+  %494 = bitcast i32 %493 to <2 x bfloat>, !dbg !64
+  %495 = extractvalue { i32, i32, i32, i32 } %488, 3, !dbg !64
+  %496 = bitcast i32 %495 to <2 x bfloat>, !dbg !64
+  %497 = extractelement <2 x bfloat> %490, i64 0, !dbg !64
+  %498 = extractelement <2 x bfloat> %490, i64 1, !dbg !64
+  %499 = extractelement <2 x bfloat> %492, i64 0, !dbg !64
+  %500 = extractelement <2 x bfloat> %492, i64 1, !dbg !64
+  %501 = extractelement <2 x bfloat> %494, i64 0, !dbg !64
+  %502 = extractelement <2 x bfloat> %494, i64 1, !dbg !64
+  %503 = extractelement <2 x bfloat> %496, i64 0, !dbg !64
+  %504 = extractelement <2 x bfloat> %496, i64 1, !dbg !64
+  %505 = fpext bfloat %497 to float, !dbg !65
+  %506 = fpext bfloat %498 to float, !dbg !65
+  %507 = fpext bfloat %499 to float, !dbg !65
+  %508 = fpext bfloat %500 to float, !dbg !65
+  %509 = fpext bfloat %501 to float, !dbg !65
+  %510 = fpext bfloat %502 to float, !dbg !65
+  %511 = fpext bfloat %503 to float, !dbg !65
+  %512 = fpext bfloat %504 to float, !dbg !65
+  %513 = getelementptr bfloat, ptr addrspace(1) %6, i64 %365, !dbg !66
+  %514 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !67
+  %515 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %513, i64 %514, i1 true) #6, !dbg !67
+  %516 = extractvalue { i32, i32, i32, i32 } %515, 0, !dbg !67
+  %517 = bitcast i32 %516 to <2 x bfloat>, !dbg !67
+  %518 = extractvalue { i32, i32, i32, i32 } %515, 1, !dbg !67
+  %519 = bitcast i32 %518 to <2 x bfloat>, !dbg !67
+  %520 = extractvalue { i32, i32, i32, i32 } %515, 2, !dbg !67
+  %521 = bitcast i32 %520 to <2 x bfloat>, !dbg !67
+  %522 = extractvalue { i32, i32, i32, i32 } %515, 3, !dbg !67
+  %523 = bitcast i32 %522 to <2 x bfloat>, !dbg !67
+  %524 = extractelement <2 x bfloat> %517, i64 0, !dbg !67
+  %525 = extractelement <2 x bfloat> %517, i64 1, !dbg !67
+  %526 = extractelement <2 x bfloat> %519, i64 0, !dbg !67
+  %527 = extractelement <2 x bfloat> %519, i64 1, !dbg !67
+  %528 = extractelement <2 x bfloat> %521, i64 0, !dbg !67
+  %529 = extractelement <2 x bfloat> %521, i64 1, !dbg !67
+  %530 = extractelement <2 x bfloat> %523, i64 0, !dbg !67
+  %531 = extractelement <2 x bfloat> %523, i64 1, !dbg !67
+  %532 = fpext bfloat %524 to float, !dbg !68
+  %533 = fpext bfloat %525 to float, !dbg !68
+  %534 = fpext bfloat %526 to float, !dbg !68
+  %535 = fpext bfloat %527 to float, !dbg !68
+  %536 = fpext bfloat %528 to float, !dbg !68
+  %537 = fpext bfloat %529 to float, !dbg !68
+  %538 = fpext bfloat %530 to float, !dbg !68
+  %539 = fpext bfloat %531 to float, !dbg !68
+  %540 = or disjoint i64 %366, 1, !dbg !69
+  %541 = or disjoint i64 %366, 9, !dbg !69
+  %542 = or disjoint i64 %366, 17, !dbg !69
+  %543 = or disjoint i64 %366, 25, !dbg !69
+  %544 = or disjoint i64 %366, 33, !dbg !69
+  %545 = or disjoint i64 %366, 41, !dbg !69
+  %546 = or disjoint i64 %366, 49, !dbg !69
+  %547 = or disjoint i64 %366, 57, !dbg !69
+  %548 = trunc nuw nsw i64 %540 to i32, !dbg !70
+  %549 = or disjoint i32 %324, %548, !dbg !70
+  %550 = trunc nuw nsw i64 %541 to i32, !dbg !70
+  %551 = or disjoint i32 %324, %550, !dbg !70
+  %552 = trunc nuw nsw i64 %542 to i32, !dbg !70
+  %553 = or disjoint i32 %324, %552, !dbg !70
+  %554 = trunc nuw nsw i64 %543 to i32, !dbg !70
+  %555 = or disjoint i32 %324, %554, !dbg !70
+  %556 = trunc nuw nsw i64 %544 to i32, !dbg !70
+  %557 = or disjoint i32 %324, %556, !dbg !70
+  %558 = trunc nuw nsw i64 %545 to i32, !dbg !70
+  %559 = or disjoint i32 %324, %558, !dbg !70
+  %560 = trunc nuw nsw i64 %546 to i32, !dbg !70
+  %561 = or disjoint i32 %324, %560, !dbg !70
+  %562 = trunc nuw nsw i64 %547 to i32, !dbg !70
+  %563 = or disjoint i32 %324, %562, !dbg !70
+  %564 = sext i32 %549 to i64, !dbg !71
+  %565 = getelementptr bfloat, ptr addrspace(1) %2, i64 %564, !dbg !71
+  %566 = sext i32 %551 to i64, !dbg !71
+  %567 = getelementptr bfloat, ptr addrspace(1) %2, i64 %566, !dbg !71
+  %568 = sext i32 %553 to i64, !dbg !71
+  %569 = getelementptr bfloat, ptr addrspace(1) %2, i64 %568, !dbg !71
+  %570 = sext i32 %555 to i64, !dbg !71
+  %571 = getelementptr bfloat, ptr addrspace(1) %2, i64 %570, !dbg !71
+  %572 = sext i32 %557 to i64, !dbg !71
+  %573 = getelementptr bfloat, ptr addrspace(1) %2, i64 %572, !dbg !71
+  %574 = sext i32 %559 to i64, !dbg !71
+  %575 = getelementptr bfloat, ptr addrspace(1) %2, i64 %574, !dbg !71
+  %576 = sext i32 %561 to i64, !dbg !71
+  %577 = getelementptr bfloat, ptr addrspace(1) %2, i64 %576, !dbg !71
+  %578 = sext i32 %563 to i64, !dbg !71
+  %579 = getelementptr bfloat, ptr addrspace(1) %2, i64 %578, !dbg !71
+  %580 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72
+  %581 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %565, i64 %580, i1 %321) #6, !dbg !72
+  %582 = bitcast i16 %581 to bfloat, !dbg !72
+  %583 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72
+  %584 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %567, i64 %583, i1 %321) #6, !dbg !72
+  %585 = bitcast i16 %584 to bfloat, !dbg !72
+  %586 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72
+  %587 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %569, i64 %586, i1 %321) #6, !dbg !72
+  %588 = bitcast i16 %587 to bfloat, !dbg !72
+  %589 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72
+  %590 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %571, i64 %589, i1 %321) #6, !dbg !72
+  %591 = bitcast i16 %590 to bfloat, !dbg !72
+  %592 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72
+  %593 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %573, i64 %592, i1 %321) #6, !dbg !72
+  %594 = bitcast i16 %593 to bfloat, !dbg !72
+  %595 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72
+  %596 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %575, i64 %595, i1 %321) #6, !dbg !72
+  %597 = bitcast i16 %596 to bfloat, !dbg !72
+  %598 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72
+  %599 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %577, i64 %598, i1 %321) #6, !dbg !72
+  %600 = bitcast i16 %599 to bfloat, !dbg !72
+  %601 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72
+  %602 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %579, i64 %601, i1 %321) #6, !dbg !72
+  %603 = bitcast i16 %602 to bfloat, !dbg !72
+  %604 = fpext bfloat %582 to float, !dbg !73
+  %605 = fpext bfloat %585 to float, !dbg !73
+  %606 = fpext bfloat %588 to float, !dbg !73
+  %607 = fpext bfloat %591 to float, !dbg !73
+  %608 = fpext bfloat %594 to float, !dbg !73
+  %609 = fpext bfloat %597 to float, !dbg !73
+  %610 = fpext bfloat %600 to float, !dbg !73
+  %611 = fpext bfloat %603 to float, !dbg !73
+  %612 = fmul float %257, %604, !dbg !41
+  %613 = fmul float %257, %605, !dbg !41
+  %614 = fmul float %257, %606, !dbg !41
+  %615 = fmul float %257, %607, !dbg !41
+  %616 = fmul float %257, %608, !dbg !41
+  %617 = fmul float %257, %609, !dbg !41
+  %618 = fmul float %257, %610, !dbg !41
+  %619 = fmul float %257, %611, !dbg !41
+  %620 = getelementptr bfloat, ptr addrspace(1) %3, i64 %540, !dbg !74
+  %621 = getelementptr bfloat, ptr addrspace(1) %3, i64 %541, !dbg !74
+  %622 = getelementptr bfloat, ptr addrspace(1) %3, i64 %542, !dbg !74
+  %623 = getelementptr bfloat, ptr addrspace(1) %3, i64 %543, !dbg !74
+  %624 = getelementptr bfloat, ptr addrspace(1) %3, i64 %544, !dbg !74
+  %625 = getelementptr bfloat, ptr addrspace(1) %3, i64 %545, !dbg !74
+  %626 = getelementptr bfloat, ptr addrspace(1) %3, i64 %546, !dbg !74
+  %627 = getelementptr bfloat, ptr addrspace(1) %3, i64 %547, !dbg !74
+  %628 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75
+  %629 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %620, i64 %628, i1 %321) #6, !dbg !75
+  %630 = bitcast i16 %629 to bfloat, !dbg !75
+  %631 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75
+  %632 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %621, i64 %631, i1 %321) #6, !dbg !75
+  %633 = bitcast i16 %632 to bfloat, !dbg !75
+  %634 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75
+  %635 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %622, i64 %634, i1 %321) #6, !dbg !75
+  %636 = bitcast i16 %635 to bfloat, !dbg !75
+  %637 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75
+  %638 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %623, i64 %637, i1 %321) #6, !dbg !75
+  %639 = bitcast i16 %638 to bfloat, !dbg !75
+  %640 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75
+  %641 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %624, i64 %640, i1 %321) #6, !dbg !75
+  %642 = bitcast i16 %641 to bfloat, !dbg !75
+  %643 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75
+  %644 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %625, i64 %643, i1 %321) #6, !dbg !75
+  %645 = bitcast i16 %644 to bfloat, !dbg !75
+  %646 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75
+  %647 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %626, i64 %646, i1 %321) #6, !dbg !75
+  %648 = bitcast i16 %647 to bfloat, !dbg !75
+  %649 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75
+  %650 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %627, i64 %649, i1 %321) #6, !dbg !75
+  %651 = bitcast i16 %650 to bfloat, !dbg !75
+  %652 = fpext bfloat %630 to float, !dbg !76
+  %653 = fpext bfloat %633 to float, !dbg !76
+  %654 = fpext bfloat %636 to float, !dbg !76
+  %655 = fpext bfloat %639 to float, !dbg !76
+  %656 = fpext bfloat %642 to float, !dbg !76
+  %657 = fpext bfloat %645 to float, !dbg !76
+  %658 = fpext bfloat %648 to float, !dbg !76
+  %659 = fpext bfloat %651 to float, !dbg !76
+  %660 = fmul float %612, %652, !dbg !77
+  %661 = fmul float %613, %653, !dbg !77
+  %662 = fmul float %614, %654, !dbg !77
+  %663 = fmul float %615, %655, !dbg !77
+  %664 = fmul float %616, %656, !dbg !77
+  %665 = fmul float %617, %657, !dbg !77
+  %666 = fmul float %618, %658, !dbg !77
+  %667 = fmul float %619, %659, !dbg !77
+  %668 = fsub float 0.000000e+00, %660, !dbg !78
+  %669 = fsub float 0.000000e+00, %661, !dbg !78
+  %670 = fsub float 0.000000e+00, %662, !dbg !78
+  %671 = fsub float 0.000000e+00, %663, !dbg !78
+  %672 = fsub float 0.000000e+00, %664, !dbg !78
+  %673 = fsub float 0.000000e+00, %665, !dbg !78
+  %674 = fsub float 0.000000e+00, %666, !dbg !78
+  %675 = fsub float 0.000000e+00, %667, !dbg !78
+  %676 = trunc nuw nsw i64 %366 to i32, !dbg !79
+  %677 = or disjoint i32 %324, %676, !dbg !79
+  %678 = trunc nuw nsw i64 %368 to i32, !dbg !79
+  %679 = or disjoint i32 %324, %678, !dbg !79
+  %680 = trunc nuw nsw i64 %369 to i32, !dbg !79
+  %681 = or disjoint i32 %324, %680, !dbg !79
+  %682 = trunc nuw nsw i64 %370 to i32, !dbg !79
+  %683 = or disjoint i32 %324, %682, !dbg !79
+  %684 = trunc nuw nsw i64 %371 to i32, !dbg !79
+  %685 = or disjoint i32 %324, %684, !dbg !79
+  %686 = trunc nuw nsw i64 %372 to i32, !dbg !79
+  %687 = or disjoint i32 %324, %686, !dbg !79
+  %688 = trunc nuw nsw i64 %367 to i32, !dbg !79
+  %689 = or disjoint i32 %324, %688, !dbg !79
+  %690 = trunc nuw nsw i64 %373 to i32, !dbg !79
+  %691 = or disjoint i32 %324, %690, !dbg !79
+  %692 = sext i32 %677 to i64, !dbg !80
+  %693 = getelementptr bfloat, ptr addrspace(1) %2, i64 %692, !dbg !80
+  %694 = sext i32 %679 to i64, !dbg !80
+  %695 = getelementptr bfloat, ptr addrspace(1) %2, i64 %694, !dbg !80
+  %696 = sext i32 %681 to i64, !dbg !80
+  %697 = getelementptr bfloat, ptr addrspace(1) %2, i64 %696, !dbg !80
+  %698 = sext i32 %683 to i64, !dbg !80
+  %699 = getelementptr bfloat, ptr addrspace(1) %2, i64 %698, !dbg !80
+  %700 = sext i32 %685 to i64, !dbg !80
+  %701 = getelementptr bfloat, ptr addrspace(1) %2, i64 %700, !dbg !80
+  %702 = sext i32 %687 to i64, !dbg !80
+  %703 = getelementptr bfloat, ptr addrspace(1) %2, i64 %702, !dbg !80
+  %704 = sext i32 %689 to i64, !dbg !80
+  %705 = getelementptr bfloat, ptr addrspace(1) %2, i64 %704, !dbg !80
+  %706 = sext i32 %691 to i64, !dbg !80
+  %707 = getelementptr bfloat, ptr addrspace(1) %2, i64 %706, !dbg !80
+  %708 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81
+  %709 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %693, i64 %708, i1 %325) #6, !dbg !81
+  %710 = bitcast i16 %709 to bfloat, !dbg !81
+  %711 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81
+  %712 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %695, i64 %711, i1 %325) #6, !dbg !81
+  %713 = bitcast i16 %712 to bfloat, !dbg !81
+  %714 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81
+  %715 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %697, i64 %714, i1 %325) #6, !dbg !81
+  %716 = bitcast i16 %715 to bfloat, !dbg !81
+  %717 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81
+  %718 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %699, i64 %717, i1 %325) #6, !dbg !81
+  %719 = bitcast i16 %718 to bfloat, !dbg !81
+  %720 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81
+  %721 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %701, i64 %720, i1 %325) #6, !dbg !81
+  %722 = bitcast i16 %721 to bfloat, !dbg !81
+  %723 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81
+  %724 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %703, i64 %723, i1 %325) #6, !dbg !81
+  %725 = bitcast i16 %724 to bfloat, !dbg !81
+  %726 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81
+  %727 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %705, i64 %726, i1 %325) #6, !dbg !81
+  %728 = bitcast i16 %727 to bfloat, !dbg !81
+  %729 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81
+  %730 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %707, i64 %729, i1 %325) #6, !dbg !81
+  %731 = bitcast i16 %730 to bfloat, !dbg !81
+  %732 = fpext bfloat %710 to float, !dbg !82
+  %733 = fpext bfloat %713 to float, !dbg !82
+  %734 = fpext bfloat %716 to float, !dbg !82
+  %735 = fpext bfloat %719 to float, !dbg !82
+  %736 = fpext bfloat %722 to float, !dbg !82
+  %737 = fpext bfloat %725 to float, !dbg !82
+  %738 = fpext bfloat %728 to float, !dbg !82
+  %739 = fpext bfloat %731 to float, !dbg !82
+  %740 = fmul float %257, %732, !dbg !83
+  %741 = fmul float %257, %733, !dbg !83
+  %742 = fmul float %257, %734, !dbg !83
+  %743 = fmul float %257, %735, !dbg !83
+  %744 = fmul float %257, %736, !dbg !83
+  %745 = fmul float %257, %737, !dbg !83
+  %746 = fmul float %257, %738, !dbg !83
+  %747 = fmul float %257, %739, !dbg !83
+  %748 = getelementptr bfloat, ptr addrspace(1) %3, i64 %366, !dbg !84
+  %749 = getelementptr bfloat, ptr addrspace(1) %3, i64 %368, !dbg !84
+  %750 = getelementptr bfloat, ptr addrspace(1) %3, i64 %369, !dbg !84
+  %751 = getelementptr bfloat, ptr addrspace(1) %3, i64 %370, !dbg !84
+  %752 = getelementptr bfloat, ptr addrspace(1) %3, i64 %371, !dbg !84
+  %753 = getelementptr bfloat, ptr addrspace(1) %3, i64 %372, !dbg !84
+  %754 = getelementptr bfloat, ptr addrspace(1) %3, i64 %367, !dbg !84
+  %755 = getelementptr bfloat, ptr addrspace(1) %3, i64 %373, !dbg !84
+  %756 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85
+  %757 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %748, i64 %756, i1 %325) #6, !dbg !85
+  %758 = bitcast i16 %757 to bfloat, !dbg !85
+  %759 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85
+  %760 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %749, i64 %759, i1 %325) #6, !dbg !85
+  %761 = bitcast i16 %760 to bfloat, !dbg !85
+  %762 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85
+  %763 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %750, i64 %762, i1 %325) #6, !dbg !85
+  %764 = bitcast i16 %763 to bfloat, !dbg !85
+  %765 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85
+  %766 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %751, i64 %765, i1 %325) #6, !dbg !85
+  %767 = bitcast i16 %766 to bfloat, !dbg !85
+  %768 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85
+  %769 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %752, i64 %768, i1 %325) #6, !dbg !85
+  %770 = bitcast i16 %769 to bfloat, !dbg !85
+  %771 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85
+  %772 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %753, i64 %771, i1 %325) #6, !dbg !85
+  %773 = bitcast i16 %772 to bfloat, !dbg !85
+  %774 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85
+  %775 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %754, i64 %774, i1 %325) #6, !dbg !85
+  %776 = bitcast i16 %775 to bfloat, !dbg !85
+  %777 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85
+  %778 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %755, i64 %777, i1 %325) #6, !dbg !85
+  %779 = bitcast i16 %778 to bfloat, !dbg !85
+  %780 = fpext bfloat %758 to float, !dbg !86
+  %781 = fpext bfloat %761 to float, !dbg !86
+  %782 = fpext bfloat %764 to float, !dbg !86
+  %783 = fpext bfloat %767 to float, !dbg !86
+  %784 = fpext bfloat %770 to float, !dbg !86
+  %785 = fpext bfloat %773 to float, !dbg !86
+  %786 = fpext bfloat %776 to float, !dbg !86
+  %787 = fpext bfloat %779 to float, !dbg !86
+  %788 = fmul float %740, %780, !dbg !87
+  %789 = fmul float %741, %781, !dbg !87
+  %790 = fmul float %742, %782, !dbg !87
+  %791 = fmul float %743, %783, !dbg !87
+  %792 = fmul float %744, %784, !dbg !87
+  %793 = fmul float %745, %785, !dbg !87
+  %794 = fmul float %746, %786, !dbg !87
+  %795 = fmul float %747, %787, !dbg !87
+  %796 = select i1 %321, float %668, float %788, !dbg !88
+  %797 = select i1 %321, float %669, float %789, !dbg !88
+  %798 = select i1 %321, float %670, float %790, !dbg !88
+  %799 = select i1 %321, float %671, float %791, !dbg !88
+  %800 = select i1 %321, float %672, float %792, !dbg !88
+  %801 = select i1 %321, float %673, float %793, !dbg !88
+  %802 = select i1 %321, float %674, float %794, !dbg !88
+  %803 = select i1 %321, float %675, float %795, !dbg !88
+  %804 = fmul float %.0.i35, %396, !dbg !89
+  %805 = fmul float %.0.i35, %397, !dbg !89
+  %806 = fmul float %.0.i35, %398, !dbg !89
+  %807 = fmul float %.0.i35, %399, !dbg !89
+  %808 = fmul float %.0.i35, %400, !dbg !89
+  %809 = fmul float %.0.i35, %401, !dbg !89
+  %810 = fmul float %.0.i35, %402, !dbg !89
+  %811 = fmul float %.0.i35, %403, !dbg !89
+  %812 = fmul float %804, %423, !dbg !90
+  %813 = fmul float %805, %424, !dbg !90
+  %814 = fmul float %806, %425, !dbg !90
+  %815 = fmul float %807, %426, !dbg !90
+  %816 = fmul float %808, %427, !dbg !90
+  %817 = fmul float %809, %428, !dbg !90
+  %818 = fmul float %810, %429, !dbg !90
+  %819 = fmul float %811, %430, !dbg !90
+  %820 = fmul float %812, %440, !dbg !91
+  %821 = fmul float %813, %441, !dbg !91
+  %822 = fmul float %814, %442, !dbg !91
+  %823 = fmul float %815, %443, !dbg !91
+  %824 = fmul float %816, %450, !dbg !91
+  %825 = fmul float %817, %451, !dbg !91
+  %826 = fmul float %818, %452, !dbg !91
+  %827 = fmul float %819, %453, !dbg !91
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !91
+  store float %820, ptr addrspace(3) %290, align 4, !dbg !91
+  store float %821, ptr addrspace(3) %291, align 4, !dbg !91
+  store float %822, ptr addrspace(3) %292, align 4, !dbg !91
+  store float %823, ptr addrspace(3) %293, align 4, !dbg !91
+  store float %824, ptr addrspace(3) %294, align 4, !dbg !91
+  store float %825, ptr addrspace(3) %295, align 4, !dbg !91
+  store float %826, ptr addrspace(3) %296, align 4, !dbg !91
+  store float %827, ptr addrspace(3) %297, align 4, !dbg !91
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !91
+  %828 = load float, ptr addrspace(3) %306, align 4, !dbg !91
+  %829 = load float, ptr addrspace(3) %308, align 4, !dbg !91
+  %830 = load float, ptr addrspace(3) %310, align 4, !dbg !91
+  %831 = load float, ptr addrspace(3) %312, align 4, !dbg !91
+  %832 = load float, ptr addrspace(3) %314, align 4, !dbg !91
+  %833 = load float, ptr addrspace(3) %316, align 4, !dbg !91
+  %834 = load float, ptr addrspace(3) %318, align 4, !dbg !91
+  %835 = load float, ptr addrspace(3) %320, align 4, !dbg !91
+  %836 = fmul float %476, %796, !dbg !92
+  %837 = fmul float %477, %797, !dbg !92
+  %838 = fmul float %478, %798, !dbg !92
+  %839 = fmul float %479, %799, !dbg !92
+  %840 = fmul float %480, %800, !dbg !92
+  %841 = fmul float %481, %801, !dbg !92
+  %842 = fmul float %482, %802, !dbg !92
+  %843 = fmul float %483, %803, !dbg !92
+  %844 = fadd float %836, %828, !dbg !93
+  %845 = fadd float %837, %829, !dbg !93
+  %846 = fadd float %838, %830, !dbg !93
+  %847 = fadd float %839, %831, !dbg !93
+  %848 = fadd float %840, %832, !dbg !93
+  %849 = fadd float %841, %833, !dbg !93
+  %850 = fadd float %842, %834, !dbg !93
+  %851 = fadd float %843, %835, !dbg !93
+  %852 = or disjoint i32 %326, %676, !dbg !94
+  %853 = or disjoint i32 %326, %678, !dbg !94
+  %854 = or disjoint i32 %326, %680, !dbg !94
+  %855 = or disjoint i32 %326, %682, !dbg !94
+  %856 = or disjoint i32 %326, %684, !dbg !94
+  %857 = or disjoint i32 %326, %686, !dbg !94
+  %858 = or disjoint i32 %326, %688, !dbg !94
+  %859 = or disjoint i32 %326, %690, !dbg !94
+  %860 = sext i32 %852 to i64, !dbg !95
+  %861 = getelementptr bfloat, ptr addrspace(1) %2, i64 %860, !dbg !95
+  %862 = sext i32 %853 to i64, !dbg !95
+  %863 = getelementptr bfloat, ptr addrspace(1) %2, i64 %862, !dbg !95
+  %864 = sext i32 %854 to i64, !dbg !95
+  %865 = getelementptr bfloat, ptr addrspace(1) %2, i64 %864, !dbg !95
+  %866 = sext i32 %855 to i64, !dbg !95
+  %867 = getelementptr bfloat, ptr addrspace(1) %2, i64 %866, !dbg !95
+  %868 = sext i32 %856 to i64, !dbg !95
+  %869 = getelementptr bfloat, ptr addrspace(1) %2, i64 %868, !dbg !95
+  %870 = sext i32 %857 to i64, !dbg !95
+  %871 = getelementptr bfloat, ptr addrspace(1) %2, i64 %870, !dbg !95
+  %872 = sext i32 %858 to i64, !dbg !95
+  %873 = getelementptr bfloat, ptr addrspace(1) %2, i64 %872, !dbg !95
+  %874 = sext i32 %859 to i64, !dbg !95
+  %875 = getelementptr bfloat, ptr addrspace(1) %2, i64 %874, !dbg !95
+  %876 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96
+  %877 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %861, i64 %876, i1 %321) #6, !dbg !96
+  %878 = bitcast i16 %877 to bfloat, !dbg !96
+  %879 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96
+  %880 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %863, i64 %879, i1 %321) #6, !dbg !96
+  %881 = bitcast i16 %880 to bfloat, !dbg !96
+  %882 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96
+  %883 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %865, i64 %882, i1 %321) #6, !dbg !96
+  %884 = bitcast i16 %883 to bfloat, !dbg !96
+  %885 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96
+  %886 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %867, i64 %885, i1 %321) #6, !dbg !96
+  %887 = bitcast i16 %886 to bfloat, !dbg !96
+  %888 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96
+  %889 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %869, i64 %888, i1 %321) #6, !dbg !96
+  %890 = bitcast i16 %889 to bfloat, !dbg !96
+  %891 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96
+  %892 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %871, i64 %891, i1 %321) #6, !dbg !96
+  %893 = bitcast i16 %892 to bfloat, !dbg !96
+  %894 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96
+  %895 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %873, i64 %894, i1 %321) #6, !dbg !96
+  %896 = bitcast i16 %895 to bfloat, !dbg !96
+  %897 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96
+  %898 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %875, i64 %897, i1 %321) #6, !dbg !96
+  %899 = bitcast i16 %898 to bfloat, !dbg !96
+  %900 = fpext bfloat %878 to float, !dbg !97
+  %901 = fpext bfloat %881 to float, !dbg !97
+  %902 = fpext bfloat %884 to float, !dbg !97
+  %903 = fpext bfloat %887 to float, !dbg !97
+  %904 = fpext bfloat %890 to float, !dbg !97
+  %905 = fpext bfloat %893 to float, !dbg !97
+  %906 = fpext bfloat %896 to float, !dbg !97
+  %907 = fpext bfloat %899 to float, !dbg !97
+  %908 = fmul float %276, %900, !dbg !45
+  %909 = fmul float %276, %901, !dbg !45
+  %910 = fmul float %276, %902, !dbg !45
+  %911 = fmul float %276, %903, !dbg !45
+  %912 = fmul float %276, %904, !dbg !45
+  %913 = fmul float %276, %905, !dbg !45
+  %914 = fmul float %276, %906, !dbg !45
+  %915 = fmul float %276, %907, !dbg !45
+  %916 = getelementptr bfloat, ptr addrspace(1) %6, i64 %540, !dbg !98
+  %917 = getelementptr bfloat, ptr addrspace(1) %6, i64 %541, !dbg !98
+  %918 = getelementptr bfloat, ptr addrspace(1) %6, i64 %542, !dbg !98
+  %919 = getelementptr bfloat, ptr addrspace(1) %6, i64 %543, !dbg !98
+  %920 = getelementptr bfloat, ptr addrspace(1) %6, i64 %544, !dbg !98
+  %921 = getelementptr bfloat, ptr addrspace(1) %6, i64 %545, !dbg !98
+  %922 = getelementptr bfloat, ptr addrspace(1) %6, i64 %546, !dbg !98
+  %923 = getelementptr bfloat, ptr addrspace(1) %6, i64 %547, !dbg !98
+  %924 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99
+  %925 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %916, i64 %924, i1 %321) #6, !dbg !99
+  %926 = bitcast i16 %925 to bfloat, !dbg !99
+  %927 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99
+  %928 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %917, i64 %927, i1 %321) #6, !dbg !99
+  %929 = bitcast i16 %928 to bfloat, !dbg !99
+  %930 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99
+  %931 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %918, i64 %930, i1 %321) #6, !dbg !99
+  %932 = bitcast i16 %931 to bfloat, !dbg !99
+  %933 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99
+  %934 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %919, i64 %933, i1 %321) #6, !dbg !99
+  %935 = bitcast i16 %934 to bfloat, !dbg !99
+  %936 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99
+  %937 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %920, i64 %936, i1 %321) #6, !dbg !99
+  %938 = bitcast i16 %937 to bfloat, !dbg !99
+  %939 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99
+  %940 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %921, i64 %939, i1 %321) #6, !dbg !99
+  %941 = bitcast i16 %940 to bfloat, !dbg !99
+  %942 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99
+  %943 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %922, i64 %942, i1 %321) #6, !dbg !99
+  %944 = bitcast i16 %943 to bfloat, !dbg !99
+  %945 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99
+  %946 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %923, i64 %945, i1 %321) #6, !dbg !99
+  %947 = bitcast i16 %946 to bfloat, !dbg !99
+  %948 = fpext bfloat %926 to float, !dbg !100
+  %949 = fpext bfloat %929 to float, !dbg !100
+  %950 = fpext bfloat %932 to float, !dbg !100
+  %951 = fpext bfloat %935 to float, !dbg !100
+  %952 = fpext bfloat %938 to float, !dbg !100
+  %953 = fpext bfloat %941 to float, !dbg !100
+  %954 = fpext bfloat %944 to float, !dbg !100
+  %955 = fpext bfloat %947 to float, !dbg !100
+  %956 = fmul float %908, %948, !dbg !101
+  %957 = fmul float %909, %949, !dbg !101
+  %958 = fmul float %910, %950, !dbg !101
+  %959 = fmul float %911, %951, !dbg !101
+  %960 = fmul float %912, %952, !dbg !101
+  %961 = fmul float %913, %953, !dbg !101
+  %962 = fmul float %914, %954, !dbg !101
+  %963 = fmul float %915, %955, !dbg !101
+  %964 = fsub float 0.000000e+00, %956, !dbg !102
+  %965 = fsub float 0.000000e+00, %957, !dbg !102
+  %966 = fsub float 0.000000e+00, %958, !dbg !102
+  %967 = fsub float 0.000000e+00, %959, !dbg !102
+  %968 = fsub float 0.000000e+00, %960, !dbg !102
+  %969 = fsub float 0.000000e+00, %961, !dbg !102
+  %970 = fsub float 0.000000e+00, %962, !dbg !102
+  %971 = fsub float 0.000000e+00, %963, !dbg !102
+  %972 = or disjoint i32 %327, %676, !dbg !103
+  %973 = or disjoint i32 %327, %678, !dbg !103
+  %974 = or disjoint i32 %327, %680, !dbg !103
+  %975 = or disjoint i32 %327, %682, !dbg !103
+  %976 = or disjoint i32 %327, %684, !dbg !103
+  %977 = or disjoint i32 %327, %686, !dbg !103
+  %978 = or disjoint i32 %327, %688, !dbg !103
+  %979 = or disjoint i32 %327, %690, !dbg !103
+  %980 = sext i32 %972 to i64, !dbg !104
+  %981 = getelementptr bfloat, ptr addrspace(1) %2, i64 %980, !dbg !104
+  %982 = sext i32 %973 to i64, !dbg !104
+  %983 = getelementptr bfloat, ptr addrspace(1) %2, i64 %982, !dbg !104
+  %984 = sext i32 %974 to i64, !dbg !104
+  %985 = getelementptr bfloat, ptr addrspace(1) %2, i64 %984, !dbg !104
+  %986 = sext i32 %975 to i64, !dbg !104
+  %987 = getelementptr bfloat, ptr addrspace(1) %2, i64 %986, !dbg !104
+  %988 = sext i32 %976 to i64, !dbg !104
+  %989 = getelementptr bfloat, ptr addrspace(1) %2, i64 %988, !dbg !104
+  %990 = sext i32 %977 to i64, !dbg !104
+  %991 = getelementptr bfloat, ptr addrspace(1) %2, i64 %990, !dbg !104
+  %992 = sext i32 %978 to i64, !dbg !104
+  %993 = getelementptr bfloat, ptr addrspace(1) %2, i64 %992, !dbg !104
+  %994 = sext i32 %979 to i64, !dbg !104
+  %995 = getelementptr bfloat, ptr addrspace(1) %2, i64 %994, !dbg !104
+  %996 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105
+  %997 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %981, i64 %996, i1 %325) #6, !dbg !105
+  %998 = bitcast i16 %997 to bfloat, !dbg !105
+  %999 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105
+  %1000 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %983, i64 %999, i1 %325) #6, !dbg !105
+  %1001 = bitcast i16 %1000 to bfloat, !dbg !105
+  %1002 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105
+  %1003 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %985, i64 %1002, i1 %325) #6, !dbg !105
+  %1004 = bitcast i16 %1003 to bfloat, !dbg !105
+  %1005 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105
+  %1006 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %987, i64 %1005, i1 %325) #6, !dbg !105
+  %1007 = bitcast i16 %1006 to bfloat, !dbg !105
+  %1008 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105
+  %1009 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %989, i64 %1008, i1 %325) #6, !dbg !105
+  %1010 = bitcast i16 %1009 to bfloat, !dbg !105
+  %1011 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105
+  %1012 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %991, i64 %1011, i1 %325) #6, !dbg !105
+  %1013 = bitcast i16 %1012 to bfloat, !dbg !105
+  %1014 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105
+  %1015 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %993, i64 %1014, i1 %325) #6, !dbg !105
+  %1016 = bitcast i16 %1015 to bfloat, !dbg !105
+  %1017 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105
+  %1018 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %995, i64 %1017, i1 %325) #6, !dbg !105
+  %1019 = bitcast i16 %1018 to bfloat, !dbg !105
+  %1020 = fpext bfloat %998 to float, !dbg !106
+  %1021 = fpext bfloat %1001 to float, !dbg !106
+  %1022 = fpext bfloat %1004 to float, !dbg !106
+  %1023 = fpext bfloat %1007 to float, !dbg !106
+  %1024 = fpext bfloat %1010 to float, !dbg !106
+  %1025 = fpext bfloat %1013 to float, !dbg !106
+  %1026 = fpext bfloat %1016 to float, !dbg !106
+  %1027 = fpext bfloat %1019 to float, !dbg !106
+  %1028 = fmul float %276, %1020, !dbg !107
+  %1029 = fmul float %276, %1021, !dbg !107
+  %1030 = fmul float %276, %1022, !dbg !107
+  %1031 = fmul float %276, %1023, !dbg !107
+  %1032 = fmul float %276, %1024, !dbg !107
+  %1033 = fmul float %276, %1025, !dbg !107
+  %1034 = fmul float %276, %1026, !dbg !107
+  %1035 = fmul float %276, %1027, !dbg !107
+  %1036 = getelementptr bfloat, ptr addrspace(1) %6, i64 %366, !dbg !108
+  %1037 = getelementptr bfloat, ptr addrspace(1) %6, i64 %368, !dbg !108
+  %1038 = getelementptr bfloat, ptr addrspace(1) %6, i64 %369, !dbg !108
+  %1039 = getelementptr bfloat, ptr addrspace(1) %6, i64 %370, !dbg !108
+  %1040 = getelementptr bfloat, ptr addrspace(1) %6, i64 %371, !dbg !108
+  %1041 = getelementptr bfloat, ptr addrspace(1) %6, i64 %372, !dbg !108
+  %1042 = getelementptr bfloat, ptr addrspace(1) %6, i64 %367, !dbg !108
+  %1043 = getelementptr bfloat, ptr addrspace(1) %6, i64 %373, !dbg !108
+  %1044 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109
+  %1045 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %1036, i64 %1044, i1 %325) #6, !dbg !109
+  %1046 = bitcast i16 %1045 to bfloat, !dbg !109
+  %1047 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109
+  %1048 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %1037, i64 %1047, i1 %325) #6, !dbg !109
+  %1049 = bitcast i16 %1048 to bfloat, !dbg !109
+  %1050 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109
+  %1051 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %1038, i64 %1050, i1 %325) #6, !dbg !109
+  %1052 = bitcast i16 %1051 to bfloat, !dbg !109
+  %1053 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109
+  %1054 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %1039, i64 %1053, i1 %325) #6, !dbg !109
+  %1055 = bitcast i16 %1054 to bfloat, !dbg !109
+  %1056 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109
+  %1057 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %1040, i64 %1056, i1 %325) #6, !dbg !109
+  %1058 = bitcast i16 %1057 to bfloat, !dbg !109
+  %1059 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109
+  %1060 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %1041, i64 %1059, i1 %325) #6, !dbg !109
+  %1061 = bitcast i16 %1060 to bfloat, !dbg !109
+  %1062 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109
+  %1063 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %1042, i64 %1062, i1 %325) #6, !dbg !109
+  %1064 = bitcast i16 %1063 to bfloat, !dbg !109
+  %1065 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109
+  %1066 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %1043, i64 %1065, i1 %325) #6, !dbg !109
+  %1067 = bitcast i16 %1066 to bfloat, !dbg !109
+  %1068 = fpext bfloat %1046 to float, !dbg !110
+  %1069 = fpext bfloat %1049 to float, !dbg !110
+  %1070 = fpext bfloat %1052 to float, !dbg !110
+  %1071 = fpext bfloat %1055 to float, !dbg !110
+  %1072 = fpext bfloat %1058 to float, !dbg !110
+  %1073 = fpext bfloat %1061 to float, !dbg !110
+  %1074 = fpext bfloat %1064 to float, !dbg !110
+  %1075 = fpext bfloat %1067 to float, !dbg !110
+  %1076 = fmul float %1028, %1068, !dbg !111
+  %1077 = fmul float %1029, %1069, !dbg !111
+  %1078 = fmul float %1030, %1070, !dbg !111
+  %1079 = fmul float %1031, %1071, !dbg !111
+  %1080 = fmul float %1032, %1072, !dbg !111
+  %1081 = fmul float %1033, %1073, !dbg !111
+  %1082 = fmul float %1034, %1074, !dbg !111
+  %1083 = fmul float %1035, %1075, !dbg !111
+  %1084 = select i1 %321, float %964, float %1076, !dbg !88
+  %1085 = select i1 %321, float %965, float %1077, !dbg !88
+  %1086 = select i1 %321, float %966, float %1078, !dbg !88
+  %1087 = select i1 %321, float %967, float %1079, !dbg !88
+  %1088 = select i1 %321, float %968, float %1080, !dbg !88
+  %1089 = select i1 %321, float %969, float %1081, !dbg !88
+  %1090 = select i1 %321, float %970, float %1082, !dbg !88
+  %1091 = select i1 %321, float %971, float %1083, !dbg !88
+  %1092 = fmul float %.0.i59, %505, !dbg !112
+  %1093 = fmul float %.0.i59, %506, !dbg !112
+  %1094 = fmul float %.0.i59, %507, !dbg !112
+  %1095 = fmul float %.0.i59, %508, !dbg !112
+  %1096 = fmul float %.0.i59, %509, !dbg !112
+  %1097 = fmul float %.0.i59, %510, !dbg !112
+  %1098 = fmul float %.0.i59, %511, !dbg !112
+  %1099 = fmul float %.0.i59, %512, !dbg !112
+  %1100 = fmul float %1092, %532, !dbg !113
+  %1101 = fmul float %1093, %533, !dbg !113
+  %1102 = fmul float %1094, %534, !dbg !113
+  %1103 = fmul float %1095, %535, !dbg !113
+  %1104 = fmul float %1096, %536, !dbg !113
+  %1105 = fmul float %1097, %537, !dbg !113
+  %1106 = fmul float %1098, %538, !dbg !113
+  %1107 = fmul float %1099, %539, !dbg !113
+  %1108 = fmul float %1100, %440, !dbg !114
+  %1109 = fmul float %1101, %441, !dbg !114
+  %1110 = fmul float %1102, %442, !dbg !114
+  %1111 = fmul float %1103, %443, !dbg !114
+  %1112 = fmul float %1104, %450, !dbg !114
+  %1113 = fmul float %1105, %451, !dbg !114
+  %1114 = fmul float %1106, %452, !dbg !114
+  %1115 = fmul float %1107, %453, !dbg !114
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !114
+  store float %1108, ptr addrspace(3) %290, align 4, !dbg !114
+  store float %1109, ptr addrspace(3) %291, align 4, !dbg !114
+  store float %1110, ptr addrspace(3) %292, align 4, !dbg !114
+  store float %1111, ptr addrspace(3) %293, align 4, !dbg !114
+  store float %1112, ptr addrspace(3) %294, align 4, !dbg !114
+  store float %1113, ptr addrspace(3) %295, align 4, !dbg !114
+  store float %1114, ptr addrspace(3) %296, align 4, !dbg !114
+  store float %1115, ptr addrspace(3) %297, align 4, !dbg !114
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !114
+  %1116 = load float, ptr addrspace(3) %306, align 4, !dbg !114
+  %1117 = load float, ptr addrspace(3) %308, align 4, !dbg !114
+  %1118 = load float, ptr addrspace(3) %310, align 4, !dbg !114
+  %1119 = load float, ptr addrspace(3) %312, align 4, !dbg !114
+  %1120 = load float, ptr addrspace(3) %314, align 4, !dbg !114
+  %1121 = load float, ptr addrspace(3) %316, align 4, !dbg !114
+  %1122 = load float, ptr addrspace(3) %318, align 4, !dbg !114
+  %1123 = load float, ptr addrspace(3) %320, align 4, !dbg !114
+  %1124 = fmul float %476, %1084, !dbg !115
+  %1125 = fmul float %477, %1085, !dbg !115
+  %1126 = fmul float %478, %1086, !dbg !115
+  %1127 = fmul float %479, %1087, !dbg !115
+  %1128 = fmul float %480, %1088, !dbg !115
+  %1129 = fmul float %481, %1089, !dbg !115
+  %1130 = fmul float %482, %1090, !dbg !115
+  %1131 = fmul float %483, %1091, !dbg !115
+  %1132 = fadd float %1124, %1116, !dbg !116
+  %1133 = fadd float %1125, %1117, !dbg !116
+  %1134 = fadd float %1126, %1118, !dbg !116
+  %1135 = fadd float %1127, %1119, !dbg !116
+  %1136 = fadd float %1128, %1120, !dbg !116
+  %1137 = fadd float %1129, %1121, !dbg !116
+  %1138 = fadd float %1130, %1122, !dbg !116
+  %1139 = fadd float %1131, %1123, !dbg !116
+  %1140 = or disjoint i64 %365, %361, !dbg !117
+  %1141 = getelementptr bfloat, ptr addrspace(1) %0, i64 %1140, !dbg !118
+  %1142 = fptrunc float %844 to bfloat, !dbg !119
+  %1143 = fptrunc float %845 to bfloat, !dbg !119
+  %1144 = fptrunc float %846 to bfloat, !dbg !119
+  %1145 = fptrunc float %847 to bfloat, !dbg !119
+  %1146 = fptrunc float %848 to bfloat, !dbg !119
+  %1147 = fptrunc float %849 to bfloat, !dbg !119
+  %1148 = fptrunc float %850 to bfloat, !dbg !119
+  %1149 = fptrunc float %851 to bfloat, !dbg !119
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119
+  store bfloat %1142, ptr addrspace(3) %336, align 2, !dbg !119
+  store bfloat %1143, ptr addrspace(3) %338, align 2, !dbg !119
+  store bfloat %1144, ptr addrspace(3) %340, align 2, !dbg !119
+  store bfloat %1145, ptr addrspace(3) %342, align 2, !dbg !119
+  store bfloat %1146, ptr addrspace(3) %344, align 2, !dbg !119
+  store bfloat %1147, ptr addrspace(3) %346, align 2, !dbg !119
+  store bfloat %1148, ptr addrspace(3) %348, align 2, !dbg !119
+  store bfloat %1149, ptr addrspace(3) %350, align 2, !dbg !119
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119
+  %1150 = load i32, ptr addrspace(3) %355, align 4, !dbg !119
+  %1151 = load i32, ptr addrspace(3) %356, align 4, !dbg !119
+  %1152 = load i32, ptr addrspace(3) %357, align 4, !dbg !119
+  %1153 = load i32, ptr addrspace(3) %358, align 4, !dbg !119
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %1150, i32 %1151, i32 %1152, i32 %1153, ptr addrspace(1) %1141, i1 true) #6, !dbg !119
+  %1154 = getelementptr bfloat, ptr addrspace(1) %1, i64 %1140, !dbg !120
+  %1155 = fptrunc float %1132 to bfloat, !dbg !121
+  %1156 = fptrunc float %1133 to bfloat, !dbg !121
+  %1157 = fptrunc float %1134 to bfloat, !dbg !121
+  %1158 = fptrunc float %1135 to bfloat, !dbg !121
+  %1159 = fptrunc float %1136 to bfloat, !dbg !121
+  %1160 = fptrunc float %1137 to bfloat, !dbg !121
+  %1161 = fptrunc float %1138 to bfloat, !dbg !121
+  %1162 = fptrunc float %1139 to bfloat, !dbg !121
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !121
+  store bfloat %1155, ptr addrspace(3) %336, align 2, !dbg !121
+  store bfloat %1156, ptr addrspace(3) %338, align 2, !dbg !121
+  store bfloat %1157, ptr addrspace(3) %340, align 2, !dbg !121
+  store bfloat %1158, ptr addrspace(3) %342, align 2, !dbg !121
+  store bfloat %1159, ptr addrspace(3) %344, align 2, !dbg !121
+  store bfloat %1160, ptr addrspace(3) %346, align 2, !dbg !121
+  store bfloat %1161, ptr addrspace(3) %348, align 2, !dbg !121
+  store bfloat %1162, ptr addrspace(3) %350, align 2, !dbg !121
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !121
+  %1163 = load i32, ptr addrspace(3) %355, align 4, !dbg !121
+  %1164 = load i32, ptr addrspace(3) %356, align 4, !dbg !121
+  %1165 = load i32, ptr addrspace(3) %357, align 4, !dbg !121
+  %1166 = load i32, ptr addrspace(3) %358, align 4, !dbg !121
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %1163, i32 %1164, i32 %1165, i32 %1166, ptr addrspace(1) %1154, i1 true) #6, !dbg !121
+  br i1 %364, label %363, label %1167, !dbg !47
+
+1167:                                             ; preds = %363
+  ret void, !dbg !122
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #3
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #3
+
+attributes #0 = { nounwind "nvvm.reqntid"="512" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #4 = { convergent nocallback nounwind }
+attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!5 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", linkageName: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 23, column: 28, scope: !5)
+!9 = !DILocation(line: 23, column: 33, scope: !5)
+!10 = !DILocation(line: 24, column: 44, scope: !5)
+!11 = !DILocation(line: 24, column: 23, scope: !5)
+!12 = !DILocation(line: 26, column: 37, scope: !5)
+!13 = !DILocation(line: 29, column: 19, scope: !5)
+!14 = !DILocation(line: 33, column: 43, scope: !5)
+!15 = !DILocation(line: 39, column: 57, scope: !5)
+!16 = !DILocation(line: 39, column: 34, scope: !5)
+!17 = !DILocation(line: 39, column: 68, scope: !5)
+!18 = !DILocation(line: 39, column: 121, scope: !5)
+!19 = !DILocation(line: 40, column: 50, scope: !5)
+!20 = !DILocation(line: 40, column: 34, scope: !5)
+!21 = !DILocation(line: 40, column: 61, scope: !5)
+!22 = !DILocation(line: 40, column: 114, scope: !5)
+!23 = !DILocation(line: 42, column: 22, scope: !5)
+!24 = !DILocation(line: 47, column: 22, scope: !5)
+!25 = !DILocation(line: 34, column: 31, scope: !5)
+!26 = !DILocation(line: 44, column: 23, scope: !5)
+!27 = !DILocation(line: 49, column: 25, scope: !5)
+!28 = !DILocation(line: 263, column: 15, scope: !29, inlinedAt: !31)
+!29 = distinct !DILexicalBlockFile(scope: !5, file: !30, discriminator: 0)
+!30 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!31 = !DILocation(line: 293, column: 36, scope: !29, inlinedAt: !32)
+!32 = !DILocation(line: 51, column: 25, scope: !33)
+!33 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0)
+!34 = !DILocation(line: 263, column: 15, scope: !29, inlinedAt: !35)
+!35 = !DILocation(line: 293, column: 36, scope: !29, inlinedAt: !36)
+!36 = !DILocation(line: 52, column: 27, scope: !33)
+!37 = !DILocation(line: 63, column: 46, scope: !5)
+!38 = !DILocation(line: 75, column: 25, scope: !5)
+!39 = !DILocation(line: 77, column: 24, scope: !5)
+!40 = !DILocation(line: 78, column: 32, scope: !5)
+!41 = !DILocation(line: 79, column: 24, scope: !5)
+!42 = !DILocation(line: 123, column: 24, scope: !5)
+!43 = !DILocation(line: 124, column: 24, scope: !5)
+!44 = !DILocation(line: 125, column: 32, scope: !5)
+!45 = !DILocation(line: 126, column: 24, scope: !5)
+!46 = !DILocation(line: 161, column: 43, scope: !5)
+!47 = !DILocation(line: 53, column: 43, scope: !5)
+!48 = !DILocation(line: 54, column: 31, scope: !5)
+!49 = !DILocation(line: 72, column: 41, scope: !5)
+!50 = !DILocation(line: 61, column: 51, scope: !5)
+!51 = !DILocation(line: 61, column: 35, scope: !5)
+!52 = !DILocation(line: 61, column: 62, scope: !5)
+!53 = !DILocation(line: 61, column: 115, scope: !5)
+!54 = !DILocation(line: 62, column: 35, scope: !5)
+!55 = !DILocation(line: 62, column: 42, scope: !5)
+!56 = !DILocation(line: 62, column: 95, scope: !5)
+!57 = !DILocation(line: 63, column: 42, scope: !5)
+!58 = !DILocation(line: 63, column: 35, scope: !5)
+!59 = !DILocation(line: 63, column: 51, scope: !5)
+!60 = !DILocation(line: 64, column: 35, scope: !5)
+!61 = !DILocation(line: 64, column: 51, scope: !5)
+!62 = !DILocation(line: 65, column: 58, scope: !5)
+!63 = !DILocation(line: 65, column: 35, scope: !5)
+!64 = !DILocation(line: 65, column: 69, scope: !5)
+!65 = !DILocation(line: 65, column: 123, scope: !5)
+!66 = !DILocation(line: 66, column: 36, scope: !5)
+!67 = !DILocation(line: 66, column: 43, scope: !5)
+!68 = !DILocation(line: 66, column: 96, scope: !5)
+!69 = !DILocation(line: 72, column: 39, scope: !5)
+!70 = !DILocation(line: 72, column: 57, scope: !5)
+!71 = !DILocation(line: 72, column: 35, scope: !5)
+!72 = !DILocation(line: 72, column: 68, scope: !5)
+!73 = !DILocation(line: 72, column: 129, scope: !5)
+!74 = !DILocation(line: 80, column: 35, scope: !5)
+!75 = !DILocation(line: 80, column: 85, scope: !5)
+!76 = !DILocation(line: 80, column: 146, scope: !5)
+!77 = !DILocation(line: 82, column: 24, scope: !5)
+!78 = !DILocation(line: 84, column: 17, scope: !5)
+!79 = !DILocation(line: 90, column: 53, scope: !5)
+!80 = !DILocation(line: 90, column: 35, scope: !5)
+!81 = !DILocation(line: 90, column: 64, scope: !5)
+!82 = !DILocation(line: 90, column: 125, scope: !5)
+!83 = !DILocation(line: 97, column: 24, scope: !5)
+!84 = !DILocation(line: 98, column: 35, scope: !5)
+!85 = !DILocation(line: 98, column: 81, scope: !5)
+!86 = !DILocation(line: 98, column: 142, scope: !5)
+!87 = !DILocation(line: 100, column: 24, scope: !5)
+!88 = !DILocation(line: 0, scope: !5)
+!89 = !DILocation(line: 111, column: 24, scope: !5)
+!90 = !DILocation(line: 113, column: 24, scope: !5)
+!91 = !DILocation(line: 116, column: 24, scope: !5)
+!92 = !DILocation(line: 118, column: 24, scope: !5)
+!93 = !DILocation(line: 119, column: 24, scope: !5)
+!94 = !DILocation(line: 121, column: 60, scope: !5)
+!95 = !DILocation(line: 121, column: 35, scope: !5)
+!96 = !DILocation(line: 121, column: 71, scope: !5)
+!97 = !DILocation(line: 121, column: 132, scope: !5)
+!98 = !DILocation(line: 127, column: 35, scope: !5)
+!99 = !DILocation(line: 127, column: 85, scope: !5)
+!100 = !DILocation(line: 127, column: 146, scope: !5)
+!101 = !DILocation(line: 129, column: 24, scope: !5)
+!102 = !DILocation(line: 131, column: 17, scope: !5)
+!103 = !DILocation(line: 134, column: 60, scope: !5)
+!104 = !DILocation(line: 134, column: 35, scope: !5)
+!105 = !DILocation(line: 134, column: 71, scope: !5)
+!106 = !DILocation(line: 134, column: 132, scope: !5)
+!107 = !DILocation(line: 139, column: 24, scope: !5)
+!108 = !DILocation(line: 140, column: 35, scope: !5)
+!109 = !DILocation(line: 140, column: 81, scope: !5)
+!110 = !DILocation(line: 140, column: 142, scope: !5)
+!111 = !DILocation(line: 142, column: 24, scope: !5)
+!112 = !DILocation(line: 151, column: 25, scope: !5)
+!113 = !DILocation(line: 153, column: 26, scope: !5)
+!114 = !DILocation(line: 156, column: 26, scope: !5)
+!115 = !DILocation(line: 158, column: 26, scope: !5)
+!116 = !DILocation(line: 159, column: 26, scope: !5)
+!117 = !DILocation(line: 161, column: 39, scope: !5)
+!118 = !DILocation(line: 161, column: 32, scope: !5)
+!119 = !DILocation(line: 161, column: 55, scope: !5)
+!120 = !DILocation(line: 162, column: 32, scope: !5)
+!121 = !DILocation(line: 162, column: 56, scope: !5)
+!122 = !DILocation(line: 53, column: 4, scope: !5)
diff --git a/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..6ba16afc7c5c89773bb586712c820d45b1a4eb19
--- /dev/null
+++ b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx
@@ -0,0 +1,2014 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 // -- Begin function triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0
+.extern .shared .align 16 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
+                                        // @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0
+.visible .entry triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6,
+	.param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_7,
+	.param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_8,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_9,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_10
+)
+.reqntid 512
+{
+	.reg .pred 	%p<6>;
+	.reg .b16 	%rs<146>;
+	.reg .b32 	%r<543>;
+	.reg .b64 	%rd<201>;
+	.loc	1 18 0                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0
+
+// %bb.0:                               // %__nv_rsqrtf.exit
+	ld.param.b64 	%rd11, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6];
+	ld.param.b64 	%rd10, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5];
+	ld.param.b64 	%rd9, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4];
+	ld.param.b64 	%rd8, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3];
+	ld.param.b64 	%rd7, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2];
+	ld.param.b64 	%rd6, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1];
+	ld.param.b64 	%rd5, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0];
+$L__tmp0:
+	.loc	1 23 28                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:28
+	mov.u32 	%r47, %ctaid.x;
+	.loc	1 23 33                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:33
+	shl.b32 	%r48, %r47, 6;
+	.loc	1 24 44                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44
+	mov.u32 	%r49, %tid.x;
+	and.b32 	%r50, %r49, 504;
+	bfe.u32 	%r51, %r49, 3, 6;
+	.loc	1 24 23                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23
+	or.b32 	%r52, %r51, %r48;
+	.loc	1 26 37                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37
+	and.b32 	%r53, %r49, 7;
+	shl.b32 	%r54, %r53, 3;
+	.loc	1 29 19                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19
+	bfe.s32 	%r55, %r47, 25, 1;
+	shr.u32 	%r56, %r55, 27;
+	add.s32 	%r57, %r52, %r56;
+	shr.s32 	%r58, %r57, 5;
+	shl.b32 	%r59, %r52, 7;
+	shl.b32 	%r60, %r58, 15;
+	add.s32 	%r1, %r60, %r59;
+	add.s32 	%r2, %r1, 4096;
+	.loc	1 33 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:33:43
+	cvt.u64.u32 	%rd1, %r54;
+	.loc	1 39 57                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:57
+	or.b32 	%r61, %r2, %r54;
+	.loc	1 39 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34
+	mad.wide.s32 	%rd12, %r61, 2, %rd7;
+	.loc	1 39 68                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68
+	// begin inline asm
+	mov.u64 %rd13, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd13, 1.0;
+	// end inline asm
+	mov.b32 	%r34, 0;
+	mov.pred 	%p2, -1;
+	// begin inline asm
+	mov.u32 %r30, %r34;
+	mov.u32 %r31, %r34;
+	mov.u32 %r32, %r34;
+	mov.u32 %r33, %r34;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r30, %r31, %r32, %r33 }, [ %rd12 + 0 ], %rd13;
+	// end inline asm
+	mov.b32 	{%rs1, %rs2}, %r30;
+	mov.b32 	{%rs3, %rs4}, %r31;
+	mov.b32 	{%rs5, %rs6}, %r32;
+	mov.b32 	{%rs7, %rs8}, %r33;
+	.loc	1 39 121                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:121
+	cvt.f32.bf16 	%r62, %rs1;
+	cvt.f32.bf16 	%r63, %rs2;
+	cvt.f32.bf16 	%r64, %rs3;
+	cvt.f32.bf16 	%r65, %rs4;
+	cvt.f32.bf16 	%r66, %rs5;
+	cvt.f32.bf16 	%r67, %rs6;
+	cvt.f32.bf16 	%r68, %rs7;
+	cvt.f32.bf16 	%r69, %rs8;
+	.loc	1 40 50                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:50
+	or.b32 	%r70, %r1, %r54;
+	.loc	1 40 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34
+	mad.wide.s32 	%rd14, %r70, 2, %rd7;
+	.loc	1 40 61                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61
+	// begin inline asm
+	mov.u64 %rd15, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd15, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r35, %r34;
+	mov.u32 %r36, %r34;
+	mov.u32 %r37, %r34;
+	mov.u32 %r38, %r34;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r35, %r36, %r37, %r38 }, [ %rd14 + 0 ], %rd15;
+	// end inline asm
+	mov.b32 	{%rs9, %rs10}, %r35;
+	mov.b32 	{%rs11, %rs12}, %r36;
+	mov.b32 	{%rs13, %rs14}, %r37;
+	mov.b32 	{%rs15, %rs16}, %r38;
+	.loc	1 40 114                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114
+	cvt.f32.bf16 	%r71, %rs9;
+	cvt.f32.bf16 	%r72, %rs10;
+	cvt.f32.bf16 	%r73, %rs11;
+	cvt.f32.bf16 	%r74, %rs12;
+	cvt.f32.bf16 	%r75, %rs13;
+	cvt.f32.bf16 	%r76, %rs14;
+	cvt.f32.bf16 	%r77, %rs15;
+	cvt.f32.bf16 	%r78, %rs16;
+	.loc	1 39 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34
+	cvt.s64.s32 	%rd20, %r2;
+	or.b64 	%rd21, %rd20, %rd1;
+	shl.b64 	%rd22, %rd21, 1;
+	add.s64 	%rd23, %rd7, %rd22;
+	add.s64 	%rd16, %rd23, 128;
+	.loc	1 39 68                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68
+	// begin inline asm
+	mov.u64 %rd17, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r39, %r34;
+	mov.u32 %r40, %r34;
+	mov.u32 %r41, %r34;
+	mov.u32 %r42, %r34;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r39, %r40, %r41, %r42 }, [ %rd16 + 0 ], %rd17;
+	// end inline asm
+	mov.b32 	{%rs17, %rs18}, %r39;
+	mov.b32 	{%rs19, %rs20}, %r40;
+	mov.b32 	{%rs21, %rs22}, %r41;
+	mov.b32 	{%rs23, %rs24}, %r42;
+	.loc	1 39 121                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:121
+	cvt.f32.bf16 	%r79, %rs17;
+	cvt.f32.bf16 	%r80, %rs18;
+	cvt.f32.bf16 	%r81, %rs19;
+	cvt.f32.bf16 	%r82, %rs20;
+	cvt.f32.bf16 	%r83, %rs21;
+	cvt.f32.bf16 	%r84, %rs22;
+	cvt.f32.bf16 	%r85, %rs23;
+	cvt.f32.bf16 	%r86, %rs24;
+	.loc	1 40 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34
+	cvt.s64.s32 	%rd24, %r1;
+	or.b64 	%rd25, %rd24, %rd1;
+	shl.b64 	%rd26, %rd25, 1;
+	add.s64 	%rd27, %rd7, %rd26;
+	add.s64 	%rd18, %rd27, 128;
+	.loc	1 40 61                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61
+	// begin inline asm
+	mov.u64 %rd19, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd19, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r43, %r34;
+	mov.u32 %r44, %r34;
+	mov.u32 %r45, %r34;
+	mov.u32 %r46, %r34;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r43, %r44, %r45, %r46 }, [ %rd18 + 0 ], %rd19;
+	// end inline asm
+	mov.b32 	{%rs25, %rs26}, %r43;
+	mov.b32 	{%rs27, %rs28}, %r44;
+	mov.b32 	{%rs29, %rs30}, %r45;
+	mov.b32 	{%rs31, %rs32}, %r46;
+	.loc	1 40 114                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114
+	cvt.f32.bf16 	%r87, %rs25;
+	cvt.f32.bf16 	%r88, %rs26;
+	cvt.f32.bf16 	%r89, %rs27;
+	cvt.f32.bf16 	%r90, %rs28;
+	cvt.f32.bf16 	%r91, %rs29;
+	cvt.f32.bf16 	%r92, %rs30;
+	cvt.f32.bf16 	%r93, %rs31;
+	cvt.f32.bf16 	%r94, %rs32;
+	.loc	1 42 22                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:42:22
+	mul.f32 	%r95, %r79, %r79;
+	mul.f32 	%r96, %r80, %r80;
+	mul.f32 	%r97, %r81, %r81;
+	mul.f32 	%r98, %r82, %r82;
+	mul.f32 	%r99, %r83, %r83;
+	mul.f32 	%r100, %r84, %r84;
+	mul.f32 	%r101, %r85, %r85;
+	mul.f32 	%r102, %r86, %r86;
+	.loc	1 44 23                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:44:23
+	fma.rn.f32 	%r103, %r62, %r62, %r95;
+	fma.rn.f32 	%r104, %r63, %r63, %r96;
+	fma.rn.f32 	%r105, %r64, %r64, %r97;
+	fma.rn.f32 	%r106, %r65, %r65, %r98;
+	fma.rn.f32 	%r107, %r66, %r66, %r99;
+	fma.rn.f32 	%r108, %r67, %r67, %r100;
+	fma.rn.f32 	%r109, %r68, %r68, %r101;
+	fma.rn.f32 	%r110, %r69, %r69, %r102;
+	.loc	1 47 22                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:47:22
+	mul.f32 	%r111, %r87, %r87;
+	mul.f32 	%r112, %r88, %r88;
+	mul.f32 	%r113, %r89, %r89;
+	mul.f32 	%r114, %r90, %r90;
+	mul.f32 	%r115, %r91, %r91;
+	mul.f32 	%r116, %r92, %r92;
+	mul.f32 	%r117, %r93, %r93;
+	mul.f32 	%r118, %r94, %r94;
+	.loc	1 49 25                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:49:25
+	fma.rn.f32 	%r119, %r71, %r71, %r111;
+	fma.rn.f32 	%r120, %r72, %r72, %r112;
+	fma.rn.f32 	%r121, %r73, %r73, %r113;
+	fma.rn.f32 	%r122, %r74, %r74, %r114;
+	fma.rn.f32 	%r123, %r75, %r75, %r115;
+	fma.rn.f32 	%r124, %r76, %r76, %r116;
+	fma.rn.f32 	%r125, %r77, %r77, %r117;
+	fma.rn.f32 	%r126, %r78, %r78, %r118;
+	.loc	1 24 44                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44
+	and.b32 	%r127, %r49, 63;
+	.loc	1 24 23                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23
+	or.b32 	%r128, %r48, %r127;
+	.loc	1 26 37                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37
+	shr.u32 	%r129, %r49, 6;
+	and.b32 	%r130, %r129, 6;
+	.loc	1 29 19                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19
+	add.s32 	%r131, %r128, %r56;
+$L__tmp1:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r132, %r103, %r104;
+	add.f32 	%r133, %r105, %r132;
+	add.f32 	%r134, %r106, %r133;
+	add.f32 	%r135, %r107, %r134;
+	add.f32 	%r136, %r108, %r135;
+	add.f32 	%r137, %r109, %r136;
+	add.f32 	%r138, %r110, %r137;
+$L__tmp2:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r139, %r138, 4, 31, -1;
+$L__tmp3:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r140, %r138, %r139;
+$L__tmp4:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r141, %r140, 2, 31, -1;
+$L__tmp5:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r142, %r140, %r141;
+$L__tmp6:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r143, %r142, 1, 31, -1;
+$L__tmp7:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r144, %r142, %r143;
+$L__tmp8:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r145, %r119, %r120;
+	add.f32 	%r146, %r121, %r145;
+	add.f32 	%r147, %r122, %r146;
+	add.f32 	%r148, %r123, %r147;
+	add.f32 	%r149, %r124, %r148;
+	add.f32 	%r150, %r125, %r149;
+	add.f32 	%r151, %r126, %r150;
+$L__tmp9:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r152, %r151, 4, 31, -1;
+$L__tmp10:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r153, %r151, %r152;
+$L__tmp11:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r154, %r153, 2, 31, -1;
+$L__tmp12:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r155, %r153, %r154;
+$L__tmp13:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r156, %r155, 1, 31, -1;
+$L__tmp14:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r157, %r155, %r156;
+$L__tmp15:
+	.loc	1 63 46                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:46
+	shl.b32 	%r158, %r58, 7;
+	mov.b32 	%r159, 0f43000000;
+	.loc	1 75 25                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:75:25
+	div.full.f32 	%r160, %r157, %r159;
+	.loc	1 77 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:77:24
+	add.f32 	%r161, %r160, 0f358637BD;
+	.loc	1 78 32                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:78:32
+	rsqrt.approx.ftz.f32 	%r3, %r161;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	shr.u32 	%r162, %r50, 1;
+	mov.b32 	%r163, global_smem;
+	add.s32 	%r164, %r163, %r162;
+	st.shared.b32 	[%r164], %r3;
+	bar.sync 	0;
+	shl.b32 	%r165, %r127, 2;
+	add.s32 	%r166, %r163, %r165;
+	ld.shared.b32 	%r4, [%r166];
+	.loc	1 123 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:123:24
+	div.full.f32 	%r167, %r144, %r159;
+	.loc	1 124 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:124:24
+	add.f32 	%r168, %r167, 0f358637BD;
+	.loc	1 125 32                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:125:32
+	rsqrt.approx.ftz.f32 	%r5, %r168;
+	.loc	1 126 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24
+	bar.sync 	0;
+	st.shared.b32 	[%r164], %r5;
+	bar.sync 	0;
+	ld.shared.b32 	%r6, [%r166];
+	bfe.u32 	%r7, %r49, 6, 1;
+	or.b32 	%r8, %r54, %r158;
+	and.b32 	%r169, %r49, 224;
+	shl.b32 	%r170, %r169, 6;
+	shl.b32 	%r171, %r49, 2;
+	and.b32 	%r172, %r171, 124;
+	shr.u32 	%r173, %r169, 3;
+	shr.u32 	%r174, %r49, 1;
+	and.b32 	%r175, %r174, 128;
+	or.b32 	%r176, %r170, %r172;
+	xor.b32 	%r177, %r176, %r173;
+	add.s32 	%r178, %r163, %r175;
+	add.s32 	%r9, %r178, %r177;
+	and.b32 	%r179, %r49, 28;
+	shl.b32 	%r180, %r179, 9;
+	shl.b32 	%r181, %r49, 5;
+	and.b32 	%r182, %r181, 96;
+	and.b32 	%r183, %r171, 1920;
+	or.b32 	%r184, %r180, %r182;
+	or.b32 	%r185, %r184, %r183;
+	or.b32 	%r186, %r185, %r179;
+	add.s32 	%r10, %r163, %r186;
+	xor.b32 	%r187, %r186, 4;
+	add.s32 	%r11, %r163, %r187;
+	xor.b32 	%r188, %r186, 8;
+	add.s32 	%r12, %r163, %r188;
+	xor.b32 	%r189, %r186, 12;
+	add.s32 	%r13, %r163, %r189;
+	xor.b32 	%r190, %r186, 16;
+	add.s32 	%r14, %r163, %r190;
+	xor.b32 	%r191, %r186, 20;
+	add.s32 	%r15, %r163, %r191;
+	xor.b32 	%r192, %r186, 24;
+	add.s32 	%r16, %r163, %r192;
+	xor.b32 	%r193, %r186, 28;
+	add.s32 	%r17, %r163, %r193;
+	shl.b32 	%r194, %r128, 7;
+	shl.b32 	%r195, %r131, 10;
+	and.b32 	%r196, %r195, -32768;
+	add.s32 	%r18, %r196, %r194;
+	add.s32 	%r19, %r18, 4097;
+	add.s32 	%r20, %r18, 4096;
+	shl.b32 	%r197, %r179, 8;
+	shl.b32 	%r198, %r49, 1;
+	and.b32 	%r199, %r198, 768;
+	shr.u32 	%r200, %r49, 5;
+	and.b32 	%r201, %r200, 2;
+	or.b32 	%r202, %r199, %r201;
+	or.b32 	%r203, %r202, %r197;
+	or.b32 	%r204, %r203, %r165;
+	add.s32 	%r21, %r163, %r204;
+	xor.b32 	%r205, %r204, 16;
+	add.s32 	%r22, %r163, %r205;
+	xor.b32 	%r206, %r204, 32;
+	add.s32 	%r23, %r163, %r206;
+	xor.b32 	%r207, %r204, 48;
+	add.s32 	%r24, %r163, %r207;
+	xor.b32 	%r208, %r204, 64;
+	add.s32 	%r25, %r163, %r208;
+	xor.b32 	%r209, %r204, 80;
+	add.s32 	%r26, %r163, %r209;
+	xor.b32 	%r210, %r204, 96;
+	add.s32 	%r27, %r163, %r210;
+	xor.b32 	%r211, %r204, 112;
+	add.s32 	%r28, %r163, %r211;
+	shl.b32 	%r212, %r169, 5;
+	shl.b32 	%r213, %r53, 4;
+	or.b32 	%r214, %r212, %r213;
+	xor.b32 	%r215, %r214, %r162;
+	add.s32 	%r29, %r163, %r215;
+	.loc	1 53 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43
+	cvt.u64.u32 	%rd2, %r130;
+	cvt.s64.s32 	%rd3, %r158;
+	cvt.s64.s32 	%rd4, %r59;
+	mov.b64 	%rd200, 0;
+	mov.pred 	%p5, %p2;
+$L__BB0_1:                              // =>This Inner Loop Header: Depth=1
+	.loc	1 0 43                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0:43
+	mov.pred 	%p1, %p5;
+	setp.ne.b32 	%p4, %r7, 0;
+	setp.eq.b32 	%p3, %r7, 0;
+	.loc	1 54 31                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:54:31
+	or.b64 	%rd174, %rd200, %rd1;
+	or.b64 	%rd175, %rd200, %rd2;
+	.loc	1 61 51                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:51
+	cvt.u32.u64 	%r256, %rd174;
+	or.b32 	%r257, %r1, %r256;
+	.loc	1 61 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:35
+	mad.wide.s32 	%rd29, %r257, 2, %rd7;
+	.loc	1 61 62                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:62
+	// begin inline asm
+	mov.u64 %rd28, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd28, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r216, %r34;
+	mov.u32 %r217, %r34;
+	mov.u32 %r218, %r34;
+	mov.u32 %r219, %r34;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r216, %r217, %r218, %r219 }, [ %rd29 + 0 ], %rd28;
+	// end inline asm
+	mov.b32 	{%rs98, %rs99}, %r216;
+	mov.b32 	{%rs100, %rs101}, %r217;
+	mov.b32 	{%rs102, %rs103}, %r218;
+	mov.b32 	{%rs104, %rs105}, %r219;
+	.loc	1 61 115                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:115
+	cvt.f32.bf16 	%r258, %rs98;
+	cvt.f32.bf16 	%r259, %rs99;
+	cvt.f32.bf16 	%r260, %rs100;
+	cvt.f32.bf16 	%r261, %rs101;
+	cvt.f32.bf16 	%r262, %rs102;
+	cvt.f32.bf16 	%r263, %rs103;
+	cvt.f32.bf16 	%r264, %rs104;
+	cvt.f32.bf16 	%r265, %rs105;
+	.loc	1 62 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:35
+	shl.b64 	%rd176, %rd174, 1;
+	add.s64 	%rd31, %rd8, %rd176;
+	.loc	1 62 42                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:42
+	// begin inline asm
+	mov.u64 %rd30, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd30, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r220, %r34;
+	mov.u32 %r221, %r34;
+	mov.u32 %r222, %r34;
+	mov.u32 %r223, %r34;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r220, %r221, %r222, %r223 }, [ %rd31 + 0 ], %rd30;
+	// end inline asm
+	mov.b32 	{%rs106, %rs107}, %r220;
+	mov.b32 	{%rs108, %rs109}, %r221;
+	mov.b32 	{%rs110, %rs111}, %r222;
+	mov.b32 	{%rs112, %rs113}, %r223;
+	.loc	1 62 95                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:95
+	cvt.f32.bf16 	%r266, %rs106;
+	cvt.f32.bf16 	%r267, %rs107;
+	cvt.f32.bf16 	%r268, %rs108;
+	cvt.f32.bf16 	%r269, %rs109;
+	cvt.f32.bf16 	%r270, %rs110;
+	cvt.f32.bf16 	%r271, %rs111;
+	cvt.f32.bf16 	%r272, %rs112;
+	cvt.f32.bf16 	%r273, %rs113;
+	.loc	1 63 42                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:42
+	or.b64 	%rd177, %rd174, %rd3;
+	.loc	1 63 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:35
+	shl.b64 	%rd178, %rd177, 2;
+	add.s64 	%rd33, %rd9, %rd178;
+	.loc	1 53 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43
+	cvt.s64.s32 	%rd179, %r8;
+	add.s64 	%rd180, %rd200, %rd179;
+	.loc	1 63 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:35
+	shl.b64 	%rd181, %rd180, 2;
+	add.s64 	%rd182, %rd9, %rd181;
+	add.s64 	%rd35, %rd182, 16;
+	.loc	1 63 51                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:51
+	// begin inline asm
+	mov.u64 %rd32, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd32, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r224, %r34;
+	mov.u32 %r225, %r34;
+	mov.u32 %r226, %r34;
+	mov.u32 %r227, %r34;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r224, %r225, %r226, %r227 }, [ %rd33 + 0 ], %rd32;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd34, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd34, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r228, %r34;
+	mov.u32 %r229, %r34;
+	mov.u32 %r230, %r34;
+	mov.u32 %r231, %r34;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r228, %r229, %r230, %r231 }, [ %rd35 + 0 ], %rd34;
+	// end inline asm
+	.loc	1 64 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:35
+	add.s64 	%rd37, %rd10, %rd178;
+	add.s64 	%rd183, %rd10, %rd181;
+	add.s64 	%rd39, %rd183, 16;
+	.loc	1 64 51                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:51
+	// begin inline asm
+	mov.u64 %rd36, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd36, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r232, %r34;
+	mov.u32 %r233, %r34;
+	mov.u32 %r234, %r34;
+	mov.u32 %r235, %r34;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r232, %r233, %r234, %r235 }, [ %rd37 + 0 ], %rd36;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd38, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd38, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r236, %r34;
+	mov.u32 %r237, %r34;
+	mov.u32 %r238, %r34;
+	mov.u32 %r239, %r34;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r236, %r237, %r238, %r239 }, [ %rd39 + 0 ], %rd38;
+	// end inline asm
+	bar.sync 	0;
+	st.shared.b32 	[%r9], %r232;
+	st.shared.b32 	[%r9+256], %r233;
+	st.shared.b32 	[%r9+512], %r234;
+	st.shared.b32 	[%r9+768], %r235;
+	st.shared.b32 	[%r9+1024], %r236;
+	st.shared.b32 	[%r9+1280], %r237;
+	st.shared.b32 	[%r9+1536], %r238;
+	st.shared.b32 	[%r9+1792], %r239;
+	bar.sync 	0;
+	ld.shared.b32 	%r274, [%r10];
+	ld.shared.b32 	%r275, [%r11];
+	ld.shared.b32 	%r276, [%r12];
+	ld.shared.b32 	%r277, [%r13];
+	ld.shared.b32 	%r278, [%r14];
+	ld.shared.b32 	%r279, [%r15];
+	ld.shared.b32 	%r280, [%r16];
+	ld.shared.b32 	%r281, [%r17];
+	.loc	1 65 58                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:58
+	or.b32 	%r282, %r2, %r256;
+	.loc	1 65 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:35
+	mad.wide.s32 	%rd41, %r282, 2, %rd7;
+	.loc	1 65 69                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69
+	// begin inline asm
+	mov.u64 %rd40, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd40, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r240, %r34;
+	mov.u32 %r241, %r34;
+	mov.u32 %r242, %r34;
+	mov.u32 %r243, %r34;
+	@%p2 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r240, %r241, %r242, %r243 }, [ %rd41 + 0 ], %rd40;
+	// end inline asm
+	mov.b32 	{%rs114, %rs115}, %r240;
+	mov.b32 	{%rs116, %rs117}, %r241;
+	mov.b32 	{%rs118, %rs119}, %r242;
+	mov.b32 	{%rs120, %rs121}, %r243;
+	.loc	1 65 123                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:123
+	cvt.f32.bf16 	%r283, %rs114;
+	cvt.f32.bf16 	%r284, %rs115;
+	cvt.f32.bf16 	%r285, %rs116;
+	cvt.f32.bf16 	%r286, %rs117;
+	cvt.f32.bf16 	%r287, %rs118;
+	cvt.f32.bf16 	%r288, %rs119;
+	cvt.f32.bf16 	%r289, %rs120;
+	cvt.f32.bf16 	%r290, %rs121;
+	.loc	1 66 36                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:36
+	add.s64 	%rd43, %rd11, %rd176;
+	.loc	1 66 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:43
+	// begin inline asm
+	mov.u64 %rd42, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd42, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r244, %r34;
+	mov.u32 %r245, %r34;
+	mov.u32 %r246, %r34;
+	mov.u32 %r247, %r34;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r244, %r245, %r246, %r247 }, [ %rd43 + 0 ], %rd42;
+	// end inline asm
+	mov.b32 	{%rs122, %rs123}, %r244;
+	mov.b32 	{%rs124, %rs125}, %r245;
+	mov.b32 	{%rs126, %rs127}, %r246;
+	mov.b32 	{%rs128, %rs129}, %r247;
+	.loc	1 66 96                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:96
+	cvt.f32.bf16 	%r291, %rs122;
+	cvt.f32.bf16 	%r292, %rs123;
+	cvt.f32.bf16 	%r293, %rs124;
+	cvt.f32.bf16 	%r294, %rs125;
+	cvt.f32.bf16 	%r295, %rs126;
+	cvt.f32.bf16 	%r296, %rs127;
+	cvt.f32.bf16 	%r297, %rs128;
+	cvt.f32.bf16 	%r298, %rs129;
+	.loc	1 72 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35
+	cvt.s64.s32 	%rd184, %r18;
+	.loc	1 72 57                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:57
+	cvt.u32.u64 	%r299, %rd175;
+	.loc	1 72 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35
+	cvt.s64.s32 	%rd185, %rd175;
+	add.s64 	%rd186, %rd184, %rd185;
+	shl.b64 	%rd187, %rd186, 1;
+	add.s64 	%rd188, %rd7, %rd187;
+	add.s64 	%rd45, %rd188, 2;
+	add.s64 	%rd47, %rd188, 18;
+	add.s64 	%rd49, %rd188, 34;
+	add.s64 	%rd51, %rd188, 50;
+	add.s64 	%rd53, %rd188, 66;
+	add.s64 	%rd55, %rd188, 82;
+	add.s64 	%rd57, %rd188, 98;
+	add.s64 	%rd59, %rd188, 114;
+	.loc	1 72 68                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:68
+	// begin inline asm
+	mov.u64 %rd44, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd44, 1.0;
+	// end inline asm
+	mov.b16 	%rs34, 0;
+	// begin inline asm
+	mov.u16 %rs33, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs33 }, [ %rd45 + 0 ], %rd44;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd46, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd46, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs35, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs35 }, [ %rd47 + 0 ], %rd46;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd48, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd48, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs36, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs36 }, [ %rd49 + 0 ], %rd48;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd50, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd50, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs37, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs37 }, [ %rd51 + 0 ], %rd50;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd52, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd52, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs38, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs38 }, [ %rd53 + 0 ], %rd52;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd54, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd54, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs39, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs39 }, [ %rd55 + 0 ], %rd54;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd56, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd56, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs40, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs40 }, [ %rd57 + 0 ], %rd56;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd58, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd58, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs41, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs41 }, [ %rd59 + 0 ], %rd58;
+	// end inline asm
+	.loc	1 72 129                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129
+	cvt.f32.bf16 	%r300, %rs33;
+	cvt.f32.bf16 	%r301, %rs35;
+	cvt.f32.bf16 	%r302, %rs36;
+	cvt.f32.bf16 	%r303, %rs37;
+	cvt.f32.bf16 	%r304, %rs38;
+	cvt.f32.bf16 	%r305, %rs39;
+	cvt.f32.bf16 	%r306, %rs40;
+	cvt.f32.bf16 	%r307, %rs41;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	mul.f32 	%r308, %r4, %r300;
+	mul.f32 	%r309, %r4, %r301;
+	mul.f32 	%r310, %r4, %r302;
+	mul.f32 	%r311, %r4, %r303;
+	mul.f32 	%r312, %r4, %r304;
+	mul.f32 	%r313, %r4, %r305;
+	mul.f32 	%r314, %r4, %r306;
+	mul.f32 	%r315, %r4, %r307;
+	.loc	1 80 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:35
+	shl.b64 	%rd189, %rd175, 1;
+	add.s64 	%rd93, %rd8, %rd189;
+	add.s64 	%rd61, %rd93, 2;
+	add.s64 	%rd63, %rd93, 18;
+	add.s64 	%rd65, %rd93, 34;
+	add.s64 	%rd67, %rd93, 50;
+	add.s64 	%rd69, %rd93, 66;
+	add.s64 	%rd71, %rd93, 82;
+	add.s64 	%rd73, %rd93, 98;
+	add.s64 	%rd75, %rd93, 114;
+	.loc	1 80 85                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:85
+	// begin inline asm
+	mov.u64 %rd60, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd60, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs42, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs42 }, [ %rd61 + 0 ], %rd60;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd62, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd62, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs43, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs43 }, [ %rd63 + 0 ], %rd62;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd64, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd64, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs44, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs44 }, [ %rd65 + 0 ], %rd64;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd66, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd66, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs45, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs45 }, [ %rd67 + 0 ], %rd66;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd68, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd68, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs46, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs46 }, [ %rd69 + 0 ], %rd68;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd70, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd70, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs47, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs47 }, [ %rd71 + 0 ], %rd70;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd72, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd72, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs48, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs48 }, [ %rd73 + 0 ], %rd72;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd74, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd74, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs49, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs49 }, [ %rd75 + 0 ], %rd74;
+	// end inline asm
+	.loc	1 80 146                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146
+	cvt.f32.bf16 	%r316, %rs42;
+	cvt.f32.bf16 	%r317, %rs43;
+	cvt.f32.bf16 	%r318, %rs44;
+	cvt.f32.bf16 	%r319, %rs45;
+	cvt.f32.bf16 	%r320, %rs46;
+	cvt.f32.bf16 	%r321, %rs47;
+	cvt.f32.bf16 	%r322, %rs48;
+	cvt.f32.bf16 	%r323, %rs49;
+	.loc	1 84 17                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17
+	neg.f32 	%r324, %r308;
+	fma.rn.f32 	%r325, %r324, %r316, 0f00000000;
+	neg.f32 	%r326, %r309;
+	fma.rn.f32 	%r327, %r326, %r317, 0f00000000;
+	neg.f32 	%r328, %r310;
+	fma.rn.f32 	%r329, %r328, %r318, 0f00000000;
+	neg.f32 	%r330, %r311;
+	fma.rn.f32 	%r331, %r330, %r319, 0f00000000;
+	neg.f32 	%r332, %r312;
+	fma.rn.f32 	%r333, %r332, %r320, 0f00000000;
+	neg.f32 	%r334, %r313;
+	fma.rn.f32 	%r335, %r334, %r321, 0f00000000;
+	neg.f32 	%r336, %r314;
+	fma.rn.f32 	%r337, %r336, %r322, 0f00000000;
+	neg.f32 	%r338, %r315;
+	fma.rn.f32 	%r339, %r338, %r323, 0f00000000;
+	.loc	1 90 53                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:53
+	or.b32 	%r340, %r18, %r299;
+	.loc	1 90 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:35
+	mad.wide.s32 	%rd77, %r340, 2, %rd7;
+	add.s64 	%rd79, %rd188, 16;
+	add.s64 	%rd81, %rd188, 32;
+	add.s64 	%rd83, %rd188, 48;
+	add.s64 	%rd85, %rd188, 64;
+	add.s64 	%rd87, %rd188, 80;
+	add.s64 	%rd89, %rd188, 96;
+	add.s64 	%rd91, %rd188, 112;
+	.loc	1 90 64                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:64
+	// begin inline asm
+	mov.u64 %rd76, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd76, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs50, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs50 }, [ %rd77 + 0 ], %rd76;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd78, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd78, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs51, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs51 }, [ %rd79 + 0 ], %rd78;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd80, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd80, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs52, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs52 }, [ %rd81 + 0 ], %rd80;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd82, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd82, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs53, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs53 }, [ %rd83 + 0 ], %rd82;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd84, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd84, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs54, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs54 }, [ %rd85 + 0 ], %rd84;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd86, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd86, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs55, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs55 }, [ %rd87 + 0 ], %rd86;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd88, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd88, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs56, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs56 }, [ %rd89 + 0 ], %rd88;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd90, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd90, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs57, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs57 }, [ %rd91 + 0 ], %rd90;
+	// end inline asm
+	.loc	1 90 125                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125
+	cvt.f32.bf16 	%r341, %rs50;
+	cvt.f32.bf16 	%r342, %rs51;
+	cvt.f32.bf16 	%r343, %rs52;
+	cvt.f32.bf16 	%r344, %rs53;
+	cvt.f32.bf16 	%r345, %rs54;
+	cvt.f32.bf16 	%r346, %rs55;
+	cvt.f32.bf16 	%r347, %rs56;
+	cvt.f32.bf16 	%r348, %rs57;
+	.loc	1 97 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24
+	mul.f32 	%r349, %r4, %r341;
+	mul.f32 	%r350, %r4, %r342;
+	mul.f32 	%r351, %r4, %r343;
+	mul.f32 	%r352, %r4, %r344;
+	mul.f32 	%r353, %r4, %r345;
+	mul.f32 	%r354, %r4, %r346;
+	mul.f32 	%r355, %r4, %r347;
+	mul.f32 	%r356, %r4, %r348;
+	.loc	1 98 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:35
+	add.s64 	%rd95, %rd93, 16;
+	add.s64 	%rd97, %rd93, 32;
+	add.s64 	%rd99, %rd93, 48;
+	add.s64 	%rd101, %rd93, 64;
+	add.s64 	%rd103, %rd93, 80;
+	add.s64 	%rd105, %rd93, 96;
+	add.s64 	%rd107, %rd93, 112;
+	.loc	1 98 81                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:81
+	// begin inline asm
+	mov.u64 %rd92, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd92, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs58, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs58 }, [ %rd93 + 0 ], %rd92;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd94, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd94, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs59, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs59 }, [ %rd95 + 0 ], %rd94;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd96, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd96, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs60, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs60 }, [ %rd97 + 0 ], %rd96;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd98, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd98, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs61, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs61 }, [ %rd99 + 0 ], %rd98;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd100, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd100, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs62, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs62 }, [ %rd101 + 0 ], %rd100;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd102, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd102, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs63, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs63 }, [ %rd103 + 0 ], %rd102;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd104, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd104, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs64, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs64 }, [ %rd105 + 0 ], %rd104;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd106, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd106, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs65, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs65 }, [ %rd107 + 0 ], %rd106;
+	// end inline asm
+	.loc	1 98 142                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142
+	cvt.f32.bf16 	%r357, %rs58;
+	cvt.f32.bf16 	%r358, %rs59;
+	cvt.f32.bf16 	%r359, %rs60;
+	cvt.f32.bf16 	%r360, %rs61;
+	cvt.f32.bf16 	%r361, %rs62;
+	cvt.f32.bf16 	%r362, %rs63;
+	cvt.f32.bf16 	%r363, %rs64;
+	cvt.f32.bf16 	%r364, %rs65;
+	.loc	1 100 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24
+	mul.f32 	%r365, %r349, %r357;
+	mul.f32 	%r366, %r350, %r358;
+	mul.f32 	%r367, %r351, %r359;
+	mul.f32 	%r368, %r352, %r360;
+	mul.f32 	%r369, %r353, %r361;
+	mul.f32 	%r370, %r354, %r362;
+	mul.f32 	%r371, %r355, %r363;
+	mul.f32 	%r372, %r356, %r364;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r373, %r325, %r365, %p3;
+	selp.f32 	%r374, %r327, %r366, %p3;
+	selp.f32 	%r375, %r329, %r367, %p3;
+	selp.f32 	%r376, %r331, %r368, %p3;
+	selp.f32 	%r377, %r333, %r369, %p3;
+	selp.f32 	%r378, %r335, %r370, %p3;
+	selp.f32 	%r379, %r337, %r371, %p3;
+	selp.f32 	%r380, %r339, %r372, %p3;
+	.loc	1 111 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:111:24
+	mul.f32 	%r381, %r3, %r258;
+	mul.f32 	%r382, %r3, %r259;
+	mul.f32 	%r383, %r3, %r260;
+	mul.f32 	%r384, %r3, %r261;
+	mul.f32 	%r385, %r3, %r262;
+	mul.f32 	%r386, %r3, %r263;
+	mul.f32 	%r387, %r3, %r264;
+	mul.f32 	%r388, %r3, %r265;
+	.loc	1 113 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:113:24
+	mul.f32 	%r389, %r381, %r266;
+	mul.f32 	%r390, %r382, %r267;
+	mul.f32 	%r391, %r383, %r268;
+	mul.f32 	%r392, %r384, %r269;
+	mul.f32 	%r393, %r385, %r270;
+	mul.f32 	%r394, %r386, %r271;
+	mul.f32 	%r395, %r387, %r272;
+	mul.f32 	%r396, %r388, %r273;
+	.loc	1 116 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:116:24
+	mul.f32 	%r397, %r389, %r224;
+	mul.f32 	%r398, %r390, %r225;
+	mul.f32 	%r399, %r391, %r226;
+	mul.f32 	%r400, %r392, %r227;
+	mul.f32 	%r401, %r393, %r228;
+	mul.f32 	%r402, %r394, %r229;
+	mul.f32 	%r403, %r395, %r230;
+	mul.f32 	%r404, %r396, %r231;
+	bar.sync 	0;
+	st.shared.b32 	[%r9], %r397;
+	st.shared.b32 	[%r9+256], %r398;
+	st.shared.b32 	[%r9+512], %r399;
+	st.shared.b32 	[%r9+768], %r400;
+	st.shared.b32 	[%r9+1024], %r401;
+	st.shared.b32 	[%r9+1280], %r402;
+	st.shared.b32 	[%r9+1536], %r403;
+	st.shared.b32 	[%r9+1792], %r404;
+	bar.sync 	0;
+	ld.shared.b32 	%r405, [%r10];
+	ld.shared.b32 	%r406, [%r11];
+	ld.shared.b32 	%r407, [%r12];
+	ld.shared.b32 	%r408, [%r13];
+	ld.shared.b32 	%r409, [%r14];
+	ld.shared.b32 	%r410, [%r15];
+	ld.shared.b32 	%r411, [%r16];
+	ld.shared.b32 	%r412, [%r17];
+	.loc	1 119 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24
+	fma.rn.f32 	%r413, %r274, %r373, %r405;
+	fma.rn.f32 	%r414, %r275, %r374, %r406;
+	fma.rn.f32 	%r415, %r276, %r375, %r407;
+	fma.rn.f32 	%r416, %r277, %r376, %r408;
+	fma.rn.f32 	%r417, %r278, %r377, %r409;
+	fma.rn.f32 	%r418, %r279, %r378, %r410;
+	fma.rn.f32 	%r419, %r280, %r379, %r411;
+	fma.rn.f32 	%r420, %r281, %r380, %r412;
+	.loc	1 121 60                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:60
+	or.b32 	%r421, %r19, %r299;
+	.loc	1 121 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:35
+	mad.wide.s32 	%rd109, %r421, 2, %rd7;
+	cvt.s64.s32 	%rd190, %r19;
+	add.s64 	%rd191, %rd190, %rd185;
+	shl.b64 	%rd192, %rd191, 1;
+	add.s64 	%rd193, %rd7, %rd192;
+	add.s64 	%rd111, %rd193, 16;
+	add.s64 	%rd113, %rd193, 32;
+	add.s64 	%rd115, %rd193, 48;
+	add.s64 	%rd117, %rd193, 64;
+	add.s64 	%rd119, %rd193, 80;
+	add.s64 	%rd121, %rd193, 96;
+	add.s64 	%rd123, %rd193, 112;
+	.loc	1 121 71                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:71
+	// begin inline asm
+	mov.u64 %rd108, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd108, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs66, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs66 }, [ %rd109 + 0 ], %rd108;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd110, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd110, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs67, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs67 }, [ %rd111 + 0 ], %rd110;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd112, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd112, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs68, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs68 }, [ %rd113 + 0 ], %rd112;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd114, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd114, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs69, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs69 }, [ %rd115 + 0 ], %rd114;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd116, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd116, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs70, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs70 }, [ %rd117 + 0 ], %rd116;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd118, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd118, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs71, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs71 }, [ %rd119 + 0 ], %rd118;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd120, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd120, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs72, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs72 }, [ %rd121 + 0 ], %rd120;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd122, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd122, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs73, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs73 }, [ %rd123 + 0 ], %rd122;
+	// end inline asm
+	.loc	1 121 132                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:132
+	cvt.f32.bf16 	%r422, %rs66;
+	cvt.f32.bf16 	%r423, %rs67;
+	cvt.f32.bf16 	%r424, %rs68;
+	cvt.f32.bf16 	%r425, %rs69;
+	cvt.f32.bf16 	%r426, %rs70;
+	cvt.f32.bf16 	%r427, %rs71;
+	cvt.f32.bf16 	%r428, %rs72;
+	cvt.f32.bf16 	%r429, %rs73;
+	.loc	1 126 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24
+	mul.f32 	%r430, %r6, %r422;
+	mul.f32 	%r431, %r6, %r423;
+	mul.f32 	%r432, %r6, %r424;
+	mul.f32 	%r433, %r6, %r425;
+	mul.f32 	%r434, %r6, %r426;
+	mul.f32 	%r435, %r6, %r427;
+	mul.f32 	%r436, %r6, %r428;
+	mul.f32 	%r437, %r6, %r429;
+	.loc	1 127 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:35
+	add.s64 	%rd157, %rd11, %rd189;
+	add.s64 	%rd125, %rd157, 2;
+	add.s64 	%rd127, %rd157, 18;
+	add.s64 	%rd129, %rd157, 34;
+	add.s64 	%rd131, %rd157, 50;
+	add.s64 	%rd133, %rd157, 66;
+	add.s64 	%rd135, %rd157, 82;
+	add.s64 	%rd137, %rd157, 98;
+	add.s64 	%rd139, %rd157, 114;
+	.loc	1 127 85                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:85
+	// begin inline asm
+	mov.u64 %rd124, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd124, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs74, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs74 }, [ %rd125 + 0 ], %rd124;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd126, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd126, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs75, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs75 }, [ %rd127 + 0 ], %rd126;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd128, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd128, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs76, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs76 }, [ %rd129 + 0 ], %rd128;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd130, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd130, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs77, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs77 }, [ %rd131 + 0 ], %rd130;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd132, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd132, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs78, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs78 }, [ %rd133 + 0 ], %rd132;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd134, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd134, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs79, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs79 }, [ %rd135 + 0 ], %rd134;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd136, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd136, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs80, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs80 }, [ %rd137 + 0 ], %rd136;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd138, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd138, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs81, %rs34;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs81 }, [ %rd139 + 0 ], %rd138;
+	// end inline asm
+	.loc	1 127 146                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:146
+	cvt.f32.bf16 	%r438, %rs74;
+	cvt.f32.bf16 	%r439, %rs75;
+	cvt.f32.bf16 	%r440, %rs76;
+	cvt.f32.bf16 	%r441, %rs77;
+	cvt.f32.bf16 	%r442, %rs78;
+	cvt.f32.bf16 	%r443, %rs79;
+	cvt.f32.bf16 	%r444, %rs80;
+	cvt.f32.bf16 	%r445, %rs81;
+	.loc	1 131 17                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:131:17
+	neg.f32 	%r446, %r430;
+	fma.rn.f32 	%r447, %r446, %r438, 0f00000000;
+	neg.f32 	%r448, %r431;
+	fma.rn.f32 	%r449, %r448, %r439, 0f00000000;
+	neg.f32 	%r450, %r432;
+	fma.rn.f32 	%r451, %r450, %r440, 0f00000000;
+	neg.f32 	%r452, %r433;
+	fma.rn.f32 	%r453, %r452, %r441, 0f00000000;
+	neg.f32 	%r454, %r434;
+	fma.rn.f32 	%r455, %r454, %r442, 0f00000000;
+	neg.f32 	%r456, %r435;
+	fma.rn.f32 	%r457, %r456, %r443, 0f00000000;
+	neg.f32 	%r458, %r436;
+	fma.rn.f32 	%r459, %r458, %r444, 0f00000000;
+	neg.f32 	%r460, %r437;
+	fma.rn.f32 	%r461, %r460, %r445, 0f00000000;
+	.loc	1 134 60                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:60
+	or.b32 	%r462, %r20, %r299;
+	.loc	1 134 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:35
+	mad.wide.s32 	%rd141, %r462, 2, %rd7;
+	cvt.s64.s32 	%rd194, %r20;
+	add.s64 	%rd195, %rd194, %rd185;
+	shl.b64 	%rd196, %rd195, 1;
+	add.s64 	%rd197, %rd7, %rd196;
+	add.s64 	%rd143, %rd197, 16;
+	add.s64 	%rd145, %rd197, 32;
+	add.s64 	%rd147, %rd197, 48;
+	add.s64 	%rd149, %rd197, 64;
+	add.s64 	%rd151, %rd197, 80;
+	add.s64 	%rd153, %rd197, 96;
+	add.s64 	%rd155, %rd197, 112;
+	.loc	1 134 71                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:71
+	// begin inline asm
+	mov.u64 %rd140, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd140, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs82, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs82 }, [ %rd141 + 0 ], %rd140;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd142, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd142, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs83, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs83 }, [ %rd143 + 0 ], %rd142;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd144, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd144, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs84, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs84 }, [ %rd145 + 0 ], %rd144;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd146, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd146, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs85, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs85 }, [ %rd147 + 0 ], %rd146;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd148, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd148, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs86, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs86 }, [ %rd149 + 0 ], %rd148;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd150, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd150, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs87, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs87 }, [ %rd151 + 0 ], %rd150;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd152, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd152, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs88, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs88 }, [ %rd153 + 0 ], %rd152;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd154, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd154, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs89, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs89 }, [ %rd155 + 0 ], %rd154;
+	// end inline asm
+	.loc	1 134 132                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:132
+	cvt.f32.bf16 	%r463, %rs82;
+	cvt.f32.bf16 	%r464, %rs83;
+	cvt.f32.bf16 	%r465, %rs84;
+	cvt.f32.bf16 	%r466, %rs85;
+	cvt.f32.bf16 	%r467, %rs86;
+	cvt.f32.bf16 	%r468, %rs87;
+	cvt.f32.bf16 	%r469, %rs88;
+	cvt.f32.bf16 	%r470, %rs89;
+	.loc	1 139 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:139:24
+	mul.f32 	%r471, %r6, %r463;
+	mul.f32 	%r472, %r6, %r464;
+	mul.f32 	%r473, %r6, %r465;
+	mul.f32 	%r474, %r6, %r466;
+	mul.f32 	%r475, %r6, %r467;
+	mul.f32 	%r476, %r6, %r468;
+	mul.f32 	%r477, %r6, %r469;
+	mul.f32 	%r478, %r6, %r470;
+	.loc	1 140 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:35
+	add.s64 	%rd159, %rd157, 16;
+	add.s64 	%rd161, %rd157, 32;
+	add.s64 	%rd163, %rd157, 48;
+	add.s64 	%rd165, %rd157, 64;
+	add.s64 	%rd167, %rd157, 80;
+	add.s64 	%rd169, %rd157, 96;
+	add.s64 	%rd171, %rd157, 112;
+	.loc	1 140 81                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:81
+	// begin inline asm
+	mov.u64 %rd156, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd156, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs90, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs90 }, [ %rd157 + 0 ], %rd156;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd158, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd158, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs91, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs91 }, [ %rd159 + 0 ], %rd158;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd160, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd160, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs92, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs92 }, [ %rd161 + 0 ], %rd160;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd162, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd162, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs93, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs93 }, [ %rd163 + 0 ], %rd162;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd164, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd164, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs94, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs94 }, [ %rd165 + 0 ], %rd164;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd166, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd166, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs95, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs95 }, [ %rd167 + 0 ], %rd166;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd168, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd168, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs96, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs96 }, [ %rd169 + 0 ], %rd168;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd170, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd170, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs97, %rs34;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs97 }, [ %rd171 + 0 ], %rd170;
+	// end inline asm
+	.loc	1 140 142                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:142
+	cvt.f32.bf16 	%r479, %rs90;
+	cvt.f32.bf16 	%r480, %rs91;
+	cvt.f32.bf16 	%r481, %rs92;
+	cvt.f32.bf16 	%r482, %rs93;
+	cvt.f32.bf16 	%r483, %rs94;
+	cvt.f32.bf16 	%r484, %rs95;
+	cvt.f32.bf16 	%r485, %rs96;
+	cvt.f32.bf16 	%r486, %rs97;
+	.loc	1 142 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:142:24
+	mul.f32 	%r487, %r471, %r479;
+	mul.f32 	%r488, %r472, %r480;
+	mul.f32 	%r489, %r473, %r481;
+	mul.f32 	%r490, %r474, %r482;
+	mul.f32 	%r491, %r475, %r483;
+	mul.f32 	%r492, %r476, %r484;
+	mul.f32 	%r493, %r477, %r485;
+	mul.f32 	%r494, %r478, %r486;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r495, %r447, %r487, %p3;
+	selp.f32 	%r496, %r449, %r488, %p3;
+	selp.f32 	%r497, %r451, %r489, %p3;
+	selp.f32 	%r498, %r453, %r490, %p3;
+	selp.f32 	%r499, %r455, %r491, %p3;
+	selp.f32 	%r500, %r457, %r492, %p3;
+	selp.f32 	%r501, %r459, %r493, %p3;
+	selp.f32 	%r502, %r461, %r494, %p3;
+	.loc	1 151 25                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:151:25
+	mul.f32 	%r503, %r5, %r283;
+	mul.f32 	%r504, %r5, %r284;
+	mul.f32 	%r505, %r5, %r285;
+	mul.f32 	%r506, %r5, %r286;
+	mul.f32 	%r507, %r5, %r287;
+	mul.f32 	%r508, %r5, %r288;
+	mul.f32 	%r509, %r5, %r289;
+	mul.f32 	%r510, %r5, %r290;
+	.loc	1 153 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:153:26
+	mul.f32 	%r511, %r503, %r291;
+	mul.f32 	%r512, %r504, %r292;
+	mul.f32 	%r513, %r505, %r293;
+	mul.f32 	%r514, %r506, %r294;
+	mul.f32 	%r515, %r507, %r295;
+	mul.f32 	%r516, %r508, %r296;
+	mul.f32 	%r517, %r509, %r297;
+	mul.f32 	%r518, %r510, %r298;
+	.loc	1 156 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26
+	mul.f32 	%r519, %r511, %r224;
+	mul.f32 	%r520, %r512, %r225;
+	mul.f32 	%r521, %r513, %r226;
+	mul.f32 	%r522, %r514, %r227;
+	mul.f32 	%r523, %r515, %r228;
+	mul.f32 	%r524, %r516, %r229;
+	mul.f32 	%r525, %r517, %r230;
+	mul.f32 	%r526, %r518, %r231;
+	bar.sync 	0;
+	st.shared.b32 	[%r9], %r519;
+	st.shared.b32 	[%r9+256], %r520;
+	st.shared.b32 	[%r9+512], %r521;
+	st.shared.b32 	[%r9+768], %r522;
+	st.shared.b32 	[%r9+1024], %r523;
+	st.shared.b32 	[%r9+1280], %r524;
+	st.shared.b32 	[%r9+1536], %r525;
+	st.shared.b32 	[%r9+1792], %r526;
+	bar.sync 	0;
+	ld.shared.b32 	%r527, [%r10];
+	ld.shared.b32 	%r528, [%r11];
+	ld.shared.b32 	%r529, [%r12];
+	ld.shared.b32 	%r530, [%r13];
+	ld.shared.b32 	%r531, [%r14];
+	ld.shared.b32 	%r532, [%r15];
+	ld.shared.b32 	%r533, [%r16];
+	ld.shared.b32 	%r534, [%r17];
+	.loc	1 159 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:159:26
+	fma.rn.f32 	%r535, %r274, %r495, %r527;
+	fma.rn.f32 	%r536, %r275, %r496, %r528;
+	fma.rn.f32 	%r537, %r276, %r497, %r529;
+	fma.rn.f32 	%r538, %r277, %r498, %r530;
+	fma.rn.f32 	%r539, %r278, %r499, %r531;
+	fma.rn.f32 	%r540, %r279, %r500, %r532;
+	fma.rn.f32 	%r541, %r280, %r501, %r533;
+	fma.rn.f32 	%r542, %r281, %r502, %r534;
+	.loc	1 161 39                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:39
+	or.b64 	%rd198, %rd174, %rd4;
+	.loc	1 161 32                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:32
+	shl.b64 	%rd199, %rd198, 1;
+	add.s64 	%rd172, %rd5, %rd199;
+	.loc	1 161 55                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:55
+	cvt.rn.bf16.f32 	%rs130, %r413;
+	cvt.rn.bf16.f32 	%rs131, %r414;
+	cvt.rn.bf16.f32 	%rs132, %r415;
+	cvt.rn.bf16.f32 	%rs133, %r416;
+	cvt.rn.bf16.f32 	%rs134, %r417;
+	cvt.rn.bf16.f32 	%rs135, %r418;
+	cvt.rn.bf16.f32 	%rs136, %r419;
+	cvt.rn.bf16.f32 	%rs137, %r420;
+	bar.sync 	0;
+	st.shared.b16 	[%r21], %rs130;
+	st.shared.b16 	[%r22], %rs131;
+	st.shared.b16 	[%r23], %rs132;
+	st.shared.b16 	[%r24], %rs133;
+	st.shared.b16 	[%r25], %rs134;
+	st.shared.b16 	[%r26], %rs135;
+	st.shared.b16 	[%r27], %rs136;
+	st.shared.b16 	[%r28], %rs137;
+	bar.sync 	0;
+	ld.shared.b32 	%r248, [%r29];
+	ld.shared.b32 	%r249, [%r29+256];
+	ld.shared.b32 	%r250, [%r29+512];
+	ld.shared.b32 	%r251, [%r29+768];
+	// begin inline asm
+	@%p2 st.global.v4.b32 [ %rd172 + 0 ], { %r248, %r249, %r250, %r251 };
+	// end inline asm
+	.loc	1 162 32                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:32
+	add.s64 	%rd173, %rd6, %rd199;
+	.loc	1 162 56                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:56
+	cvt.rn.bf16.f32 	%rs138, %r535;
+	cvt.rn.bf16.f32 	%rs139, %r536;
+	cvt.rn.bf16.f32 	%rs140, %r537;
+	cvt.rn.bf16.f32 	%rs141, %r538;
+	cvt.rn.bf16.f32 	%rs142, %r539;
+	cvt.rn.bf16.f32 	%rs143, %r540;
+	cvt.rn.bf16.f32 	%rs144, %r541;
+	cvt.rn.bf16.f32 	%rs145, %r542;
+	bar.sync 	0;
+	st.shared.b16 	[%r21], %rs138;
+	st.shared.b16 	[%r22], %rs139;
+	st.shared.b16 	[%r23], %rs140;
+	st.shared.b16 	[%r24], %rs141;
+	st.shared.b16 	[%r25], %rs142;
+	st.shared.b16 	[%r26], %rs143;
+	st.shared.b16 	[%r27], %rs144;
+	st.shared.b16 	[%r28], %rs145;
+	bar.sync 	0;
+	ld.shared.b32 	%r252, [%r29];
+	ld.shared.b32 	%r253, [%r29+256];
+	ld.shared.b32 	%r254, [%r29+512];
+	ld.shared.b32 	%r255, [%r29+768];
+	// begin inline asm
+	@%p2 st.global.v4.b32 [ %rd173 + 0 ], { %r252, %r253, %r254, %r255 };
+	// end inline asm
+	mov.b64 	%rd200, 64;
+	mov.pred 	%p5, 0;
+	.loc	1 53 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43
+	@%p1 bra 	$L__BB0_1;
+// %bb.2:
+	.loc	1 53 4                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:4
+	ret;
+$L__tmp16:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 5                                   // DW_FORM_data2
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 456                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x1c1 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 98
+.b8 118
+.b8 113
+.b8 104
+.b8 106
+.b8 116
+.b8 121
+.b8 103
+.b8 55
+.b8 102
+.b8 118
+.b8 120
+.b8 122
+.b8 119
+.b8 116
+.b8 98
+.b8 116
+.b8 116
+.b8 52
+.b8 118
+.b8 114
+.b8 100
+.b8 107
+.b8 98
+.b8 110
+.b8 98
+.b8 54
+.b8 110
+.b8 51
+.b8 50
+.b8 102
+.b8 110
+.b8 114
+.b8 105
+.b8 106
+.b8 106
+.b8 112
+.b8 108
+.b8 51
+.b8 118
+.b8 118
+.b8 52
+.b8 99
+.b8 102
+.b8 113
+.b8 100
+.b8 52
+.b8 109
+.b8 122
+.b8 110
+.b8 114
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 98
+.b8 118
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x6d DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 114
+.b8 109
+.b8 115
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 95
+.b8 116
+.b8 111
+.b8 95
+.b8 99
+.b8 111
+.b8 112
+.b8 121
+.b8 95
+.b8 97
+.b8 100
+.b8 100
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 110
+.b8 101
+.b8 103
+.b8 95
+.b8 115
+.b8 112
+.b8 108
+.b8 105
+.b8 116
+.b8 95
+.b8 115
+.b8 112
+.b8 108
+.b8 105
+.b8 116
+.b8 95
+.b8 119
+.b8 105
+.b8 116
+.b8 104
+.b8 95
+.b8 115
+.b8 105
+.b8 122
+.b8 101
+.b8 115
+.b8 95
+.b8 115
+.b8 116
+.b8 97
+.b8 99
+.b8 107
+.b8 95
+.b8 117
+.b8 110
+.b8 98
+.b8 105
+.b8 110
+.b8 100
+.b8 95
+.b8 117
+.b8 110
+.b8 115
+.b8 113
+.b8 117
+.b8 101
+.b8 101
+.b8 122
+.b8 101
+.b8 95
+.b8 118
+.b8 105
+.b8 101
+.b8 119
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x151:0x7a DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x166:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp8                           // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 51                                  // DW_AT_call_line
+.b8 25                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x17e:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp8                           // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 4                                   // Abbrev [4] 0x198:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp8                           // DW_AT_low_pc
+.b64 $L__tmp15                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 52                                  // DW_AT_call_line
+.b8 27                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x1b0:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp8                           // DW_AT_low_pc
+.b64 $L__tmp15                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..96fc24d835acf2c6fce5ada7026bbe34b256d6ff
--- /dev/null
+++ b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source
@@ -0,0 +1,972 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc213 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0)
+#loc215 = loc(unknown)
+#loc218 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0)
+#loc222 = loc("in_out_ptr0"(#loc))
+#loc223 = loc("in_out_ptr1"(#loc))
+#loc224 = loc("in_ptr0"(#loc))
+#loc225 = loc("in_ptr1"(#loc))
+#loc226 = loc("in_ptr2"(#loc))
+#loc227 = loc("in_ptr3"(#loc))
+#loc228 = loc("in_ptr4"(#loc))
+#loc229 = loc("xnumel"(#loc))
+#loc230 = loc("r0_numel"(#loc))
+#loc432 = loc("input"(#loc213))
+#loc433 = loc("a"(#loc218))
+#loc434 = loc("b"(#loc218))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 73728 : i32 loc(#loc231)
+    %r0_numel_1 = arith.constant 128 : i32 loc(#loc232)
+    %xoffset = tt.get_program_id x : i32 loc(#loc233)
+    %xoffset_2 = arith.constant 64 : i32 loc(#loc234)
+    %xoffset_3 = arith.constant 64 : i32 loc(#loc234)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc234)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc235)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc236)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc237)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc237)
+    %xmask = arith.constant true loc(#loc238)
+    %xmask_8 = arith.constant dense<true> : tensor<64x64xi1> loc(#loc238)
+    %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc239)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc240)
+    %x0 = arith.constant 32 : i32 loc(#loc241)
+    %x0_10 = arith.constant 32 : i32 loc(#loc241)
+    %x0_11 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc241)
+    %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc241)
+    %x1 = arith.constant 32 : i32 loc(#loc242)
+    %x1_13 = arith.constant 32 : i32 loc(#loc242)
+    %x1_14 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc242)
+    %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc242)
+    %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc243)
+    %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc243)
+    %_tmp10 = arith.constant 0.000000e+00 : f32 loc(#loc244)
+    %_tmp10_17 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc244)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc15)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc15)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15)
+    %2 = arith.bitcast %c64_i32 : i32 to i32 loc(#loc15)
+    %3 = ub.poison : i32 loc(#loc15)
+    %_tmp10_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_16, %_tmp10_24 = %_tmp10_17) -> (tensor<64x64xf32>, tensor<64x64xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc246)
+      %r0_index_25 = arith.addi %r0_index, %r0_base_9 : tensor<1x64xi32> loc(#loc246)
+      %r0_mask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc247)
+      %r0_mask_26 = arith.cmpi slt, %r0_index_25, %r0_mask : tensor<1x64xi32> loc(#loc247)
+      %tmp0 = arith.constant 4096 : i32 loc(#loc248)
+      %tmp0_27 = arith.constant 4096 : i32 loc(#loc248)
+      %tmp0_28 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc248)
+      %tmp0_29 = arith.addi %tmp0_28, %r0_index_25 : tensor<1x64xi32> loc(#loc248)
+      %tmp0_30 = arith.constant 128 : i32 loc(#loc249)
+      %tmp0_31 = arith.constant 128 : i32 loc(#loc249)
+      %tmp0_32 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc249)
+      %tmp0_33 = arith.muli %tmp0_32, %x0_12 : tensor<64x1xi32> loc(#loc249)
+      %tmp0_34 = tt.broadcast %tmp0_29 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc250)
+      %tmp0_35 = tt.broadcast %tmp0_33 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc250)
+      %tmp0_36 = arith.addi %tmp0_34, %tmp0_35 : tensor<64x64xi32> loc(#loc250)
+      %tmp0_37 = arith.constant 36864 : i32 loc(#loc251)
+      %tmp0_38 = arith.constant 36864 : i32 loc(#loc251)
+      %tmp0_39 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc251)
+      %tmp0_40 = arith.muli %tmp0_39, %x1_15 : tensor<64x1xi32> loc(#loc251)
+      %tmp0_41 = tt.broadcast %tmp0_40 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc252)
+      %tmp0_42 = arith.addi %tmp0_36, %tmp0_41 : tensor<64x64xi32> loc(#loc252)
+      %tmp0_43 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc253)
+      %tmp0_44 = tt.addptr %tmp0_43, %tmp0_42 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc253)
+      %tmp0_45 = arith.constant 0.000000e+00 : f32 loc(#loc254)
+      %tmp0_46 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc254)
+      %tmp0_47 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc254)
+      %tmp0_48 = arith.truncf %tmp0_47 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc254)
+      %tmp0_49 = tt.load %tmp0_44, %tmp0_46, %tmp0_48 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc254)
+      %tmp0_50 = arith.extf %tmp0_49 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc255)
+      %tmp6 = arith.constant 128 : i32 loc(#loc256)
+      %tmp6_51 = arith.constant 128 : i32 loc(#loc256)
+      %tmp6_52 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc256)
+      %tmp6_53 = arith.muli %tmp6_52, %x0_12 : tensor<64x1xi32> loc(#loc256)
+      %tmp6_54 = tt.broadcast %r0_index_25 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc257)
+      %tmp6_55 = tt.broadcast %tmp6_53 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc257)
+      %tmp6_56 = arith.addi %tmp6_54, %tmp6_55 : tensor<64x64xi32> loc(#loc257)
+      %tmp6_57 = arith.constant 36864 : i32 loc(#loc258)
+      %tmp6_58 = arith.constant 36864 : i32 loc(#loc258)
+      %tmp6_59 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc258)
+      %tmp6_60 = arith.muli %tmp6_59, %x1_15 : tensor<64x1xi32> loc(#loc258)
+      %tmp6_61 = tt.broadcast %tmp6_60 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc259)
+      %tmp6_62 = arith.addi %tmp6_56, %tmp6_61 : tensor<64x64xi32> loc(#loc259)
+      %tmp6_63 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc260)
+      %tmp6_64 = tt.addptr %tmp6_63, %tmp6_62 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc260)
+      %tmp6_65 = arith.constant 0.000000e+00 : f32 loc(#loc261)
+      %tmp6_66 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc261)
+      %tmp6_67 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc261)
+      %tmp6_68 = arith.truncf %tmp6_67 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc261)
+      %tmp6_69 = tt.load %tmp6_64, %tmp6_66, %tmp6_68 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc261)
+      %tmp6_70 = arith.extf %tmp6_69 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc262)
+      %tmp2 = arith.mulf %tmp0_50, %tmp0_50 : tensor<64x64xf32> loc(#loc263)
+      %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<64x64xf32> loc(#loc264)
+      %_tmp4_71 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc265)
+      %_tmp4_72 = arith.select %_tmp4_71, %tmp5, %_tmp4_23 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc265)
+      %tmp8 = arith.mulf %tmp6_70, %tmp6_70 : tensor<64x64xf32> loc(#loc266)
+      %tmp11 = arith.addf %_tmp10_24, %tmp8 : tensor<64x64xf32> loc(#loc267)
+      %_tmp10_73 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc268)
+      %_tmp10_74 = arith.select %_tmp10_73, %tmp11, %_tmp10_24 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc268)
+      scf.yield %_tmp4_72, %_tmp10_74 : tensor<64x64xf32>, tensor<64x64xf32> loc(#loc39)
+    } loc(#loc435)
+    %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#0) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc269)
+    %tmp4_19 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc270)
+    %tmp10 = tt.call @"triton.language.standard.sum__fp32S64_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#1) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc271)
+    %tmp10_20 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc272)
+    %c0_i32_21 = arith.constant 0 : i32 loc(#loc44)
+    %c64_i32_22 = arith.constant 64 : i32 loc(#loc44)
+    %4 = arith.bitcast %c0_i32_21 : i32 to i32 loc(#loc44)
+    %5 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc44)
+    %6 = arith.bitcast %c64_i32_22 : i32 to i32 loc(#loc44)
+    %7 = ub.poison : i32 loc(#loc44)
+    scf.for %r0_offset = %4 to %5 step %6  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc273)
+      %r0_index_23 = arith.addi %r0_index, %r0_base_9 : tensor<1x64xi32> loc(#loc273)
+      %r0_mask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc274)
+      %r0_mask_24 = arith.cmpi slt, %r0_index_23, %r0_mask : tensor<1x64xi32> loc(#loc274)
+      %r0_3 = arith.constant 2 : i32 loc(#loc275)
+      %r0_3_25 = arith.constant 2 : i32 loc(#loc275)
+      %r0_3_26 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc275)
+      %r0_3_27 = arith.remsi %r0_index_23, %r0_3_26 : tensor<1x64xi32> loc(#loc275)
+      %r0_4 = arith.constant 2 : i32 loc(#loc276)
+      %r0_4_28 = arith.constant 2 : i32 loc(#loc276)
+      %r0_4_29 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc276)
+      %r0_4_30 = arith.divsi %r0_index_23, %r0_4_29 : tensor<1x64xi32> loc(#loc276)
+      %tmp50 = arith.constant 128 : i32 loc(#loc277)
+      %tmp50_31 = arith.constant 128 : i32 loc(#loc277)
+      %tmp50_32 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc277)
+      %tmp50_33 = arith.muli %tmp50_32, %x0_12 : tensor<64x1xi32> loc(#loc277)
+      %tmp50_34 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc278)
+      %tmp50_35 = tt.broadcast %tmp50_33 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc278)
+      %tmp50_36 = arith.addi %tmp50_34, %tmp50_35 : tensor<64x64xi32> loc(#loc278)
+      %tmp50_37 = arith.constant 36864 : i32 loc(#loc279)
+      %tmp50_38 = arith.constant 36864 : i32 loc(#loc279)
+      %tmp50_39 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc279)
+      %tmp50_40 = arith.muli %tmp50_39, %x1_15 : tensor<64x1xi32> loc(#loc279)
+      %tmp50_41 = tt.broadcast %tmp50_40 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc280)
+      %tmp50_42 = arith.addi %tmp50_36, %tmp50_41 : tensor<64x64xi32> loc(#loc280)
+      %tmp50_43 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc281)
+      %tmp50_44 = tt.addptr %tmp50_43, %tmp50_42 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc281)
+      %tmp50_45 = arith.constant 0.000000e+00 : f32 loc(#loc282)
+      %tmp50_46 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc282)
+      %tmp50_47 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc282)
+      %tmp50_48 = arith.truncf %tmp50_47 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc282)
+      %tmp50_49 = tt.load %tmp50_44, %tmp50_46, %tmp50_48 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc282)
+      %tmp50_50 = arith.extf %tmp50_49 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc283)
+      %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>> loc(#loc284)
+      %tmp58_51 = tt.addptr %tmp58, %r0_index_23 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc284)
+      %tmp58_52 = arith.constant 0.000000e+00 : f32 loc(#loc285)
+      %tmp58_53 = arith.constant dense<0.000000e+00> : tensor<1x64xf32> loc(#loc285)
+      %tmp58_54 = arith.truncf %tmp58_53 : tensor<1x64xf32> to tensor<1x64xbf16> loc(#loc285)
+      %tmp58_55 = tt.load %tmp58_51, %r0_mask_24, %tmp58_54 evictionPolicy = evict_last : tensor<1x64x!tt.ptr<bf16>> loc(#loc285)
+      %tmp58_56 = arith.extf %tmp58_55 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc286)
+      %tmp63 = arith.constant 128 : i32 loc(#loc287)
+      %tmp63_57 = arith.constant 128 : i32 loc(#loc287)
+      %tmp63_58 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc287)
+      %tmp63_59 = arith.muli %tmp63_58, %x1_15 : tensor<64x1xi32> loc(#loc287)
+      %tmp63_60 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc288)
+      %tmp63_61 = tt.broadcast %tmp63_59 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc288)
+      %tmp63_62 = arith.addi %tmp63_60, %tmp63_61 : tensor<64x64xi32> loc(#loc288)
+      %tmp63_63 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<64x64x!tt.ptr<f32>> loc(#loc289)
+      %tmp63_64 = tt.addptr %tmp63_63, %tmp63_62 : tensor<64x64x!tt.ptr<f32>>, tensor<64x64xi32> loc(#loc289)
+      %tmp63_65 = arith.constant 0.000000e+00 : f32 loc(#loc290)
+      %tmp63_66 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc290)
+      %tmp63_67 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc290)
+      %tmp63_68 = tt.load %tmp63_64, %tmp63_66, %tmp63_67 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<f32>> loc(#loc290)
+      %tmp66 = arith.constant 128 : i32 loc(#loc291)
+      %tmp66_69 = arith.constant 128 : i32 loc(#loc291)
+      %tmp66_70 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc291)
+      %tmp66_71 = arith.muli %tmp66_70, %x1_15 : tensor<64x1xi32> loc(#loc291)
+      %tmp66_72 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc292)
+      %tmp66_73 = tt.broadcast %tmp66_71 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc292)
+      %tmp66_74 = arith.addi %tmp66_72, %tmp66_73 : tensor<64x64xi32> loc(#loc292)
+      %tmp66_75 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<64x64x!tt.ptr<f32>> loc(#loc293)
+      %tmp66_76 = tt.addptr %tmp66_75, %tmp66_74 : tensor<64x64x!tt.ptr<f32>>, tensor<64x64xi32> loc(#loc293)
+      %tmp66_77 = arith.constant 0.000000e+00 : f32 loc(#loc294)
+      %tmp66_78 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc294)
+      %tmp66_79 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc294)
+      %tmp66_80 = tt.load %tmp66_76, %tmp66_78, %tmp66_79 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<f32>> loc(#loc294)
+      %tmp96 = arith.constant 4096 : i32 loc(#loc295)
+      %tmp96_81 = arith.constant 4096 : i32 loc(#loc295)
+      %tmp96_82 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc295)
+      %tmp96_83 = arith.addi %tmp96_82, %r0_index_23 : tensor<1x64xi32> loc(#loc295)
+      %tmp96_84 = arith.constant 128 : i32 loc(#loc296)
+      %tmp96_85 = arith.constant 128 : i32 loc(#loc296)
+      %tmp96_86 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc296)
+      %tmp96_87 = arith.muli %tmp96_86, %x0_12 : tensor<64x1xi32> loc(#loc296)
+      %tmp96_88 = tt.broadcast %tmp96_83 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc297)
+      %tmp96_89 = tt.broadcast %tmp96_87 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc297)
+      %tmp96_90 = arith.addi %tmp96_88, %tmp96_89 : tensor<64x64xi32> loc(#loc297)
+      %tmp96_91 = arith.constant 36864 : i32 loc(#loc298)
+      %tmp96_92 = arith.constant 36864 : i32 loc(#loc298)
+      %tmp96_93 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc298)
+      %tmp96_94 = arith.muli %tmp96_93, %x1_15 : tensor<64x1xi32> loc(#loc298)
+      %tmp96_95 = tt.broadcast %tmp96_94 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc299)
+      %tmp96_96 = arith.addi %tmp96_90, %tmp96_95 : tensor<64x64xi32> loc(#loc299)
+      %tmp96_97 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc300)
+      %tmp96_98 = tt.addptr %tmp96_97, %tmp96_96 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc300)
+      %tmp96_99 = arith.constant 0.000000e+00 : f32 loc(#loc301)
+      %tmp96_100 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc301)
+      %tmp96_101 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc301)
+      %tmp96_102 = arith.truncf %tmp96_101 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc301)
+      %tmp96_103 = tt.load %tmp96_98, %tmp96_100, %tmp96_102 evictionPolicy = evict_first : tensor<64x64x!tt.ptr<bf16>> loc(#loc301)
+      %tmp96_104 = arith.extf %tmp96_103 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc302)
+      %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>> loc(#loc303)
+      %tmp102_105 = tt.addptr %tmp102, %r0_index_23 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc303)
+      %tmp102_106 = arith.constant 0.000000e+00 : f32 loc(#loc304)
+      %tmp102_107 = arith.constant dense<0.000000e+00> : tensor<1x64xf32> loc(#loc304)
+      %tmp102_108 = arith.truncf %tmp102_107 : tensor<1x64xf32> to tensor<1x64xbf16> loc(#loc304)
+      %tmp102_109 = tt.load %tmp102_105, %r0_mask_24, %tmp102_108 evictionPolicy = evict_last : tensor<1x64x!tt.ptr<bf16>> loc(#loc304)
+      %tmp102_110 = arith.extf %tmp102_109 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc305)
+      %tmp13 = arith.constant 0 : i64 loc(#loc306)
+      %tmp13_111 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc306)
+      %tmp14 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc307)
+      %tmp14_112 = arith.constant dense<0> : tensor<1x64xi64> loc(#loc307)
+      %tmp14_113 = arith.cmpi sge, %tmp14, %tmp14_112 : tensor<1x64xi64> loc(#loc307)
+      %tmp15 = arith.constant 1 : i64 loc(#loc308)
+      %tmp15_114 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc308)
+      %tmp16 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc309)
+      %tmp16_115 = arith.constant dense<1> : tensor<1x64xi64> loc(#loc309)
+      %tmp16_116 = arith.cmpi slt, %tmp16, %tmp16_115 : tensor<1x64xi64> loc(#loc309)
+      %tmp17 = arith.constant 2 : i32 loc(#loc310)
+      %tmp17_117 = arith.constant 2 : i32 loc(#loc310)
+      %tmp17_118 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc310)
+      %tmp17_119 = arith.muli %tmp17_118, %r0_4_30 : tensor<1x64xi32> loc(#loc310)
+      %tmp17_120 = arith.constant 1 : i32 loc(#loc311)
+      %tmp17_121 = arith.constant 1 : i32 loc(#loc311)
+      %tmp17_122 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc311)
+      %tmp17_123 = arith.addi %tmp17_122, %tmp17_119 : tensor<1x64xi32> loc(#loc311)
+      %tmp17_124 = arith.constant 128 : i32 loc(#loc312)
+      %tmp17_125 = arith.constant 128 : i32 loc(#loc312)
+      %tmp17_126 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc312)
+      %tmp17_127 = arith.muli %tmp17_126, %x0_12 : tensor<64x1xi32> loc(#loc312)
+      %tmp17_128 = tt.broadcast %tmp17_123 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc313)
+      %tmp17_129 = tt.broadcast %tmp17_127 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc313)
+      %tmp17_130 = arith.addi %tmp17_128, %tmp17_129 : tensor<64x64xi32> loc(#loc313)
+      %tmp17_131 = arith.constant 36864 : i32 loc(#loc314)
+      %tmp17_132 = arith.constant 36864 : i32 loc(#loc314)
+      %tmp17_133 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc314)
+      %tmp17_134 = arith.muli %tmp17_133, %x1_15 : tensor<64x1xi32> loc(#loc314)
+      %tmp17_135 = tt.broadcast %tmp17_134 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc315)
+      %tmp17_136 = arith.addi %tmp17_130, %tmp17_135 : tensor<64x64xi32> loc(#loc315)
+      %tmp17_137 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc316)
+      %tmp17_138 = tt.addptr %tmp17_137, %tmp17_136 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc316)
+      %tmp17_139 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc317)
+      %tmp17_140 = arith.constant 0.000000e+00 : f32 loc(#loc318)
+      %tmp17_141 = tt.broadcast %tmp17_139 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc318)
+      %tmp17_142 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc318)
+      %tmp17_143 = arith.truncf %tmp17_142 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc318)
+      %tmp17_144 = tt.load %tmp17_138, %tmp17_141, %tmp17_143 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc318)
+      %tmp17_145 = arith.extf %tmp17_144 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc319)
+      %tmp19 = arith.constant 1.280000e+02 : f32 loc(#loc320)
+      %tmp20 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc321)
+      %tmp20_146 = arith.divf %tmp10_20, %tmp20 : tensor<64x1xf32> loc(#loc321)
+      %tmp21 = arith.constant 9.99999997E-7 : f32 loc(#loc322)
+      %tmp22 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc323)
+      %tmp22_147 = arith.addf %tmp20_146, %tmp22 : tensor<64x1xf32> loc(#loc323)
+      %tmp23 = tt.extern_elementwise %tmp22_147 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc324)
+      %tmp24 = tt.broadcast %tmp23 : tensor<64x1xf32> -> tensor<64x64xf32> loc(#loc325)
+      %tmp24_148 = arith.mulf %tmp17_145, %tmp24 : tensor<64x64xf32> loc(#loc325)
+      %tmp25 = arith.constant 2 : i32 loc(#loc326)
+      %tmp25_149 = arith.constant 2 : i32 loc(#loc326)
+      %tmp25_150 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc326)
+      %tmp25_151 = arith.muli %tmp25_150, %r0_4_30 : tensor<1x64xi32> loc(#loc326)
+      %tmp25_152 = arith.constant 1 : i32 loc(#loc327)
+      %tmp25_153 = arith.constant 1 : i32 loc(#loc327)
+      %tmp25_154 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc327)
+      %tmp25_155 = arith.addi %tmp25_154, %tmp25_151 : tensor<1x64xi32> loc(#loc327)
+      %tmp25_156 = tt.broadcast %tmp25_155 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc328)
+      %tmp25_157 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc329)
+      %tmp25_158 = tt.addptr %tmp25_157, %tmp25_156 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc329)
+      %tmp25_159 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc330)
+      %tmp25_160 = arith.constant 0.000000e+00 : f32 loc(#loc331)
+      %tmp25_161 = tt.broadcast %tmp25_159 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc331)
+      %tmp25_162 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc331)
+      %tmp25_163 = arith.truncf %tmp25_162 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc331)
+      %tmp25_164 = tt.load %tmp25_158, %tmp25_161, %tmp25_163 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc331)
+      %tmp25_165 = arith.extf %tmp25_164 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc332)
+      %tmp27 = arith.mulf %tmp24_148, %tmp25_165 : tensor<64x64xf32> loc(#loc333)
+      %tmp29 = arith.constant 0.000000e+00 : f32 loc(#loc334)
+      %tmp29_166 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc334)
+      %tmp29_167 = arith.subf %tmp29_166, %tmp27 : tensor<64x64xf32> loc(#loc334)
+      %tmp30 = arith.constant 0.000000e+00 : f32 loc(#loc335)
+      %tmp30_168 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc335)
+      %tmp31 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc336)
+      %tmp31_169 = arith.select %tmp31, %tmp29_167, %tmp30_168 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc336)
+      %tmp32 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc337)
+      %tmp32_170 = arith.constant dense<1> : tensor<1x64xi64> loc(#loc337)
+      %tmp32_171 = arith.cmpi sge, %tmp32, %tmp32_170 : tensor<1x64xi64> loc(#loc337)
+      %tmp33 = arith.constant 2 : i64 loc(#loc338)
+      %tmp33_172 = arith.constant dense<2> : tensor<1x1xi64> loc(#loc338)
+      %tmp34 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc339)
+      %tmp34_173 = arith.constant dense<2> : tensor<1x64xi64> loc(#loc339)
+      %tmp34_174 = arith.cmpi slt, %tmp34, %tmp34_173 : tensor<1x64xi64> loc(#loc339)
+      %tmp35 = arith.constant 2 : i32 loc(#loc340)
+      %tmp35_175 = arith.constant 2 : i32 loc(#loc340)
+      %tmp35_176 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc340)
+      %tmp35_177 = arith.muli %tmp35_176, %r0_4_30 : tensor<1x64xi32> loc(#loc340)
+      %tmp35_178 = arith.constant 128 : i32 loc(#loc341)
+      %tmp35_179 = arith.constant 128 : i32 loc(#loc341)
+      %tmp35_180 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc341)
+      %tmp35_181 = arith.muli %tmp35_180, %x0_12 : tensor<64x1xi32> loc(#loc341)
+      %tmp35_182 = tt.broadcast %tmp35_177 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc342)
+      %tmp35_183 = tt.broadcast %tmp35_181 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc342)
+      %tmp35_184 = arith.addi %tmp35_182, %tmp35_183 : tensor<64x64xi32> loc(#loc342)
+      %tmp35_185 = arith.constant 36864 : i32 loc(#loc343)
+      %tmp35_186 = arith.constant 36864 : i32 loc(#loc343)
+      %tmp35_187 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc343)
+      %tmp35_188 = arith.muli %tmp35_187, %x1_15 : tensor<64x1xi32> loc(#loc343)
+      %tmp35_189 = tt.broadcast %tmp35_188 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc344)
+      %tmp35_190 = arith.addi %tmp35_184, %tmp35_189 : tensor<64x64xi32> loc(#loc344)
+      %tmp35_191 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc345)
+      %tmp35_192 = tt.addptr %tmp35_191, %tmp35_190 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc345)
+      %tmp35_193 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc346)
+      %tmp35_194 = arith.constant 0.000000e+00 : f32 loc(#loc347)
+      %tmp35_195 = tt.broadcast %tmp35_193 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc347)
+      %tmp35_196 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc347)
+      %tmp35_197 = arith.truncf %tmp35_196 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc347)
+      %tmp35_198 = tt.load %tmp35_192, %tmp35_195, %tmp35_197 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc347)
+      %tmp35_199 = arith.extf %tmp35_198 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc348)
+      %tmp37 = arith.constant 1.280000e+02 : f32 loc(#loc349)
+      %tmp38 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc350)
+      %tmp38_200 = arith.divf %tmp10_20, %tmp38 : tensor<64x1xf32> loc(#loc350)
+      %tmp39 = arith.constant 9.99999997E-7 : f32 loc(#loc351)
+      %tmp40 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc352)
+      %tmp40_201 = arith.addf %tmp38_200, %tmp40 : tensor<64x1xf32> loc(#loc352)
+      %tmp41 = tt.extern_elementwise %tmp40_201 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc353)
+      %tmp42 = tt.broadcast %tmp41 : tensor<64x1xf32> -> tensor<64x64xf32> loc(#loc354)
+      %tmp42_202 = arith.mulf %tmp35_199, %tmp42 : tensor<64x64xf32> loc(#loc354)
+      %tmp43 = arith.constant 2 : i32 loc(#loc355)
+      %tmp43_203 = arith.constant 2 : i32 loc(#loc355)
+      %tmp43_204 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc355)
+      %tmp43_205 = arith.muli %tmp43_204, %r0_4_30 : tensor<1x64xi32> loc(#loc355)
+      %tmp43_206 = tt.broadcast %tmp43_205 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc356)
+      %tmp43_207 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc357)
+      %tmp43_208 = tt.addptr %tmp43_207, %tmp43_206 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc357)
+      %tmp43_209 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc358)
+      %tmp43_210 = arith.constant 0.000000e+00 : f32 loc(#loc359)
+      %tmp43_211 = tt.broadcast %tmp43_209 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc359)
+      %tmp43_212 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc359)
+      %tmp43_213 = arith.truncf %tmp43_212 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc359)
+      %tmp43_214 = tt.load %tmp43_208, %tmp43_211, %tmp43_213 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc359)
+      %tmp43_215 = arith.extf %tmp43_214 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc360)
+      %tmp45 = arith.mulf %tmp42_202, %tmp43_215 : tensor<64x64xf32> loc(#loc361)
+      %tmp47 = arith.constant 0.000000e+00 : f32 loc(#loc362)
+      %tmp47_216 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc362)
+      %tmp48 = tt.broadcast %tmp32_171 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc363)
+      %tmp48_217 = arith.select %tmp48, %tmp45, %tmp47_216 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc363)
+      %tmp49 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc364)
+      %tmp49_218 = arith.select %tmp49, %tmp31_169, %tmp48_217 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc364)
+      %tmp52 = arith.constant 1.280000e+02 : f32 loc(#loc365)
+      %tmp53 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc366)
+      %tmp53_219 = arith.divf %tmp10_20, %tmp53 : tensor<64x1xf32> loc(#loc366)
+      %tmp54 = arith.constant 9.99999997E-7 : f32 loc(#loc367)
+      %tmp55 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc368)
+      %tmp55_220 = arith.addf %tmp53_219, %tmp55 : tensor<64x1xf32> loc(#loc368)
+      %tmp56 = tt.extern_elementwise %tmp55_220 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc369)
+      %tmp57 = tt.broadcast %tmp56 : tensor<64x1xf32> -> tensor<64x64xf32> loc(#loc370)
+      %tmp57_221 = arith.mulf %tmp50_50, %tmp57 : tensor<64x64xf32> loc(#loc370)
+      %tmp60 = tt.broadcast %tmp58_56 : tensor<1x64xf32> -> tensor<64x64xf32> loc(#loc371)
+      %tmp60_222 = arith.mulf %tmp57_221, %tmp60 : tensor<64x64xf32> loc(#loc371)
+      %tmp64 = arith.mulf %tmp60_222, %tmp63_68 : tensor<64x64xf32> loc(#loc372)
+      %tmp67 = arith.mulf %tmp49_218, %tmp66_80 : tensor<64x64xf32> loc(#loc373)
+      %tmp68 = arith.addf %tmp64, %tmp67 : tensor<64x64xf32> loc(#loc374)
+      %tmp70 = arith.constant 2 : i32 loc(#loc375)
+      %tmp70_223 = arith.constant 2 : i32 loc(#loc375)
+      %tmp70_224 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc375)
+      %tmp70_225 = arith.muli %tmp70_224, %r0_4_30 : tensor<1x64xi32> loc(#loc375)
+      %tmp70_226 = arith.constant 4097 : i32 loc(#loc376)
+      %tmp70_227 = arith.constant 4097 : i32 loc(#loc376)
+      %tmp70_228 = arith.constant dense<4097> : tensor<1x64xi32> loc(#loc376)
+      %tmp70_229 = arith.addi %tmp70_228, %tmp70_225 : tensor<1x64xi32> loc(#loc376)
+      %tmp70_230 = arith.constant 128 : i32 loc(#loc377)
+      %tmp70_231 = arith.constant 128 : i32 loc(#loc377)
+      %tmp70_232 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc377)
+      %tmp70_233 = arith.muli %tmp70_232, %x0_12 : tensor<64x1xi32> loc(#loc377)
+      %tmp70_234 = tt.broadcast %tmp70_229 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc378)
+      %tmp70_235 = tt.broadcast %tmp70_233 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc378)
+      %tmp70_236 = arith.addi %tmp70_234, %tmp70_235 : tensor<64x64xi32> loc(#loc378)
+      %tmp70_237 = arith.constant 36864 : i32 loc(#loc379)
+      %tmp70_238 = arith.constant 36864 : i32 loc(#loc379)
+      %tmp70_239 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc379)
+      %tmp70_240 = arith.muli %tmp70_239, %x1_15 : tensor<64x1xi32> loc(#loc379)
+      %tmp70_241 = tt.broadcast %tmp70_240 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc380)
+      %tmp70_242 = arith.addi %tmp70_236, %tmp70_241 : tensor<64x64xi32> loc(#loc380)
+      %tmp70_243 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc381)
+      %tmp70_244 = tt.addptr %tmp70_243, %tmp70_242 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc381)
+      %tmp70_245 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc382)
+      %tmp70_246 = arith.constant 0.000000e+00 : f32 loc(#loc383)
+      %tmp70_247 = tt.broadcast %tmp70_245 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc383)
+      %tmp70_248 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc383)
+      %tmp70_249 = arith.truncf %tmp70_248 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc383)
+      %tmp70_250 = tt.load %tmp70_244, %tmp70_247, %tmp70_249 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc383)
+      %tmp70_251 = arith.extf %tmp70_250 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc384)
+      %tmp72 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc385)
+      %tmp72_252 = arith.divf %tmp4_19, %tmp72 : tensor<64x1xf32> loc(#loc385)
+      %tmp73 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc386)
+      %tmp73_253 = arith.addf %tmp72_252, %tmp73 : tensor<64x1xf32> loc(#loc386)
+      %tmp74 = tt.extern_elementwise %tmp73_253 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc387)
+      %tmp75 = tt.broadcast %tmp74 : tensor<64x1xf32> -> tensor<64x64xf32> loc(#loc388)
+      %tmp75_254 = arith.mulf %tmp70_251, %tmp75 : tensor<64x64xf32> loc(#loc388)
+      %tmp76 = arith.constant 2 : i32 loc(#loc389)
+      %tmp76_255 = arith.constant 2 : i32 loc(#loc389)
+      %tmp76_256 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc389)
+      %tmp76_257 = arith.muli %tmp76_256, %r0_4_30 : tensor<1x64xi32> loc(#loc389)
+      %tmp76_258 = arith.constant 1 : i32 loc(#loc390)
+      %tmp76_259 = arith.constant 1 : i32 loc(#loc390)
+      %tmp76_260 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc390)
+      %tmp76_261 = arith.addi %tmp76_260, %tmp76_257 : tensor<1x64xi32> loc(#loc390)
+      %tmp76_262 = tt.broadcast %tmp76_261 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc391)
+      %tmp76_263 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc392)
+      %tmp76_264 = tt.addptr %tmp76_263, %tmp76_262 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc392)
+      %tmp76_265 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc393)
+      %tmp76_266 = arith.constant 0.000000e+00 : f32 loc(#loc394)
+      %tmp76_267 = tt.broadcast %tmp76_265 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc394)
+      %tmp76_268 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc394)
+      %tmp76_269 = arith.truncf %tmp76_268 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc394)
+      %tmp76_270 = tt.load %tmp76_264, %tmp76_267, %tmp76_269 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc394)
+      %tmp76_271 = arith.extf %tmp76_270 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc395)
+      %tmp78 = arith.mulf %tmp75_254, %tmp76_271 : tensor<64x64xf32> loc(#loc396)
+      %tmp80 = arith.constant 0.000000e+00 : f32 loc(#loc397)
+      %tmp80_272 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc397)
+      %tmp80_273 = arith.subf %tmp80_272, %tmp78 : tensor<64x64xf32> loc(#loc397)
+      %tmp81 = arith.constant 0.000000e+00 : f32 loc(#loc398)
+      %tmp81_274 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc398)
+      %tmp82 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc399)
+      %tmp82_275 = arith.select %tmp82, %tmp80_273, %tmp81_274 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc399)
+      %tmp83 = arith.constant 2 : i32 loc(#loc400)
+      %tmp83_276 = arith.constant 2 : i32 loc(#loc400)
+      %tmp83_277 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc400)
+      %tmp83_278 = arith.muli %tmp83_277, %r0_4_30 : tensor<1x64xi32> loc(#loc400)
+      %tmp83_279 = arith.constant 4096 : i32 loc(#loc401)
+      %tmp83_280 = arith.constant 4096 : i32 loc(#loc401)
+      %tmp83_281 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc401)
+      %tmp83_282 = arith.addi %tmp83_281, %tmp83_278 : tensor<1x64xi32> loc(#loc401)
+      %tmp83_283 = arith.constant 128 : i32 loc(#loc402)
+      %tmp83_284 = arith.constant 128 : i32 loc(#loc402)
+      %tmp83_285 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc402)
+      %tmp83_286 = arith.muli %tmp83_285, %x0_12 : tensor<64x1xi32> loc(#loc402)
+      %tmp83_287 = tt.broadcast %tmp83_282 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc403)
+      %tmp83_288 = tt.broadcast %tmp83_286 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc403)
+      %tmp83_289 = arith.addi %tmp83_287, %tmp83_288 : tensor<64x64xi32> loc(#loc403)
+      %tmp83_290 = arith.constant 36864 : i32 loc(#loc404)
+      %tmp83_291 = arith.constant 36864 : i32 loc(#loc404)
+      %tmp83_292 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc404)
+      %tmp83_293 = arith.muli %tmp83_292, %x1_15 : tensor<64x1xi32> loc(#loc404)
+      %tmp83_294 = tt.broadcast %tmp83_293 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc405)
+      %tmp83_295 = arith.addi %tmp83_289, %tmp83_294 : tensor<64x64xi32> loc(#loc405)
+      %tmp83_296 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc406)
+      %tmp83_297 = tt.addptr %tmp83_296, %tmp83_295 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc406)
+      %tmp83_298 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc407)
+      %tmp83_299 = arith.constant 0.000000e+00 : f32 loc(#loc408)
+      %tmp83_300 = tt.broadcast %tmp83_298 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc408)
+      %tmp83_301 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc408)
+      %tmp83_302 = arith.truncf %tmp83_301 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc408)
+      %tmp83_303 = tt.load %tmp83_297, %tmp83_300, %tmp83_302 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc408)
+      %tmp83_304 = arith.extf %tmp83_303 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc409)
+      %tmp85 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc410)
+      %tmp85_305 = arith.divf %tmp4_19, %tmp85 : tensor<64x1xf32> loc(#loc410)
+      %tmp86 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc411)
+      %tmp86_306 = arith.addf %tmp85_305, %tmp86 : tensor<64x1xf32> loc(#loc411)
+      %tmp87 = tt.extern_elementwise %tmp86_306 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc412)
+      %tmp88 = tt.broadcast %tmp87 : tensor<64x1xf32> -> tensor<64x64xf32> loc(#loc413)
+      %tmp88_307 = arith.mulf %tmp83_304, %tmp88 : tensor<64x64xf32> loc(#loc413)
+      %tmp89 = arith.constant 2 : i32 loc(#loc414)
+      %tmp89_308 = arith.constant 2 : i32 loc(#loc414)
+      %tmp89_309 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc414)
+      %tmp89_310 = arith.muli %tmp89_309, %r0_4_30 : tensor<1x64xi32> loc(#loc414)
+      %tmp89_311 = tt.broadcast %tmp89_310 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc415)
+      %tmp89_312 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc416)
+      %tmp89_313 = tt.addptr %tmp89_312, %tmp89_311 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc416)
+      %tmp89_314 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc417)
+      %tmp89_315 = arith.constant 0.000000e+00 : f32 loc(#loc418)
+      %tmp89_316 = tt.broadcast %tmp89_314 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc418)
+      %tmp89_317 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc418)
+      %tmp89_318 = arith.truncf %tmp89_317 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc418)
+      %tmp89_319 = tt.load %tmp89_313, %tmp89_316, %tmp89_318 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc418)
+      %tmp89_320 = arith.extf %tmp89_319 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc419)
+      %tmp91 = arith.mulf %tmp88_307, %tmp89_320 : tensor<64x64xf32> loc(#loc420)
+      %tmp93 = arith.constant 0.000000e+00 : f32 loc(#loc421)
+      %tmp93_321 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc421)
+      %tmp94 = tt.broadcast %tmp32_171 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc422)
+      %tmp94_322 = arith.select %tmp94, %tmp91, %tmp93_321 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc422)
+      %tmp95 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc423)
+      %tmp95_323 = arith.select %tmp95, %tmp82_275, %tmp94_322 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc423)
+      %tmp98 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc424)
+      %tmp98_324 = arith.divf %tmp4_19, %tmp98 : tensor<64x1xf32> loc(#loc424)
+      %tmp99 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc425)
+      %tmp99_325 = arith.addf %tmp98_324, %tmp99 : tensor<64x1xf32> loc(#loc425)
+      %tmp100 = tt.extern_elementwise %tmp99_325 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc426)
+      %tmp101 = tt.broadcast %tmp100 : tensor<64x1xf32> -> tensor<64x64xf32> loc(#loc427)
+      %tmp101_326 = arith.mulf %tmp96_104, %tmp101 : tensor<64x64xf32> loc(#loc427)
+      %tmp104 = tt.broadcast %tmp102_110 : tensor<1x64xf32> -> tensor<64x64xf32> loc(#loc428)
+      %tmp104_327 = arith.mulf %tmp101_326, %tmp104 : tensor<64x64xf32> loc(#loc428)
+      %tmp107 = arith.mulf %tmp104_327, %tmp63_68 : tensor<64x64xf32> loc(#loc429)
+      %tmp109 = arith.mulf %tmp95_323, %tmp66_80 : tensor<64x64xf32> loc(#loc430)
+      %tmp110 = arith.addf %tmp107, %tmp109 : tensor<64x64xf32> loc(#loc431)
+      %c128_i32 = arith.constant 128 : i32 loc(#loc204)
+      %c128_i32_328 = arith.constant 128 : i32 loc(#loc204)
+      %cst = arith.constant dense<128> : tensor<64x1xi32> loc(#loc204)
+      %8 = arith.muli %cst, %xindex_7 : tensor<64x1xi32> loc(#loc204)
+      %9 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc205)
+      %10 = tt.broadcast %8 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc205)
+      %11 = arith.addi %9, %10 : tensor<64x64xi32> loc(#loc205)
+      %12 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc206)
+      %13 = tt.addptr %12, %11 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc206)
+      %14 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc207)
+      %15 = arith.truncf %tmp68 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc207)
+      tt.store %13, %15, %14 : tensor<64x64x!tt.ptr<bf16>> loc(#loc207)
+      %c128_i32_329 = arith.constant 128 : i32 loc(#loc208)
+      %c128_i32_330 = arith.constant 128 : i32 loc(#loc208)
+      %cst_331 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc208)
+      %16 = arith.muli %cst_331, %xindex_7 : tensor<64x1xi32> loc(#loc208)
+      %17 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc209)
+      %18 = tt.broadcast %16 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc209)
+      %19 = arith.addi %17, %18 : tensor<64x64xi32> loc(#loc209)
+      %20 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc210)
+      %21 = tt.addptr %20, %19 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc210)
+      %22 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc211)
+      %23 = arith.truncf %tmp110 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc211)
+      tt.store %21, %23, %22 : tensor<64x64x!tt.ptr<bf16>> loc(#loc211)
+    } loc(#loc44)
+    tt.return loc(#loc212)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.sum__fp32S64_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x64xf32> loc("input"(#loc213))) -> tensor<64xf32> attributes {noinline = false} { // Sums the 64x64 f32 tensor %input along axis 1 and returns the resulting 64-element f32 tensor.
+    %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ // reduction over axis 1; the region below defines how two f32 values are combined
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc214) // combining two values is delegated to the _sum_combine helper
+      tt.reduce.return %2 : f32 loc(#loc214)
+    }) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc214)
+    tt.return %0 : tensor<64xf32> loc(#loc216) // the reduced tensor is the function result
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<64xf32> loc(#loc217) // this block has no predecessors, so the poison value only serves as a placeholder return operand
+    tt.return %1 : tensor<64xf32> loc(#loc217)
+  } loc(#loc213)
+  tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc218)), %b: f32 loc("b"(#loc218))) -> f32 attributes {noinline = false} { // Combine step used by the sum reduction: returns %a + %b as f32.
+    %0 = arith.addf %a, %b : f32 loc(#loc219) // floating-point addition of the two operands
+    tt.return %0 : f32 loc(#loc220)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : f32 loc(#loc221) // this block has no predecessors, so the poison value only serves as a placeholder return operand
+    tt.return %1 : f32 loc(#loc221)
+  } loc(#loc218)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":25:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":30:43)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":32:44)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:45)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:56)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:46)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:42)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:53)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:64)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":68:35)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":69:25)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":70:35)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:52)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:63)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":74:16)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":76:16)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:57)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:55)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:63)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:95)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":85:42)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":88:35)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":89:24)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:37)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:48)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:59)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":92:16)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":93:25)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":94:16)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":95:24)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":96:32)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:53)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:59)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:91)
+#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":102:42)
+#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":106:16)
+#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":107:25)
+#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":108:16)
+#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":109:24)
+#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":110:32)
+#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:44)
+#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc149 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:55)
+#loc150 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc151 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:66)
+#loc152 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc153 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc154 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:81)
+#loc155 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc156 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc157 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc158 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc159 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc160 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc161 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:57)
+#loc162 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:55)
+#loc163 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:63)
+#loc164 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc165 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:95)
+#loc166 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc167 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc168 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc169 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc170 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":132:42)
+#loc171 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc172 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:44)
+#loc173 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc174 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:55)
+#loc175 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc176 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:66)
+#loc177 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc178 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc179 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:81)
+#loc180 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc181 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc182 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":136:24)
+#loc183 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":137:24)
+#loc184 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":138:32)
+#loc185 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc186 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:53)
+#loc187 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:59)
+#loc188 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc189 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:91)
+#loc190 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc191 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc192 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc193 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":144:42)
+#loc194 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc195 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc196 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":148:24)
+#loc197 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":149:24)
+#loc198 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":150:33)
+#loc199 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc200 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc201 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc202 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc203 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc204 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc205 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc206 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc207 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc208 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:43)
+#loc209 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:39)
+#loc210 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc211 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc212 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc214 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc216 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11)
+#loc217 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4)
+#loc219 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc220 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11)
+#loc221 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4)
+#loc231 = loc("xnumel"(#loc1))
+#loc232 = loc("r0_numel"(#loc2))
+#loc233 = loc("xoffset"(#loc3))
+#loc234 = loc("xoffset"(#loc4))
+#loc235 = loc("xindex"(#loc5))
+#loc236 = loc("xindex"(#loc6))
+#loc237 = loc("xindex"(#loc7))
+#loc238 = loc("xmask"(#loc8))
+#loc239 = loc("r0_base"(#loc9))
+#loc240 = loc("r0_base"(#loc10))
+#loc241 = loc("x0"(#loc11))
+#loc242 = loc("x1"(#loc12))
+#loc243 = loc("_tmp4"(#loc13))
+#loc244 = loc("_tmp10"(#loc14))
+#loc245 = loc("_tmp4"(#loc15))
+#loc246 = loc("r0_index"(#loc16))
+#loc247 = loc("r0_mask"(#loc17))
+#loc248 = loc("tmp0"(#loc18))
+#loc249 = loc("tmp0"(#loc19))
+#loc250 = loc("tmp0"(#loc20))
+#loc251 = loc("tmp0"(#loc21))
+#loc252 = loc("tmp0"(#loc22))
+#loc253 = loc("tmp0"(#loc23))
+#loc254 = loc("tmp0"(#loc24))
+#loc255 = loc("tmp0"(#loc25))
+#loc256 = loc("tmp6"(#loc26))
+#loc257 = loc("tmp6"(#loc27))
+#loc258 = loc("tmp6"(#loc28))
+#loc259 = loc("tmp6"(#loc29))
+#loc260 = loc("tmp6"(#loc30))
+#loc261 = loc("tmp6"(#loc31))
+#loc262 = loc("tmp6"(#loc32))
+#loc263 = loc("tmp2"(#loc33))
+#loc264 = loc("tmp5"(#loc34))
+#loc265 = loc("_tmp4"(#loc35))
+#loc266 = loc("tmp8"(#loc36))
+#loc267 = loc("tmp11"(#loc37))
+#loc268 = loc("_tmp10"(#loc38))
+#loc269 = loc("tmp4"(#loc40))
+#loc270 = loc("tmp4"(#loc41))
+#loc271 = loc("tmp10"(#loc42))
+#loc272 = loc("tmp10"(#loc43))
+#loc273 = loc("r0_index"(#loc45))
+#loc274 = loc("r0_mask"(#loc46))
+#loc275 = loc("r0_3"(#loc47))
+#loc276 = loc("r0_4"(#loc48))
+#loc277 = loc("tmp50"(#loc49))
+#loc278 = loc("tmp50"(#loc50))
+#loc279 = loc("tmp50"(#loc51))
+#loc280 = loc("tmp50"(#loc52))
+#loc281 = loc("tmp50"(#loc53))
+#loc282 = loc("tmp50"(#loc54))
+#loc283 = loc("tmp50"(#loc55))
+#loc284 = loc("tmp58"(#loc56))
+#loc285 = loc("tmp58"(#loc57))
+#loc286 = loc("tmp58"(#loc58))
+#loc287 = loc("tmp63"(#loc59))
+#loc288 = loc("tmp63"(#loc60))
+#loc289 = loc("tmp63"(#loc61))
+#loc290 = loc("tmp63"(#loc62))
+#loc291 = loc("tmp66"(#loc63))
+#loc292 = loc("tmp66"(#loc64))
+#loc293 = loc("tmp66"(#loc65))
+#loc294 = loc("tmp66"(#loc66))
+#loc295 = loc("tmp96"(#loc67))
+#loc296 = loc("tmp96"(#loc68))
+#loc297 = loc("tmp96"(#loc69))
+#loc298 = loc("tmp96"(#loc70))
+#loc299 = loc("tmp96"(#loc71))
+#loc300 = loc("tmp96"(#loc72))
+#loc301 = loc("tmp96"(#loc73))
+#loc302 = loc("tmp96"(#loc74))
+#loc303 = loc("tmp102"(#loc75))
+#loc304 = loc("tmp102"(#loc76))
+#loc305 = loc("tmp102"(#loc77))
+#loc306 = loc("tmp13"(#loc78))
+#loc307 = loc("tmp14"(#loc79))
+#loc308 = loc("tmp15"(#loc80))
+#loc309 = loc("tmp16"(#loc81))
+#loc310 = loc("tmp17"(#loc82))
+#loc311 = loc("tmp17"(#loc83))
+#loc312 = loc("tmp17"(#loc84))
+#loc313 = loc("tmp17"(#loc85))
+#loc314 = loc("tmp17"(#loc86))
+#loc315 = loc("tmp17"(#loc87))
+#loc316 = loc("tmp17"(#loc88))
+#loc317 = loc("tmp17"(#loc89))
+#loc318 = loc("tmp17"(#loc90))
+#loc319 = loc("tmp17"(#loc91))
+#loc320 = loc("tmp19"(#loc92))
+#loc321 = loc("tmp20"(#loc93))
+#loc322 = loc("tmp21"(#loc94))
+#loc323 = loc("tmp22"(#loc95))
+#loc324 = loc("tmp23"(#loc96))
+#loc325 = loc("tmp24"(#loc97))
+#loc326 = loc("tmp25"(#loc98))
+#loc327 = loc("tmp25"(#loc99))
+#loc328 = loc("tmp25"(#loc100))
+#loc329 = loc("tmp25"(#loc101))
+#loc330 = loc("tmp25"(#loc102))
+#loc331 = loc("tmp25"(#loc103))
+#loc332 = loc("tmp25"(#loc104))
+#loc333 = loc("tmp27"(#loc105))
+#loc334 = loc("tmp29"(#loc106))
+#loc335 = loc("tmp30"(#loc107))
+#loc336 = loc("tmp31"(#loc108))
+#loc337 = loc("tmp32"(#loc109))
+#loc338 = loc("tmp33"(#loc110))
+#loc339 = loc("tmp34"(#loc111))
+#loc340 = loc("tmp35"(#loc112))
+#loc341 = loc("tmp35"(#loc113))
+#loc342 = loc("tmp35"(#loc114))
+#loc343 = loc("tmp35"(#loc115))
+#loc344 = loc("tmp35"(#loc116))
+#loc345 = loc("tmp35"(#loc117))
+#loc346 = loc("tmp35"(#loc118))
+#loc347 = loc("tmp35"(#loc119))
+#loc348 = loc("tmp35"(#loc120))
+#loc349 = loc("tmp37"(#loc121))
+#loc350 = loc("tmp38"(#loc122))
+#loc351 = loc("tmp39"(#loc123))
+#loc352 = loc("tmp40"(#loc124))
+#loc353 = loc("tmp41"(#loc125))
+#loc354 = loc("tmp42"(#loc126))
+#loc355 = loc("tmp43"(#loc127))
+#loc356 = loc("tmp43"(#loc128))
+#loc357 = loc("tmp43"(#loc129))
+#loc358 = loc("tmp43"(#loc130))
+#loc359 = loc("tmp43"(#loc131))
+#loc360 = loc("tmp43"(#loc132))
+#loc361 = loc("tmp45"(#loc133))
+#loc362 = loc("tmp47"(#loc134))
+#loc363 = loc("tmp48"(#loc135))
+#loc364 = loc("tmp49"(#loc136))
+#loc365 = loc("tmp52"(#loc137))
+#loc366 = loc("tmp53"(#loc138))
+#loc367 = loc("tmp54"(#loc139))
+#loc368 = loc("tmp55"(#loc140))
+#loc369 = loc("tmp56"(#loc141))
+#loc370 = loc("tmp57"(#loc142))
+#loc371 = loc("tmp60"(#loc143))
+#loc372 = loc("tmp64"(#loc144))
+#loc373 = loc("tmp67"(#loc145))
+#loc374 = loc("tmp68"(#loc146))
+#loc375 = loc("tmp70"(#loc147))
+#loc376 = loc("tmp70"(#loc148))
+#loc377 = loc("tmp70"(#loc149))
+#loc378 = loc("tmp70"(#loc150))
+#loc379 = loc("tmp70"(#loc151))
+#loc380 = loc("tmp70"(#loc152))
+#loc381 = loc("tmp70"(#loc153))
+#loc382 = loc("tmp70"(#loc154))
+#loc383 = loc("tmp70"(#loc155))
+#loc384 = loc("tmp70"(#loc156))
+#loc385 = loc("tmp72"(#loc157))
+#loc386 = loc("tmp73"(#loc158))
+#loc387 = loc("tmp74"(#loc159))
+#loc388 = loc("tmp75"(#loc160))
+#loc389 = loc("tmp76"(#loc161))
+#loc390 = loc("tmp76"(#loc162))
+#loc391 = loc("tmp76"(#loc163))
+#loc392 = loc("tmp76"(#loc164))
+#loc393 = loc("tmp76"(#loc165))
+#loc394 = loc("tmp76"(#loc166))
+#loc395 = loc("tmp76"(#loc167))
+#loc396 = loc("tmp78"(#loc168))
+#loc397 = loc("tmp80"(#loc169))
+#loc398 = loc("tmp81"(#loc170))
+#loc399 = loc("tmp82"(#loc171))
+#loc400 = loc("tmp83"(#loc172))
+#loc401 = loc("tmp83"(#loc173))
+#loc402 = loc("tmp83"(#loc174))
+#loc403 = loc("tmp83"(#loc175))
+#loc404 = loc("tmp83"(#loc176))
+#loc405 = loc("tmp83"(#loc177))
+#loc406 = loc("tmp83"(#loc178))
+#loc407 = loc("tmp83"(#loc179))
+#loc408 = loc("tmp83"(#loc180))
+#loc409 = loc("tmp83"(#loc181))
+#loc410 = loc("tmp85"(#loc182))
+#loc411 = loc("tmp86"(#loc183))
+#loc412 = loc("tmp87"(#loc184))
+#loc413 = loc("tmp88"(#loc185))
+#loc414 = loc("tmp89"(#loc186))
+#loc415 = loc("tmp89"(#loc187))
+#loc416 = loc("tmp89"(#loc188))
+#loc417 = loc("tmp89"(#loc189))
+#loc418 = loc("tmp89"(#loc190))
+#loc419 = loc("tmp89"(#loc191))
+#loc420 = loc("tmp91"(#loc192))
+#loc421 = loc("tmp93"(#loc193))
+#loc422 = loc("tmp94"(#loc194))
+#loc423 = loc("tmp95"(#loc195))
+#loc424 = loc("tmp98"(#loc196))
+#loc425 = loc("tmp99"(#loc197))
+#loc426 = loc("tmp100"(#loc198))
+#loc427 = loc("tmp101"(#loc199))
+#loc428 = loc("tmp104"(#loc200))
+#loc429 = loc("tmp107"(#loc201))
+#loc430 = loc("tmp109"(#loc202))
+#loc431 = loc("tmp110"(#loc203))
+#loc435 = loc("_tmp10"(#loc245))
diff --git a/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..79669ec9185bdb64f7fa1a687f3989a52164dbbc
--- /dev/null
+++ b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir
@@ -0,0 +1,546 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 8], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [16, 1], order = [1, 0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc1 = loc(unknown)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc147 = loc("in_out_ptr0"(#loc))
+#loc148 = loc("in_out_ptr1"(#loc))
+#loc149 = loc("in_ptr0"(#loc))
+#loc150 = loc("in_ptr1"(#loc))
+#loc151 = loc("in_ptr2"(#loc))
+#loc152 = loc("in_ptr3"(#loc))
+#loc153 = loc("in_ptr4"(#loc))
+#loc154 = loc("xnumel"(#loc))
+#loc155 = loc("r0_numel"(#loc))
+#loc185 = loc("tmp4"(#loc33))
+#loc187 = loc("tmp10"(#loc36))
+#loc292 = loc(callsite(#loc1 at #loc185))
+#loc294 = loc(callsite(#loc1 at #loc187))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4097> : tensor<1x64xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x64xbf16, #blocked1> loc(#loc1)
+    %cst_1 = arith.constant dense<1> : tensor<1x64xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<1> : tensor<1x64xi64, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<2> : tensor<1x64xi32, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<36864> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %cst_5 = arith.constant dense<36864> : tensor<64x1xi32, #blocked1> loc(#loc1)
+    %cst_6 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %cst_7 = arith.constant dense<128> : tensor<64x1xi32, #blocked1> loc(#loc1)
+    %cst_8 = arith.constant dense<4096> : tensor<1x64xi32, #blocked> loc(#loc1)
+    %cst_9 = arith.constant dense<4096> : tensor<1x64xi32, #blocked1> loc(#loc1)
+    %cst_10 = arith.constant dense<128> : tensor<1x64xi32, #blocked> loc(#loc1)
+    %cst_11 = arith.constant dense<128> : tensor<1x64xi32, #blocked1> loc(#loc1)
+    %cst_12 = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %cst_13 = arith.constant dense<32> : tensor<64x1xi32, #blocked1> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %cst_14 = arith.constant dense<0.000000e+00> : tensor<64x64xbf16, #blocked1> loc(#loc1)
+    %cst_15 = arith.constant dense<0.000000e+00> : tensor<64x64xbf16, #blocked> loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %cst_16 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32, #blocked1> loc(#loc1)
+    %cst_17 = arith.constant dense<1.280000e+02> : tensor<64x1xf32, #blocked1> loc(#loc1)
+    %cst_18 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked> loc(#loc1)
+    %cst_19 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked1> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc156)
+    %xoffset_20 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc157)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc158)
+    %xindex_21 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc158)
+    %xindex_22 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc158)
+    %xindex_23 = tt.expand_dims %xindex_21 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc158)
+    %xindex_24 = tt.splat %xoffset_20 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc159)
+    %xindex_25 = tt.splat %xoffset_20 : i32 -> tensor<64x1xi32, #blocked> loc(#loc159)
+    %xindex_26 = arith.addi %xindex_24, %xindex_22 : tensor<64x1xi32, #blocked1> loc(#loc159)
+    %xindex_27 = arith.addi %xindex_25, %xindex_23 : tensor<64x1xi32, #blocked> loc(#loc159)
+    %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc160)
+    %r0_base_28 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc160)
+    %r0_base_29 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc160)
+    %r0_base_30 = tt.expand_dims %r0_base_28 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> loc(#loc160)
+    %x0 = arith.remsi %xindex_26, %cst_13 : tensor<64x1xi32, #blocked1> loc(#loc161)
+    %x0_31 = arith.remsi %xindex_27, %cst_12 : tensor<64x1xi32, #blocked> loc(#loc161)
+    %x1 = arith.divsi %xindex_26, %cst_13 : tensor<64x1xi32, #blocked1> loc(#loc162)
+    %x1_32 = arith.divsi %xindex_27, %cst_12 : tensor<64x1xi32, #blocked> loc(#loc162)
+    %tmp0 = arith.muli %x0, %cst_7 : tensor<64x1xi32, #blocked1> loc(#loc163)
+    %tmp0_33 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc164)
+    %tmp0_34 = arith.muli %x1, %cst_5 : tensor<64x1xi32, #blocked1> loc(#loc165)
+    %tmp0_35 = tt.broadcast %tmp0_34 : tensor<64x1xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc166)
+    %tmp0_36 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>, #blocked1> loc(#loc167)
+    %_tmp10:2 = scf.for %_tmp10_51 = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%arg10 = %cst_19, %arg11 = %cst_19) -> (tensor<64x64xf32, #blocked1>, tensor<64x64xf32, #blocked1>)  : i32 {
+      %r0_index = tt.splat %_tmp10_51 : i32 -> tensor<1x64xi32, #blocked1> loc(#loc169)
+      %r0_index_52 = arith.addi %r0_index, %r0_base_29 : tensor<1x64xi32, #blocked1> loc(#loc169)
+      %r0_mask = arith.cmpi slt, %r0_index_52, %cst_11 : tensor<1x64xi32, #blocked1> loc(#loc170)
+      %tmp0_53 = arith.addi %r0_index_52, %cst_9 : tensor<1x64xi32, #blocked1> loc(#loc171)
+      %tmp0_54 = tt.broadcast %tmp0_53 : tensor<1x64xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc164)
+      %tmp0_55 = arith.addi %tmp0_54, %tmp0_33 : tensor<64x64xi32, #blocked1> loc(#loc164)
+      %tmp0_56 = arith.addi %tmp0_55, %tmp0_35 : tensor<64x64xi32, #blocked1> loc(#loc166)
+      %tmp0_57 = tt.addptr %tmp0_36, %tmp0_56 : tensor<64x64x!tt.ptr<bf16>, #blocked1>, tensor<64x64xi32, #blocked1> loc(#loc167)
+      %tmp0_58 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked1> -> tensor<64x64xi1, #blocked1> loc(#loc172)
+      %tmp0_59 = tt.load %tmp0_57, %tmp0_58, %cst_14 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>, #blocked1> loc(#loc172)
+      %tmp0_60 = arith.extf %tmp0_59 : tensor<64x64xbf16, #blocked1> to tensor<64x64xf32, #blocked1> loc(#loc173)
+      %tmp6 = tt.broadcast %r0_index_52 : tensor<1x64xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc174)
+      %tmp6_61 = arith.addi %tmp6, %tmp0_33 : tensor<64x64xi32, #blocked1> loc(#loc174)
+      %tmp6_62 = arith.addi %tmp6_61, %tmp0_35 : tensor<64x64xi32, #blocked1> loc(#loc175)
+      %tmp6_63 = tt.addptr %tmp0_36, %tmp6_62 : tensor<64x64x!tt.ptr<bf16>, #blocked1>, tensor<64x64xi32, #blocked1> loc(#loc176)
+      %tmp6_64 = tt.load %tmp6_63, %tmp0_58, %cst_14 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>, #blocked1> loc(#loc177)
+      %tmp6_65 = arith.extf %tmp6_64 : tensor<64x64xbf16, #blocked1> to tensor<64x64xf32, #blocked1> loc(#loc178)
+      %tmp2 = arith.mulf %tmp0_60, %tmp0_60 : tensor<64x64xf32, #blocked1> loc(#loc179)
+      %tmp5 = arith.addf %arg10, %tmp2 : tensor<64x64xf32, #blocked1> loc(#loc180)
+      %_tmp4 = arith.select %tmp0_58, %tmp5, %arg10 : tensor<64x64xi1, #blocked1>, tensor<64x64xf32, #blocked1> loc(#loc181)
+      %tmp8 = arith.mulf %tmp6_65, %tmp6_65 : tensor<64x64xf32, #blocked1> loc(#loc182)
+      %tmp11 = arith.addf %arg11, %tmp8 : tensor<64x64xf32, #blocked1> loc(#loc183)
+      %_tmp10_66 = arith.select %tmp0_58, %tmp11, %arg11 : tensor<64x64xi1, #blocked1>, tensor<64x64xf32, #blocked1> loc(#loc184)
+      scf.yield %_tmp4, %_tmp10_66 : tensor<64x64xf32, #blocked1>, tensor<64x64xf32, #blocked1> loc(#loc31)
+    } loc(#loc290)
+    %tmp4 = "tt.reduce"(%_tmp10#0) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_51: f32 loc(callsite(#loc1 at #loc185)), %tmp4_52: f32 loc(callsite(#loc1 at #loc185))):
+      %tmp4_53 = arith.addf %tmp4_51, %tmp4_52 : f32 loc(#loc297)
+      tt.reduce.return %tmp4_53 : f32 loc(#loc291)
+    }) : (tensor<64x64xf32, #blocked1>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc291)
+    %tmp4_37 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc186)
+    %tmp10 = "tt.reduce"(%_tmp10#1) <{axis = 1 : i32}> ({
+    ^bb0(%tmp10_51: f32 loc(callsite(#loc1 at #loc187)), %tmp10_52: f32 loc(callsite(#loc1 at #loc187))):
+      %tmp10_53 = arith.addf %tmp10_51, %tmp10_52 : f32 loc(#loc298)
+      tt.reduce.return %tmp10_53 : f32 loc(#loc293)
+    }) : (tensor<64x64xf32, #blocked1>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc293)
+    %tmp10_38 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc188)
+    %tmp50 = arith.muli %x0_31, %cst_6 : tensor<64x1xi32, #blocked> loc(#loc189)
+    %tmp50_39 = tt.broadcast %tmp50 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc190)
+    %tmp50_40 = arith.muli %x1_32, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc191)
+    %tmp50_41 = tt.broadcast %tmp50_40 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc192)
+    %tmp50_42 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>, #blocked> loc(#loc193)
+    %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>, #blocked> loc(#loc194)
+    %tmp58_43 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>, #blocked1> loc(#loc194)
+    %tmp63 = arith.muli %x1, %cst_7 : tensor<64x1xi32, #blocked1> loc(#loc195)
+    %tmp63_44 = tt.broadcast %tmp63 : tensor<64x1xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc196)
+    %tmp63_45 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<64x64x!tt.ptr<f32>, #blocked1> loc(#loc197)
+    %tmp66 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<64x64x!tt.ptr<f32>, #blocked1> loc(#loc198)
+    %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>, #blocked> loc(#loc199)
+    %tmp102_46 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>, #blocked1> loc(#loc199)
+    %tmp20 = arith.divf %tmp10_38, %cst_17 : tensor<64x1xf32, #blocked1> loc(#loc200)
+    %tmp22 = arith.addf %tmp20, %cst_16 : tensor<64x1xf32, #blocked1> loc(#loc201)
+    %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked1>) -> tensor<64x1xf32, #blocked1> loc(#loc202)
+    %tmp24 = ttg.convert_layout %tmp23 : tensor<64x1xf32, #blocked1> -> tensor<64x1xf32, #blocked> loc(#loc203)
+    %tmp24_47 = tt.broadcast %tmp24 : tensor<64x1xf32, #blocked> -> tensor<64x64xf32, #blocked> loc(#loc203)
+    %tmp24_48 = tt.broadcast %tmp23 : tensor<64x1xf32, #blocked1> -> tensor<64x64xf32, #blocked1> loc(#loc203)
+    %tmp72 = arith.divf %tmp4_37, %cst_17 : tensor<64x1xf32, #blocked1> loc(#loc204)
+    %tmp73 = arith.addf %tmp72, %cst_16 : tensor<64x1xf32, #blocked1> loc(#loc205)
+    %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked1>) -> tensor<64x1xf32, #blocked1> loc(#loc206)
+    %tmp75 = ttg.convert_layout %tmp74 : tensor<64x1xf32, #blocked1> -> tensor<64x1xf32, #blocked> loc(#loc207)
+    %tmp75_49 = tt.broadcast %tmp75 : tensor<64x1xf32, #blocked> -> tensor<64x64xf32, #blocked> loc(#loc207)
+    %tmp75_50 = tt.broadcast %tmp74 : tensor<64x1xf32, #blocked1> -> tensor<64x64xf32, #blocked1> loc(#loc207)
+    %0 = arith.muli %xindex_26, %cst_7 : tensor<64x1xi32, #blocked1> loc(#loc57)
+    %1 = tt.broadcast %0 : tensor<64x1xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc58)
+    %2 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>, #blocked1> loc(#loc59)
+    %3 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>, #blocked1> loc(#loc60)
+    scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32, #blocked1> loc(#loc208)
+      %r0_index_51 = tt.splat %r0_offset : i32 -> tensor<1x64xi32, #blocked> loc(#loc208)
+      %r0_index_52 = arith.addi %r0_index, %r0_base_29 : tensor<1x64xi32, #blocked1> loc(#loc208)
+      %r0_index_53 = arith.addi %r0_index_51, %r0_base_30 : tensor<1x64xi32, #blocked> loc(#loc208)
+      %r0_mask = arith.cmpi slt, %r0_index_52, %cst_11 : tensor<1x64xi32, #blocked1> loc(#loc209)
+      %r0_mask_54 = arith.cmpi slt, %r0_index_53, %cst_10 : tensor<1x64xi32, #blocked> loc(#loc209)
+      %r0_3 = arith.remsi %r0_index_53, %cst_3 : tensor<1x64xi32, #blocked> loc(#loc210)
+      %r0_4 = arith.divsi %r0_index_53, %cst_3 : tensor<1x64xi32, #blocked> loc(#loc211)
+      %tmp50_55 = tt.broadcast %r0_index_52 : tensor<1x64xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc190)
+      %tmp50_56 = arith.addi %tmp50_55, %tmp0_33 : tensor<64x64xi32, #blocked1> loc(#loc190)
+      %tmp50_57 = arith.addi %tmp50_56, %tmp0_35 : tensor<64x64xi32, #blocked1> loc(#loc192)
+      %tmp50_58 = tt.addptr %tmp0_36, %tmp50_57 : tensor<64x64x!tt.ptr<bf16>, #blocked1>, tensor<64x64xi32, #blocked1> loc(#loc193)
+      %tmp50_59 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked1> -> tensor<64x64xi1, #blocked1> loc(#loc212)
+      %tmp50_60 = tt.load %tmp50_58, %tmp50_59, %cst_14 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>, #blocked1> loc(#loc212)
+      %tmp50_61 = arith.extf %tmp50_60 : tensor<64x64xbf16, #blocked1> to tensor<64x64xf32, #blocked1> loc(#loc213)
+      %tmp58_62 = tt.addptr %tmp58_43, %r0_index_52 : tensor<1x64x!tt.ptr<bf16>, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc194)
+      %tmp58_63 = tt.load %tmp58_62, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x64x!tt.ptr<bf16>, #blocked1> loc(#loc214)
+      %tmp58_64 = arith.extf %tmp58_63 : tensor<1x64xbf16, #blocked1> to tensor<1x64xf32, #blocked1> loc(#loc215)
+      %tmp63_65 = arith.addi %tmp50_55, %tmp63_44 : tensor<64x64xi32, #blocked1> loc(#loc196)
+      %tmp63_66 = tt.addptr %tmp63_45, %tmp63_65 : tensor<64x64x!tt.ptr<f32>, #blocked1>, tensor<64x64xi32, #blocked1> loc(#loc197)
+      %tmp63_67 = tt.load %tmp63_66, %tmp50_59, %cst_19 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<f32>, #blocked1> loc(#loc216)
+      %tmp66_68 = tt.addptr %tmp66, %tmp63_65 : tensor<64x64x!tt.ptr<f32>, #blocked1>, tensor<64x64xi32, #blocked1> loc(#loc198)
+      %tmp66_69 = tt.load %tmp66_68, %tmp50_59, %cst_19 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<f32>, #blocked1> loc(#loc217)
+      %tmp66_70 = ttg.convert_layout %tmp66_69 : tensor<64x64xf32, #blocked1> -> tensor<64x64xf32, #blocked> loc(#loc217)
+      %tmp96 = arith.addi %r0_index_52, %cst_9 : tensor<1x64xi32, #blocked1> loc(#loc218)
+      %tmp96_71 = tt.broadcast %tmp96 : tensor<1x64xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc219)
+      %tmp96_72 = arith.addi %tmp96_71, %tmp0_33 : tensor<64x64xi32, #blocked1> loc(#loc219)
+      %tmp96_73 = arith.addi %tmp96_72, %tmp0_35 : tensor<64x64xi32, #blocked1> loc(#loc220)
+      %tmp96_74 = tt.addptr %tmp0_36, %tmp96_73 : tensor<64x64x!tt.ptr<bf16>, #blocked1>, tensor<64x64xi32, #blocked1> loc(#loc221)
+      %tmp96_75 = tt.load %tmp96_74, %tmp50_59, %cst_14 evictionPolicy = evict_first : tensor<64x64x!tt.ptr<bf16>, #blocked1> loc(#loc222)
+      %tmp96_76 = arith.extf %tmp96_75 : tensor<64x64xbf16, #blocked1> to tensor<64x64xf32, #blocked1> loc(#loc223)
+      %tmp102_77 = tt.addptr %tmp102_46, %r0_index_52 : tensor<1x64x!tt.ptr<bf16>, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc199)
+      %tmp102_78 = tt.load %tmp102_77, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x64x!tt.ptr<bf16>, #blocked1> loc(#loc224)
+      %tmp102_79 = arith.extf %tmp102_78 : tensor<1x64xbf16, #blocked1> to tensor<1x64xf32, #blocked1> loc(#loc225)
+      %tmp16 = arith.extsi %r0_3 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked> loc(#loc226)
+      %tmp16_80 = arith.cmpi slt, %tmp16, %cst_2 : tensor<1x64xi64, #blocked> loc(#loc226)
+      %tmp17 = arith.muli %r0_4, %cst_3 : tensor<1x64xi32, #blocked> loc(#loc227)
+      %tmp17_81 = arith.addi %tmp17, %cst_1 : tensor<1x64xi32, #blocked> loc(#loc228)
+      %tmp17_82 = tt.broadcast %tmp17_81 : tensor<1x64xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc229)
+      %tmp17_83 = arith.addi %tmp17_82, %tmp50_39 : tensor<64x64xi32, #blocked> loc(#loc229)
+      %tmp17_84 = arith.addi %tmp17_83, %tmp50_41 : tensor<64x64xi32, #blocked> loc(#loc230)
+      %tmp17_85 = tt.addptr %tmp50_42, %tmp17_84 : tensor<64x64x!tt.ptr<bf16>, #blocked>, tensor<64x64xi32, #blocked> loc(#loc231)
+      %tmp17_86 = arith.andi %r0_mask_54, %tmp16_80 : tensor<1x64xi1, #blocked> loc(#loc232)
+      %tmp17_87 = tt.broadcast %tmp17_86 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc233)
+      %tmp17_88 = tt.load %tmp17_85, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>, #blocked> loc(#loc233)
+      %tmp17_89 = arith.extf %tmp17_88 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc234)
+      %tmp24_90 = arith.mulf %tmp17_89, %tmp24_47 : tensor<64x64xf32, #blocked> loc(#loc203)
+      %tmp25 = tt.addptr %tmp58, %tmp17_81 : tensor<1x64x!tt.ptr<bf16>, #blocked>, tensor<1x64xi32, #blocked> loc(#loc235)
+      %tmp25_91 = tt.broadcast %tmp25 : tensor<1x64x!tt.ptr<bf16>, #blocked> -> tensor<64x64x!tt.ptr<bf16>, #blocked> loc(#loc235)
+      %tmp25_92 = tt.load %tmp25_91, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>, #blocked> loc(#loc236)
+      %tmp25_93 = arith.extf %tmp25_92 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc237)
+      %tmp27 = arith.mulf %tmp24_90, %tmp25_93 : tensor<64x64xf32, #blocked> loc(#loc238)
+      %tmp29 = arith.subf %cst_18, %tmp27 : tensor<64x64xf32, #blocked> loc(#loc239)
+      %tmp31 = tt.broadcast %tmp16_80 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc240)
+      %tmp32 = arith.cmpi sge, %tmp16, %cst_2 : tensor<1x64xi64, #blocked> loc(#loc241)
+      %tmp35 = tt.broadcast %tmp17 : tensor<1x64xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc242)
+      %tmp35_94 = arith.addi %tmp35, %tmp50_39 : tensor<64x64xi32, #blocked> loc(#loc242)
+      %tmp35_95 = arith.addi %tmp35_94, %tmp50_41 : tensor<64x64xi32, #blocked> loc(#loc243)
+      %tmp35_96 = tt.addptr %tmp50_42, %tmp35_95 : tensor<64x64x!tt.ptr<bf16>, #blocked>, tensor<64x64xi32, #blocked> loc(#loc244)
+      %tmp35_97 = arith.andi %r0_mask_54, %tmp32 : tensor<1x64xi1, #blocked> loc(#loc245)
+      %tmp35_98 = tt.broadcast %tmp35_97 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc246)
+      %tmp35_99 = tt.load %tmp35_96, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>, #blocked> loc(#loc246)
+      %tmp35_100 = arith.extf %tmp35_99 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc247)
+      %tmp42 = arith.mulf %tmp35_100, %tmp24_47 : tensor<64x64xf32, #blocked> loc(#loc248)
+      %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x64x!tt.ptr<bf16>, #blocked>, tensor<1x64xi32, #blocked> loc(#loc249)
+      %tmp43_101 = tt.broadcast %tmp43 : tensor<1x64x!tt.ptr<bf16>, #blocked> -> tensor<64x64x!tt.ptr<bf16>, #blocked> loc(#loc249)
+      %tmp43_102 = tt.load %tmp43_101, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>, #blocked> loc(#loc250)
+      %tmp43_103 = arith.extf %tmp43_102 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc251)
+      %tmp45 = arith.mulf %tmp42, %tmp43_103 : tensor<64x64xf32, #blocked> loc(#loc252)
+      %tmp48 = tt.broadcast %tmp32 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc253)
+      %tmp48_104 = arith.select %tmp48, %tmp45, %cst_18 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked> loc(#loc253)
+      %tmp49 = arith.select %tmp31, %tmp29, %tmp48_104 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked> loc(#loc295)
+      %tmp57 = arith.mulf %tmp50_61, %tmp24_48 : tensor<64x64xf32, #blocked1> loc(#loc255)
+      %tmp60 = tt.broadcast %tmp58_64 : tensor<1x64xf32, #blocked1> -> tensor<64x64xf32, #blocked1> loc(#loc256)
+      %tmp60_105 = arith.mulf %tmp57, %tmp60 : tensor<64x64xf32, #blocked1> loc(#loc256)
+      %tmp64 = arith.mulf %tmp60_105, %tmp63_67 : tensor<64x64xf32, #blocked1> loc(#loc257)
+      %tmp64_106 = ttg.convert_layout %tmp64 : tensor<64x64xf32, #blocked1> -> tensor<64x64xf32, #blocked> loc(#loc257)
+      %tmp67 = arith.mulf %tmp49, %tmp66_70 : tensor<64x64xf32, #blocked> loc(#loc258)
+      %tmp68 = arith.addf %tmp64_106, %tmp67 : tensor<64x64xf32, #blocked> loc(#loc259)
+      %tmp70 = arith.addi %tmp17, %cst : tensor<1x64xi32, #blocked> loc(#loc260)
+      %tmp70_107 = tt.broadcast %tmp70 : tensor<1x64xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc261)
+      %tmp70_108 = arith.addi %tmp70_107, %tmp50_39 : tensor<64x64xi32, #blocked> loc(#loc261)
+      %tmp70_109 = arith.addi %tmp70_108, %tmp50_41 : tensor<64x64xi32, #blocked> loc(#loc262)
+      %tmp70_110 = tt.addptr %tmp50_42, %tmp70_109 : tensor<64x64x!tt.ptr<bf16>, #blocked>, tensor<64x64xi32, #blocked> loc(#loc263)
+      %tmp70_111 = tt.load %tmp70_110, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>, #blocked> loc(#loc264)
+      %tmp70_112 = arith.extf %tmp70_111 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc265)
+      %tmp75_113 = arith.mulf %tmp70_112, %tmp75_49 : tensor<64x64xf32, #blocked> loc(#loc207)
+      %tmp76 = tt.addptr %tmp102, %tmp17_81 : tensor<1x64x!tt.ptr<bf16>, #blocked>, tensor<1x64xi32, #blocked> loc(#loc266)
+      %tmp76_114 = tt.broadcast %tmp76 : tensor<1x64x!tt.ptr<bf16>, #blocked> -> tensor<64x64x!tt.ptr<bf16>, #blocked> loc(#loc266)
+      %tmp76_115 = tt.load %tmp76_114, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>, #blocked> loc(#loc267)
+      %tmp76_116 = arith.extf %tmp76_115 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc268)
+      %tmp78 = arith.mulf %tmp75_113, %tmp76_116 : tensor<64x64xf32, #blocked> loc(#loc269)
+      %tmp80 = arith.subf %cst_18, %tmp78 : tensor<64x64xf32, #blocked> loc(#loc270)
+      %tmp83 = arith.addi %tmp17, %cst_8 : tensor<1x64xi32, #blocked> loc(#loc271)
+      %tmp83_117 = tt.broadcast %tmp83 : tensor<1x64xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc272)
+      %tmp83_118 = arith.addi %tmp83_117, %tmp50_39 : tensor<64x64xi32, #blocked> loc(#loc272)
+      %tmp83_119 = arith.addi %tmp83_118, %tmp50_41 : tensor<64x64xi32, #blocked> loc(#loc273)
+      %tmp83_120 = tt.addptr %tmp50_42, %tmp83_119 : tensor<64x64x!tt.ptr<bf16>, #blocked>, tensor<64x64xi32, #blocked> loc(#loc274)
+      %tmp83_121 = tt.load %tmp83_120, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>, #blocked> loc(#loc275)
+      %tmp83_122 = arith.extf %tmp83_121 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc276)
+      %tmp88 = arith.mulf %tmp83_122, %tmp75_49 : tensor<64x64xf32, #blocked> loc(#loc277)
+      %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x64x!tt.ptr<bf16>, #blocked>, tensor<1x64xi32, #blocked> loc(#loc278)
+      %tmp89_123 = tt.broadcast %tmp89 : tensor<1x64x!tt.ptr<bf16>, #blocked> -> tensor<64x64x!tt.ptr<bf16>, #blocked> loc(#loc278)
+      %tmp89_124 = tt.load %tmp89_123, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>, #blocked> loc(#loc279)
+      %tmp89_125 = arith.extf %tmp89_124 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc280)
+      %tmp91 = arith.mulf %tmp88, %tmp89_125 : tensor<64x64xf32, #blocked> loc(#loc281)
+      %tmp94 = arith.select %tmp48, %tmp91, %cst_18 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked> loc(#loc282)
+      %tmp95 = arith.select %tmp31, %tmp80, %tmp94 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked> loc(#loc296)
+      %tmp101 = arith.mulf %tmp96_76, %tmp75_50 : tensor<64x64xf32, #blocked1> loc(#loc285)
+      %tmp104 = tt.broadcast %tmp102_79 : tensor<1x64xf32, #blocked1> -> tensor<64x64xf32, #blocked1> loc(#loc286)
+      %tmp104_126 = arith.mulf %tmp101, %tmp104 : tensor<64x64xf32, #blocked1> loc(#loc286)
+      %tmp107 = arith.mulf %tmp104_126, %tmp63_67 : tensor<64x64xf32, #blocked1> loc(#loc287)
+      %tmp107_127 = ttg.convert_layout %tmp107 : tensor<64x64xf32, #blocked1> -> tensor<64x64xf32, #blocked> loc(#loc287)
+      %tmp109 = arith.mulf %tmp95, %tmp66_70 : tensor<64x64xf32, #blocked> loc(#loc288)
+      %tmp110 = arith.addf %tmp107_127, %tmp109 : tensor<64x64xf32, #blocked> loc(#loc289)
+      %4 = arith.addi %tmp50_55, %1 : tensor<64x64xi32, #blocked1> loc(#loc58)
+      %5 = tt.addptr %2, %4 : tensor<64x64x!tt.ptr<bf16>, #blocked1>, tensor<64x64xi32, #blocked1> loc(#loc59)
+      %6 = arith.truncf %tmp68 : tensor<64x64xf32, #blocked> to tensor<64x64xbf16, #blocked> loc(#loc144)
+      %7 = ttg.convert_layout %6 : tensor<64x64xbf16, #blocked> -> tensor<64x64xbf16, #blocked1> loc(#loc144)
+      tt.store %5, %7, %tmp50_59 : tensor<64x64x!tt.ptr<bf16>, #blocked1> loc(#loc144)
+      %8 = tt.addptr %3, %4 : tensor<64x64x!tt.ptr<bf16>, #blocked1>, tensor<64x64xi32, #blocked1> loc(#loc60)
+      %9 = arith.truncf %tmp110 : tensor<64x64xf32, #blocked> to tensor<64x64xbf16, #blocked> loc(#loc145)
+      %10 = ttg.convert_layout %9 : tensor<64x64xbf16, #blocked> -> tensor<64x64xbf16, #blocked1> loc(#loc145)
+      tt.store %8, %10, %tmp50_59 : tensor<64x64x!tt.ptr<bf16>, #blocked1> loc(#loc145)
+    } loc(#loc61)
+    tt.return loc(#loc146)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8)
+#loc32 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc156 = loc("xoffset"(#loc2))
+#loc157 = loc("xoffset"(#loc3))
+#loc158 = loc("xindex"(#loc4))
+#loc159 = loc("xindex"(#loc5))
+#loc160 = loc("r0_base"(#loc6))
+#loc161 = loc("x0"(#loc7))
+#loc162 = loc("x1"(#loc8))
+#loc163 = loc("tmp0"(#loc9))
+#loc164 = loc("tmp0"(#loc10))
+#loc165 = loc("tmp0"(#loc11))
+#loc166 = loc("tmp0"(#loc12))
+#loc167 = loc("tmp0"(#loc13))
+#loc168 = loc("_tmp4"(#loc14))
+#loc169 = loc("r0_index"(#loc15))
+#loc170 = loc("r0_mask"(#loc16))
+#loc171 = loc("tmp0"(#loc17))
+#loc172 = loc("tmp0"(#loc18))
+#loc173 = loc("tmp0"(#loc19))
+#loc174 = loc("tmp6"(#loc20))
+#loc175 = loc("tmp6"(#loc21))
+#loc176 = loc("tmp6"(#loc22))
+#loc177 = loc("tmp6"(#loc23))
+#loc178 = loc("tmp6"(#loc24))
+#loc179 = loc("tmp2"(#loc25))
+#loc180 = loc("tmp5"(#loc26))
+#loc181 = loc("_tmp4"(#loc27))
+#loc182 = loc("tmp8"(#loc28))
+#loc183 = loc("tmp11"(#loc29))
+#loc184 = loc("_tmp10"(#loc30))
+#loc186 = loc("tmp4"(#loc35))
+#loc188 = loc("tmp10"(#loc37))
+#loc189 = loc("tmp50"(#loc38))
+#loc190 = loc("tmp50"(#loc39))
+#loc191 = loc("tmp50"(#loc40))
+#loc192 = loc("tmp50"(#loc41))
+#loc193 = loc("tmp50"(#loc42))
+#loc194 = loc("tmp58"(#loc43))
+#loc195 = loc("tmp63"(#loc44))
+#loc196 = loc("tmp63"(#loc45))
+#loc197 = loc("tmp63"(#loc46))
+#loc198 = loc("tmp66"(#loc47))
+#loc199 = loc("tmp102"(#loc48))
+#loc200 = loc("tmp20"(#loc49))
+#loc201 = loc("tmp22"(#loc50))
+#loc202 = loc("tmp23"(#loc51))
+#loc203 = loc("tmp24"(#loc52))
+#loc204 = loc("tmp72"(#loc53))
+#loc205 = loc("tmp73"(#loc54))
+#loc206 = loc("tmp74"(#loc55))
+#loc207 = loc("tmp75"(#loc56))
+#loc208 = loc("r0_index"(#loc62))
+#loc209 = loc("r0_mask"(#loc63))
+#loc210 = loc("r0_3"(#loc64))
+#loc211 = loc("r0_4"(#loc65))
+#loc212 = loc("tmp50"(#loc66))
+#loc213 = loc("tmp50"(#loc67))
+#loc214 = loc("tmp58"(#loc68))
+#loc215 = loc("tmp58"(#loc69))
+#loc216 = loc("tmp63"(#loc70))
+#loc217 = loc("tmp66"(#loc71))
+#loc218 = loc("tmp96"(#loc72))
+#loc219 = loc("tmp96"(#loc73))
+#loc220 = loc("tmp96"(#loc74))
+#loc221 = loc("tmp96"(#loc75))
+#loc222 = loc("tmp96"(#loc76))
+#loc223 = loc("tmp96"(#loc77))
+#loc224 = loc("tmp102"(#loc78))
+#loc225 = loc("tmp102"(#loc79))
+#loc226 = loc("tmp16"(#loc80))
+#loc227 = loc("tmp17"(#loc81))
+#loc228 = loc("tmp17"(#loc82))
+#loc229 = loc("tmp17"(#loc83))
+#loc230 = loc("tmp17"(#loc84))
+#loc231 = loc("tmp17"(#loc85))
+#loc232 = loc("tmp17"(#loc86))
+#loc233 = loc("tmp17"(#loc87))
+#loc234 = loc("tmp17"(#loc88))
+#loc235 = loc("tmp25"(#loc89))
+#loc236 = loc("tmp25"(#loc90))
+#loc237 = loc("tmp25"(#loc91))
+#loc238 = loc("tmp27"(#loc92))
+#loc239 = loc("tmp29"(#loc93))
+#loc240 = loc("tmp31"(#loc94))
+#loc241 = loc("tmp32"(#loc95))
+#loc242 = loc("tmp35"(#loc96))
+#loc243 = loc("tmp35"(#loc97))
+#loc244 = loc("tmp35"(#loc98))
+#loc245 = loc("tmp35"(#loc99))
+#loc246 = loc("tmp35"(#loc100))
+#loc247 = loc("tmp35"(#loc101))
+#loc248 = loc("tmp42"(#loc102))
+#loc249 = loc("tmp43"(#loc103))
+#loc250 = loc("tmp43"(#loc104))
+#loc251 = loc("tmp43"(#loc105))
+#loc252 = loc("tmp45"(#loc106))
+#loc253 = loc("tmp48"(#loc107))
+#loc254 = loc("tmp49"(#loc108))
+#loc255 = loc("tmp57"(#loc109))
+#loc256 = loc("tmp60"(#loc110))
+#loc257 = loc("tmp64"(#loc111))
+#loc258 = loc("tmp67"(#loc112))
+#loc259 = loc("tmp68"(#loc113))
+#loc260 = loc("tmp70"(#loc114))
+#loc261 = loc("tmp70"(#loc115))
+#loc262 = loc("tmp70"(#loc116))
+#loc263 = loc("tmp70"(#loc117))
+#loc264 = loc("tmp70"(#loc118))
+#loc265 = loc("tmp70"(#loc119))
+#loc266 = loc("tmp76"(#loc120))
+#loc267 = loc("tmp76"(#loc121))
+#loc268 = loc("tmp76"(#loc122))
+#loc269 = loc("tmp78"(#loc123))
+#loc270 = loc("tmp80"(#loc124))
+#loc271 = loc("tmp83"(#loc125))
+#loc272 = loc("tmp83"(#loc126))
+#loc273 = loc("tmp83"(#loc127))
+#loc274 = loc("tmp83"(#loc128))
+#loc275 = loc("tmp83"(#loc129))
+#loc276 = loc("tmp83"(#loc130))
+#loc277 = loc("tmp88"(#loc131))
+#loc278 = loc("tmp89"(#loc132))
+#loc279 = loc("tmp89"(#loc133))
+#loc280 = loc("tmp89"(#loc134))
+#loc281 = loc("tmp91"(#loc135))
+#loc282 = loc("tmp94"(#loc136))
+#loc283 = loc("tmp95"(#loc137))
+#loc284 = loc("tmp82"(#loc138))
+#loc285 = loc("tmp101"(#loc139))
+#loc286 = loc("tmp104"(#loc140))
+#loc287 = loc("tmp107"(#loc141))
+#loc288 = loc("tmp109"(#loc142))
+#loc289 = loc("tmp110"(#loc143))
+#loc290 = loc("_tmp10"(#loc168))
+#loc291 = loc(callsite(#loc32 at #loc185))
+#loc293 = loc(callsite(#loc32 at #loc187))
+#loc295 = loc(fused[#loc254, #loc240])
+#loc296 = loc(fused[#loc283, #loc284])
+#loc297 = loc(callsite(#loc34 at #loc291))
+#loc298 = loc(callsite(#loc34 at #loc293))
diff --git a/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..c0b6bc3aae425465c1ff99634180a3fd4c3adacf
--- /dev/null
+++ b/triton/C3PA2FQRIXNX4FILRXWMWDTESFUYR3BPZHZKRDDGR2QUBFWHGDOQ/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir
@@ -0,0 +1,516 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc1 = loc(unknown)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc148 = loc("in_out_ptr0"(#loc))
+#loc149 = loc("in_out_ptr1"(#loc))
+#loc150 = loc("in_ptr0"(#loc))
+#loc151 = loc("in_ptr1"(#loc))
+#loc152 = loc("in_ptr2"(#loc))
+#loc153 = loc("in_ptr3"(#loc))
+#loc154 = loc("in_ptr4"(#loc))
+#loc155 = loc("xnumel"(#loc))
+#loc156 = loc("r0_numel"(#loc))
+#loc187 = loc("tmp4"(#loc34))
+#loc189 = loc("tmp10"(#loc37))
+#loc294 = loc(callsite(#loc1 at #loc187))
+#loc296 = loc(callsite(#loc1 at #loc189))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<1x64xbf16> loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x64xbf16> loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %cst_1 = arith.constant dense<4097> : tensor<1x64xi32> loc(#loc1)
+    %cst_2 = arith.constant dense<9.99999997E-7> : tensor<64x1xf32> loc(#loc1)
+    %cst_3 = arith.constant dense<1.280000e+02> : tensor<64x1xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc1)
+    %cst_5 = arith.constant dense<1> : tensor<1x64xi64> loc(#loc1)
+    %cst_6 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc1)
+    %cst_7 = arith.constant dense<36864> : tensor<64x1xi32> loc(#loc1)
+    %cst_8 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1)
+    %cst_9 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc1)
+    %cst_10 = arith.constant dense<128> : tensor<1x64xi32> loc(#loc1)
+    %cst_11 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc1)
+    %cst_12 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc157)
+    %xoffset_13 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc158)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc159)
+    %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc160)
+    %xindex_15 = tt.splat %xoffset_13 : i32 -> tensor<64x1xi32> loc(#loc161)
+    %xindex_16 = arith.addi %xindex_15, %xindex_14 : tensor<64x1xi32> loc(#loc161)
+    %r0_base = tt.expand_dims %xindex {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc162)
+    %x0 = arith.remsi %xindex_16, %cst_12 : tensor<64x1xi32> loc(#loc163)
+    %x1 = arith.divsi %xindex_16, %cst_12 : tensor<64x1xi32> loc(#loc164)
+    %_tmp10:2 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%_tmp4 = %cst_11, %_tmp10_19 = %cst_11) -> (tensor<64x64xf32>, tensor<64x64xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc166)
+      %r0_index_20 = arith.addi %r0_index, %r0_base : tensor<1x64xi32> loc(#loc166)
+      %r0_mask = arith.cmpi slt, %r0_index_20, %cst_10 : tensor<1x64xi32> loc(#loc167)
+      %tmp0 = arith.addi %r0_index_20, %cst_9 : tensor<1x64xi32> loc(#loc168)
+      %tmp0_21 = arith.muli %x0, %cst_8 : tensor<64x1xi32> loc(#loc169)
+      %tmp0_22 = tt.broadcast %tmp0 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc170)
+      %tmp0_23 = tt.broadcast %tmp0_21 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc170)
+      %tmp0_24 = arith.addi %tmp0_22, %tmp0_23 : tensor<64x64xi32> loc(#loc170)
+      %tmp0_25 = arith.muli %x1, %cst_7 : tensor<64x1xi32> loc(#loc171)
+      %tmp0_26 = tt.broadcast %tmp0_25 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc172)
+      %tmp0_27 = arith.addi %tmp0_24, %tmp0_26 : tensor<64x64xi32> loc(#loc172)
+      %tmp0_28 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc173)
+      %tmp0_29 = tt.addptr %tmp0_28, %tmp0_27 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc173)
+      %tmp0_30 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc174)
+      %tmp0_31 = tt.load %tmp0_29, %tmp0_30, %cst_0 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc174)
+      %tmp0_32 = arith.extf %tmp0_31 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc175)
+      %tmp6 = tt.broadcast %r0_index_20 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc176)
+      %tmp6_33 = arith.addi %tmp6, %tmp0_23 : tensor<64x64xi32> loc(#loc176)
+      %tmp6_34 = arith.addi %tmp6_33, %tmp0_26 : tensor<64x64xi32> loc(#loc177)
+      %tmp6_35 = tt.addptr %tmp0_28, %tmp6_34 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc178)
+      %tmp6_36 = tt.load %tmp6_35, %tmp0_30, %cst_0 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc179)
+      %tmp6_37 = arith.extf %tmp6_36 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc180)
+      %tmp2 = arith.mulf %tmp0_32, %tmp0_32 : tensor<64x64xf32> loc(#loc181)
+      %tmp5 = arith.addf %_tmp4, %tmp2 : tensor<64x64xf32> loc(#loc182)
+      %_tmp4_38 = arith.select %tmp0_30, %tmp5, %_tmp4 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc183)
+      %tmp8 = arith.mulf %tmp6_37, %tmp6_37 : tensor<64x64xf32> loc(#loc184)
+      %tmp11 = arith.addf %_tmp10_19, %tmp8 : tensor<64x64xf32> loc(#loc185)
+      %_tmp10_39 = arith.select %tmp0_30, %tmp11, %_tmp10_19 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc186)
+      scf.yield %_tmp4_38, %_tmp10_39 : tensor<64x64xf32>, tensor<64x64xf32> loc(#loc32)
+    } loc(#loc292)
+    %tmp4 = "tt.reduce"(%_tmp10#0) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_19: f32 loc(callsite(#loc1 at #loc187)), %tmp4_20: f32 loc(callsite(#loc1 at #loc187))):
+      %tmp4_21 = arith.addf %tmp4_19, %tmp4_20 : f32 loc(#loc297)
+      tt.reduce.return %tmp4_21 : f32 loc(#loc293)
+    }) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc293)
+    %tmp4_17 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc188)
+    %tmp10 = "tt.reduce"(%_tmp10#1) <{axis = 1 : i32}> ({
+    ^bb0(%tmp10_19: f32 loc(callsite(#loc1 at #loc189)), %tmp10_20: f32 loc(callsite(#loc1 at #loc189))):
+      %tmp10_21 = arith.addf %tmp10_19, %tmp10_20 : f32 loc(#loc298)
+      tt.reduce.return %tmp10_21 : f32 loc(#loc295)
+    }) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc295)
+    %tmp10_18 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc190)
+    scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc191)
+      %r0_index_19 = arith.addi %r0_index, %r0_base : tensor<1x64xi32> loc(#loc191)
+      %r0_mask = arith.cmpi slt, %r0_index_19, %cst_10 : tensor<1x64xi32> loc(#loc192)
+      %r0_3 = arith.remsi %r0_index_19, %cst_6 : tensor<1x64xi32> loc(#loc193)
+      %r0_4 = arith.divsi %r0_index_19, %cst_6 : tensor<1x64xi32> loc(#loc194)
+      %tmp50 = arith.muli %x0, %cst_8 : tensor<64x1xi32> loc(#loc195)
+      %tmp50_20 = tt.broadcast %r0_index_19 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc196)
+      %tmp50_21 = tt.broadcast %tmp50 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc196)
+      %tmp50_22 = arith.addi %tmp50_20, %tmp50_21 : tensor<64x64xi32> loc(#loc196)
+      %tmp50_23 = arith.muli %x1, %cst_7 : tensor<64x1xi32> loc(#loc197)
+      %tmp50_24 = tt.broadcast %tmp50_23 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc198)
+      %tmp50_25 = arith.addi %tmp50_22, %tmp50_24 : tensor<64x64xi32> loc(#loc198)
+      %tmp50_26 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc199)
+      %tmp50_27 = tt.addptr %tmp50_26, %tmp50_25 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc199)
+      %tmp50_28 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc200)
+      %tmp50_29 = tt.load %tmp50_27, %tmp50_28, %cst_0 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc200)
+      %tmp50_30 = arith.extf %tmp50_29 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc201)
+      %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>> loc(#loc202)
+      %tmp58_31 = tt.addptr %tmp58, %r0_index_19 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc202)
+      %tmp58_32 = tt.load %tmp58_31, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x64x!tt.ptr<bf16>> loc(#loc203)
+      %tmp58_33 = arith.extf %tmp58_32 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc204)
+      %tmp63 = arith.muli %x1, %cst_8 : tensor<64x1xi32> loc(#loc205)
+      %tmp63_34 = tt.broadcast %tmp63 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc206)
+      %tmp63_35 = arith.addi %tmp50_20, %tmp63_34 : tensor<64x64xi32> loc(#loc206)
+      %tmp63_36 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<64x64x!tt.ptr<f32>> loc(#loc207)
+      %tmp63_37 = tt.addptr %tmp63_36, %tmp63_35 : tensor<64x64x!tt.ptr<f32>>, tensor<64x64xi32> loc(#loc207)
+      %tmp63_38 = tt.load %tmp63_37, %tmp50_28, %cst_11 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<f32>> loc(#loc208)
+      %tmp66 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<64x64x!tt.ptr<f32>> loc(#loc209)
+      %tmp66_39 = tt.addptr %tmp66, %tmp63_35 : tensor<64x64x!tt.ptr<f32>>, tensor<64x64xi32> loc(#loc209)
+      %tmp66_40 = tt.load %tmp66_39, %tmp50_28, %cst_11 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<f32>> loc(#loc210)
+      %tmp96 = arith.addi %r0_index_19, %cst_9 : tensor<1x64xi32> loc(#loc211)
+      %tmp96_41 = tt.broadcast %tmp96 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc212)
+      %tmp96_42 = arith.addi %tmp96_41, %tmp50_21 : tensor<64x64xi32> loc(#loc212)
+      %tmp96_43 = arith.addi %tmp96_42, %tmp50_24 : tensor<64x64xi32> loc(#loc213)
+      %tmp96_44 = tt.addptr %tmp50_26, %tmp96_43 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc214)
+      %tmp96_45 = tt.load %tmp96_44, %tmp50_28, %cst_0 evictionPolicy = evict_first : tensor<64x64x!tt.ptr<bf16>> loc(#loc215)
+      %tmp96_46 = arith.extf %tmp96_45 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc216)
+      %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>> loc(#loc217)
+      %tmp102_47 = tt.addptr %tmp102, %r0_index_19 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc217)
+      %tmp102_48 = tt.load %tmp102_47, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x64x!tt.ptr<bf16>> loc(#loc218)
+      %tmp102_49 = arith.extf %tmp102_48 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc219)
+      %tmp16 = arith.extsi %r0_3 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc220)
+      %tmp16_50 = arith.cmpi slt, %tmp16, %cst_5 : tensor<1x64xi64> loc(#loc220)
+      %tmp17 = arith.muli %r0_4, %cst_6 : tensor<1x64xi32> loc(#loc221)
+      %tmp17_51 = arith.addi %tmp17, %cst_4 : tensor<1x64xi32> loc(#loc222)
+      %tmp17_52 = tt.broadcast %tmp17_51 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc223)
+      %tmp17_53 = arith.addi %tmp17_52, %tmp50_21 : tensor<64x64xi32> loc(#loc223)
+      %tmp17_54 = arith.addi %tmp17_53, %tmp50_24 : tensor<64x64xi32> loc(#loc224)
+      %tmp17_55 = tt.addptr %tmp50_26, %tmp17_54 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc225)
+      %tmp17_56 = arith.andi %r0_mask, %tmp16_50 : tensor<1x64xi1> loc(#loc226)
+      %tmp17_57 = tt.broadcast %tmp17_56 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc227)
+      %tmp17_58 = tt.load %tmp17_55, %tmp17_57, %cst_0 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc227)
+      %tmp17_59 = arith.extf %tmp17_58 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc228)
+      %tmp20 = arith.divf %tmp10_18, %cst_3 : tensor<64x1xf32> loc(#loc229)
+      %tmp22 = arith.addf %tmp20, %cst_2 : tensor<64x1xf32> loc(#loc230)
+      %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc231)
+      %tmp24 = tt.broadcast %tmp23 : tensor<64x1xf32> -> tensor<64x64xf32> loc(#loc232)
+      %tmp24_60 = arith.mulf %tmp17_59, %tmp24 : tensor<64x64xf32> loc(#loc232)
+      %tmp25 = tt.addptr %tmp58, %tmp17_51 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc233)
+      %tmp25_61 = tt.broadcast %tmp25 : tensor<1x64x!tt.ptr<bf16>> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc233)
+      %tmp25_62 = tt.load %tmp25_61, %tmp17_57, %cst_0 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc234)
+      %tmp25_63 = arith.extf %tmp25_62 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc235)
+      %tmp27 = arith.mulf %tmp24_60, %tmp25_63 : tensor<64x64xf32> loc(#loc236)
+      %tmp29 = arith.subf %cst_11, %tmp27 : tensor<64x64xf32> loc(#loc237)
+      %tmp31 = tt.broadcast %tmp16_50 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc238)
+      %tmp31_64 = arith.select %tmp31, %tmp29, %cst_11 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc238)
+      %tmp32 = arith.cmpi sge, %tmp16, %cst_5 : tensor<1x64xi64> loc(#loc239)
+      %tmp35 = tt.broadcast %tmp17 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc240)
+      %tmp35_65 = arith.addi %tmp35, %tmp50_21 : tensor<64x64xi32> loc(#loc240)
+      %tmp35_66 = arith.addi %tmp35_65, %tmp50_24 : tensor<64x64xi32> loc(#loc241)
+      %tmp35_67 = tt.addptr %tmp50_26, %tmp35_66 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc242)
+      %tmp35_68 = arith.andi %r0_mask, %tmp32 : tensor<1x64xi1> loc(#loc243)
+      %tmp35_69 = tt.broadcast %tmp35_68 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc244)
+      %tmp35_70 = tt.load %tmp35_67, %tmp35_69, %cst_0 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc244)
+      %tmp35_71 = arith.extf %tmp35_70 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc245)
+      %tmp42 = arith.mulf %tmp35_71, %tmp24 : tensor<64x64xf32> loc(#loc246)
+      %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc247)
+      %tmp43_72 = tt.broadcast %tmp43 : tensor<1x64x!tt.ptr<bf16>> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc247)
+      %tmp43_73 = tt.load %tmp43_72, %tmp35_69, %cst_0 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc248)
+      %tmp43_74 = arith.extf %tmp43_73 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc249)
+      %tmp45 = arith.mulf %tmp42, %tmp43_74 : tensor<64x64xf32> loc(#loc250)
+      %tmp48 = tt.broadcast %tmp32 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc251)
+      %tmp48_75 = arith.select %tmp48, %tmp45, %cst_11 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc251)
+      %tmp49 = arith.select %tmp31, %tmp31_64, %tmp48_75 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc252)
+      %tmp57 = arith.mulf %tmp50_30, %tmp24 : tensor<64x64xf32> loc(#loc253)
+      %tmp60 = tt.broadcast %tmp58_33 : tensor<1x64xf32> -> tensor<64x64xf32> loc(#loc254)
+      %tmp60_76 = arith.mulf %tmp57, %tmp60 : tensor<64x64xf32> loc(#loc254)
+      %tmp64 = arith.mulf %tmp60_76, %tmp63_38 : tensor<64x64xf32> loc(#loc255)
+      %tmp67 = arith.mulf %tmp49, %tmp66_40 : tensor<64x64xf32> loc(#loc256)
+      %tmp68 = arith.addf %tmp64, %tmp67 : tensor<64x64xf32> loc(#loc257)
+      %tmp70 = arith.addi %tmp17, %cst_1 : tensor<1x64xi32> loc(#loc258)
+      %tmp70_77 = tt.broadcast %tmp70 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc259)
+      %tmp70_78 = arith.addi %tmp70_77, %tmp50_21 : tensor<64x64xi32> loc(#loc259)
+      %tmp70_79 = arith.addi %tmp70_78, %tmp50_24 : tensor<64x64xi32> loc(#loc260)
+      %tmp70_80 = tt.addptr %tmp50_26, %tmp70_79 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc261)
+      %tmp70_81 = tt.load %tmp70_80, %tmp17_57, %cst_0 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc262)
+      %tmp70_82 = arith.extf %tmp70_81 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc263)
+      %tmp72 = arith.divf %tmp4_17, %cst_3 : tensor<64x1xf32> loc(#loc264)
+      %tmp73 = arith.addf %tmp72, %cst_2 : tensor<64x1xf32> loc(#loc265)
+      %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> loc(#loc266)
+      %tmp75 = tt.broadcast %tmp74 : tensor<64x1xf32> -> tensor<64x64xf32> loc(#loc267)
+      %tmp75_83 = arith.mulf %tmp70_82, %tmp75 : tensor<64x64xf32> loc(#loc267)
+      %tmp76 = tt.addptr %tmp102, %tmp17_51 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc268)
+      %tmp76_84 = tt.broadcast %tmp76 : tensor<1x64x!tt.ptr<bf16>> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc268)
+      %tmp76_85 = tt.load %tmp76_84, %tmp17_57, %cst_0 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc269)
+      %tmp76_86 = arith.extf %tmp76_85 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc270)
+      %tmp78 = arith.mulf %tmp75_83, %tmp76_86 : tensor<64x64xf32> loc(#loc271)
+      %tmp80 = arith.subf %cst_11, %tmp78 : tensor<64x64xf32> loc(#loc272)
+      %tmp82 = arith.select %tmp31, %tmp80, %cst_11 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc273)
+      %tmp83 = arith.addi %tmp17, %cst_9 : tensor<1x64xi32> loc(#loc274)
+      %tmp83_87 = tt.broadcast %tmp83 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc275)
+      %tmp83_88 = arith.addi %tmp83_87, %tmp50_21 : tensor<64x64xi32> loc(#loc275)
+      %tmp83_89 = arith.addi %tmp83_88, %tmp50_24 : tensor<64x64xi32> loc(#loc276)
+      %tmp83_90 = tt.addptr %tmp50_26, %tmp83_89 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc277)
+      %tmp83_91 = tt.load %tmp83_90, %tmp35_69, %cst_0 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc278)
+      %tmp83_92 = arith.extf %tmp83_91 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc279)
+      %tmp88 = arith.mulf %tmp83_92, %tmp75 : tensor<64x64xf32> loc(#loc280)
+      %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc281)
+      %tmp89_93 = tt.broadcast %tmp89 : tensor<1x64x!tt.ptr<bf16>> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc281)
+      %tmp89_94 = tt.load %tmp89_93, %tmp35_69, %cst_0 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc282)
+      %tmp89_95 = arith.extf %tmp89_94 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc283)
+      %tmp91 = arith.mulf %tmp88, %tmp89_95 : tensor<64x64xf32> loc(#loc284)
+      %tmp94 = arith.select %tmp48, %tmp91, %cst_11 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc285)
+      %tmp95 = arith.select %tmp31, %tmp82, %tmp94 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc286)
+      %tmp101 = arith.mulf %tmp96_46, %tmp75 : tensor<64x64xf32> loc(#loc287)
+      %tmp104 = tt.broadcast %tmp102_49 : tensor<1x64xf32> -> tensor<64x64xf32> loc(#loc288)
+      %tmp104_96 = arith.mulf %tmp101, %tmp104 : tensor<64x64xf32> loc(#loc288)
+      %tmp107 = arith.mulf %tmp104_96, %tmp63_38 : tensor<64x64xf32> loc(#loc289)
+      %tmp109 = arith.mulf %tmp95, %tmp66_40 : tensor<64x64xf32> loc(#loc290)
+      %tmp110 = arith.addf %tmp107, %tmp109 : tensor<64x64xf32> loc(#loc291)
+      %0 = arith.muli %xindex_16, %cst_8 : tensor<64x1xi32> loc(#loc141)
+      %1 = tt.broadcast %0 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc142)
+      %2 = arith.addi %tmp50_20, %1 : tensor<64x64xi32> loc(#loc142)
+      %3 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc143)
+      %4 = tt.addptr %3, %2 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc143)
+      %5 = arith.truncf %tmp68 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc144)
+      tt.store %4, %5, %tmp50_28 : tensor<64x64x!tt.ptr<bf16>> loc(#loc144)
+      %6 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc145)
+      %7 = tt.addptr %6, %2 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc145)
+      %8 = arith.truncf %tmp110 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc146)
+      tt.store %7, %8, %tmp50_28 : tensor<64x64x!tt.ptr<bf16>> loc(#loc146)
+    } loc(#loc39)
+    tt.return loc(#loc147)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc35 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc157 = loc("xoffset"(#loc2))
+#loc158 = loc("xoffset"(#loc3))
+#loc159 = loc("xindex"(#loc4))
+#loc160 = loc("xindex"(#loc5))
+#loc161 = loc("xindex"(#loc6))
+#loc162 = loc("r0_base"(#loc7))
+#loc163 = loc("x0"(#loc8))
+#loc164 = loc("x1"(#loc9))
+#loc165 = loc("_tmp4"(#loc10))
+#loc166 = loc("r0_index"(#loc11))
+#loc167 = loc("r0_mask"(#loc12))
+#loc168 = loc("tmp0"(#loc13))
+#loc169 = loc("tmp0"(#loc14))
+#loc170 = loc("tmp0"(#loc15))
+#loc171 = loc("tmp0"(#loc16))
+#loc172 = loc("tmp0"(#loc17))
+#loc173 = loc("tmp0"(#loc18))
+#loc174 = loc("tmp0"(#loc19))
+#loc175 = loc("tmp0"(#loc20))
+#loc176 = loc("tmp6"(#loc21))
+#loc177 = loc("tmp6"(#loc22))
+#loc178 = loc("tmp6"(#loc23))
+#loc179 = loc("tmp6"(#loc24))
+#loc180 = loc("tmp6"(#loc25))
+#loc181 = loc("tmp2"(#loc26))
+#loc182 = loc("tmp5"(#loc27))
+#loc183 = loc("_tmp4"(#loc28))
+#loc184 = loc("tmp8"(#loc29))
+#loc185 = loc("tmp11"(#loc30))
+#loc186 = loc("_tmp10"(#loc31))
+#loc188 = loc("tmp4"(#loc36))
+#loc190 = loc("tmp10"(#loc38))
+#loc191 = loc("r0_index"(#loc40))
+#loc192 = loc("r0_mask"(#loc41))
+#loc193 = loc("r0_3"(#loc42))
+#loc194 = loc("r0_4"(#loc43))
+#loc195 = loc("tmp50"(#loc44))
+#loc196 = loc("tmp50"(#loc45))
+#loc197 = loc("tmp50"(#loc46))
+#loc198 = loc("tmp50"(#loc47))
+#loc199 = loc("tmp50"(#loc48))
+#loc200 = loc("tmp50"(#loc49))
+#loc201 = loc("tmp50"(#loc50))
+#loc202 = loc("tmp58"(#loc51))
+#loc203 = loc("tmp58"(#loc52))
+#loc204 = loc("tmp58"(#loc53))
+#loc205 = loc("tmp63"(#loc54))
+#loc206 = loc("tmp63"(#loc55))
+#loc207 = loc("tmp63"(#loc56))
+#loc208 = loc("tmp63"(#loc57))
+#loc209 = loc("tmp66"(#loc58))
+#loc210 = loc("tmp66"(#loc59))
+#loc211 = loc("tmp96"(#loc60))
+#loc212 = loc("tmp96"(#loc61))
+#loc213 = loc("tmp96"(#loc62))
+#loc214 = loc("tmp96"(#loc63))
+#loc215 = loc("tmp96"(#loc64))
+#loc216 = loc("tmp96"(#loc65))
+#loc217 = loc("tmp102"(#loc66))
+#loc218 = loc("tmp102"(#loc67))
+#loc219 = loc("tmp102"(#loc68))
+#loc220 = loc("tmp16"(#loc69))
+#loc221 = loc("tmp17"(#loc70))
+#loc222 = loc("tmp17"(#loc71))
+#loc223 = loc("tmp17"(#loc72))
+#loc224 = loc("tmp17"(#loc73))
+#loc225 = loc("tmp17"(#loc74))
+#loc226 = loc("tmp17"(#loc75))
+#loc227 = loc("tmp17"(#loc76))
+#loc228 = loc("tmp17"(#loc77))
+#loc229 = loc("tmp20"(#loc78))
+#loc230 = loc("tmp22"(#loc79))
+#loc231 = loc("tmp23"(#loc80))
+#loc232 = loc("tmp24"(#loc81))
+#loc233 = loc("tmp25"(#loc82))
+#loc234 = loc("tmp25"(#loc83))
+#loc235 = loc("tmp25"(#loc84))
+#loc236 = loc("tmp27"(#loc85))
+#loc237 = loc("tmp29"(#loc86))
+#loc238 = loc("tmp31"(#loc87))
+#loc239 = loc("tmp32"(#loc88))
+#loc240 = loc("tmp35"(#loc89))
+#loc241 = loc("tmp35"(#loc90))
+#loc242 = loc("tmp35"(#loc91))
+#loc243 = loc("tmp35"(#loc92))
+#loc244 = loc("tmp35"(#loc93))
+#loc245 = loc("tmp35"(#loc94))
+#loc246 = loc("tmp42"(#loc95))
+#loc247 = loc("tmp43"(#loc96))
+#loc248 = loc("tmp43"(#loc97))
+#loc249 = loc("tmp43"(#loc98))
+#loc250 = loc("tmp45"(#loc99))
+#loc251 = loc("tmp48"(#loc100))
+#loc252 = loc("tmp49"(#loc101))
+#loc253 = loc("tmp57"(#loc102))
+#loc254 = loc("tmp60"(#loc103))
+#loc255 = loc("tmp64"(#loc104))
+#loc256 = loc("tmp67"(#loc105))
+#loc257 = loc("tmp68"(#loc106))
+#loc258 = loc("tmp70"(#loc107))
+#loc259 = loc("tmp70"(#loc108))
+#loc260 = loc("tmp70"(#loc109))
+#loc261 = loc("tmp70"(#loc110))
+#loc262 = loc("tmp70"(#loc111))
+#loc263 = loc("tmp70"(#loc112))
+#loc264 = loc("tmp72"(#loc113))
+#loc265 = loc("tmp73"(#loc114))
+#loc266 = loc("tmp74"(#loc115))
+#loc267 = loc("tmp75"(#loc116))
+#loc268 = loc("tmp76"(#loc117))
+#loc269 = loc("tmp76"(#loc118))
+#loc270 = loc("tmp76"(#loc119))
+#loc271 = loc("tmp78"(#loc120))
+#loc272 = loc("tmp80"(#loc121))
+#loc273 = loc("tmp82"(#loc122))
+#loc274 = loc("tmp83"(#loc123))
+#loc275 = loc("tmp83"(#loc124))
+#loc276 = loc("tmp83"(#loc125))
+#loc277 = loc("tmp83"(#loc126))
+#loc278 = loc("tmp83"(#loc127))
+#loc279 = loc("tmp83"(#loc128))
+#loc280 = loc("tmp88"(#loc129))
+#loc281 = loc("tmp89"(#loc130))
+#loc282 = loc("tmp89"(#loc131))
+#loc283 = loc("tmp89"(#loc132))
+#loc284 = loc("tmp91"(#loc133))
+#loc285 = loc("tmp94"(#loc134))
+#loc286 = loc("tmp95"(#loc135))
+#loc287 = loc("tmp101"(#loc136))
+#loc288 = loc("tmp104"(#loc137))
+#loc289 = loc("tmp107"(#loc138))
+#loc290 = loc("tmp109"(#loc139))
+#loc291 = loc("tmp110"(#loc140))
+#loc292 = loc("_tmp10"(#loc165))
+#loc293 = loc(callsite(#loc33 at #loc187))
+#loc295 = loc(callsite(#loc33 at #loc189))
+#loc297 = loc(callsite(#loc35 at #loc293))
+#loc298 = loc(callsite(#loc35 at #loc295))
diff --git a/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/__grp__triton_red_fused__fused_rms_norm_view_1.json b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/__grp__triton_red_fused__fused_rms_norm_view_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..1bcac25deb8f4c0f2844e8c053ac818a34f9ede5
--- /dev/null
+++ b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/__grp__triton_red_fused__fused_rms_norm_view_1.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused__fused_rms_norm_view_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.source", "triton_red_fused__fused_rms_norm_view_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.ttir", "triton_red_fused__fused_rms_norm_view_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.ttgir", "triton_red_fused__fused_rms_norm_view_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.llir", "triton_red_fused__fused_rms_norm_view_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.ptx", "triton_red_fused__fused_rms_norm_view_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.cubin", "triton_red_fused__fused_rms_norm_view_1.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.json"}}
\ No newline at end of file
diff --git a/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.cubin b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..70bf519b261812a409ae16a50350985ad0159570
Binary files /dev/null and b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.cubin differ
diff --git a/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.json b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..83e0ebd0e54693c260fc02493452983647f848ba
--- /dev/null
+++ b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.json
@@ -0,0 +1 @@
+{"hash": "10ca8e4b4982bd4e4f9030475d84ce5adc1ad2d514c413139d7a54bcdc665eab", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 32, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm_view_1"}
\ No newline at end of file
diff --git a/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.llir b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.llir
new file mode 100644
index 0000000000000000000000000000000000000000..02f3ff04eebd305ef59d35ebae5455700069b478
--- /dev/null
+++ b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.llir
@@ -0,0 +1,136 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused__fused_rms_norm_view_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 {
+  %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %8 = shl i32 %7, 3, !dbg !8
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %10 = and i32 %9, 224, !dbg !9
+  %11 = lshr exact i32 %10, 5, !dbg !9
+  %12 = and i32 %9, 7, !dbg !9
+  %13 = or disjoint i32 %11, %8, !dbg !10
+  %14 = or disjoint i32 %8, %12, !dbg !10
+  %15 = shl nuw nsw i32 %9, 2, !dbg !11
+  %16 = and i32 %15, 124, !dbg !11
+  %17 = sdiv i32 %13, 32, !dbg !12
+  %18 = mul i32 %17, 32, !dbg !13
+  %.decomposed = sub i32 %13, %18, !dbg !13
+  %19 = shl nsw i32 %.decomposed, 7, !dbg !14
+  %20 = or disjoint i32 %19, %16, !dbg !15
+  %21 = mul i32 %17, 12288, !dbg !16
+  %22 = add i32 %20, %21, !dbg !17
+  %23 = sext i32 %22 to i64, !dbg !18
+  %24 = getelementptr bfloat, ptr addrspace(1) %0, i64 %23, !dbg !18
+  %25 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !19
+  %26 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %24, i64 %25, i1 true) #4, !dbg !19
+  %27 = extractvalue { i32, i32 } %26, 0, !dbg !19
+  %28 = bitcast i32 %27 to <2 x bfloat>, !dbg !19
+  %29 = extractvalue { i32, i32 } %26, 1, !dbg !19
+  %30 = bitcast i32 %29 to <2 x bfloat>, !dbg !19
+  %31 = extractelement <2 x bfloat> %28, i64 0, !dbg !19
+  %32 = extractelement <2 x bfloat> %28, i64 1, !dbg !19
+  %33 = extractelement <2 x bfloat> %30, i64 0, !dbg !19
+  %34 = extractelement <2 x bfloat> %30, i64 1, !dbg !19
+  %35 = fpext bfloat %31 to float, !dbg !20
+  %36 = fpext bfloat %32 to float, !dbg !20
+  %37 = fpext bfloat %33 to float, !dbg !20
+  %38 = fpext bfloat %34 to float, !dbg !20
+  %39 = fmul float %35, %35, !dbg !21
+  %40 = fmul float %36, %36, !dbg !21
+  %41 = fmul float %37, %37, !dbg !21
+  %42 = fmul float %38, %38, !dbg !21
+  %43 = fadd float %39, %40, !dbg !22
+  %44 = fadd float %41, %43, !dbg !22
+  %45 = fadd float %42, %44, !dbg !22
+  %46 = bitcast float %45 to i32, !dbg !25
+  %47 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %46, i32 16, i32 31), !dbg !25
+  %48 = bitcast i32 %47 to float, !dbg !25
+  %49 = fadd float %45, %48, !dbg !22
+  %50 = bitcast float %49 to i32, !dbg !25
+  %51 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %50, i32 8, i32 31), !dbg !25
+  %52 = bitcast i32 %51 to float, !dbg !25
+  %53 = fadd float %49, %52, !dbg !22
+  %54 = bitcast float %53 to i32, !dbg !25
+  %55 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %54, i32 4, i32 31), !dbg !25
+  %56 = bitcast i32 %55 to float, !dbg !25
+  %57 = fadd float %53, %56, !dbg !22
+  %58 = bitcast float %57 to i32, !dbg !25
+  %59 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %58, i32 2, i32 31), !dbg !25
+  %60 = bitcast i32 %59 to float, !dbg !25
+  %61 = fadd float %57, %60, !dbg !22
+  %62 = bitcast float %61 to i32, !dbg !25
+  %63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %62, i32 1, i32 31), !dbg !25
+  %64 = bitcast i32 %63 to float, !dbg !25
+  %65 = fadd float %61, %64, !dbg !22
+  %66 = lshr exact i32 %10, 3, !dbg !28
+  %67 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %66, !dbg !28
+  store float %65, ptr addrspace(3) %67, align 4, !dbg !28
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28
+  %68 = shl nuw nsw i32 %12, 2, !dbg !28
+  %69 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %68, !dbg !28
+  %70 = load i32, ptr addrspace(3) %69, align 4, !dbg !28
+  %71 = sext i32 %14 to i64, !dbg !29
+  %72 = getelementptr float, ptr addrspace(1) %1, i64 %71, !dbg !29
+  %73 = and i32 %9, 248, !dbg !30
+  %74 = icmp eq i32 %73, 0, !dbg !30
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %70, ptr addrspace(1) %72, i1 %74) #4, !dbg !30
+  ret void, !dbg !31
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
+
+attributes #0 = { nounwind "nvvm.reqntid"="256" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { convergent nocallback nounwind }
+attributes #4 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm_view_1", linkageName: "triton_red_fused__fused_rms_norm_view_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 23, column: 28, scope: !4)
+!8 = !DILocation(line: 23, column: 33, scope: !4)
+!9 = !DILocation(line: 24, column: 44, scope: !4)
+!10 = !DILocation(line: 24, column: 23, scope: !4)
+!11 = !DILocation(line: 26, column: 37, scope: !4)
+!12 = !DILocation(line: 29, column: 19, scope: !4)
+!13 = !DILocation(line: 28, column: 19, scope: !4)
+!14 = !DILocation(line: 38, column: 45, scope: !4)
+!15 = !DILocation(line: 38, column: 41, scope: !4)
+!16 = !DILocation(line: 38, column: 56, scope: !4)
+!17 = !DILocation(line: 38, column: 50, scope: !4)
+!18 = !DILocation(line: 38, column: 34, scope: !4)
+!19 = !DILocation(line: 38, column: 61, scope: !4)
+!20 = !DILocation(line: 38, column: 115, scope: !4)
+!21 = !DILocation(line: 40, column: 22, scope: !4)
+!22 = !DILocation(line: 263, column: 15, scope: !23, inlinedAt: !25)
+!23 = distinct !DILexicalBlockFile(scope: !4, file: !24, discriminator: 0)
+!24 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!25 = !DILocation(line: 293, column: 36, scope: !23, inlinedAt: !26)
+!26 = !DILocation(line: 44, column: 25, scope: !27)
+!27 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0)
+!28 = !DILocation(line: 44, column: 28, scope: !4)
+!29 = !DILocation(line: 45, column: 25, scope: !4)
+!30 = !DILocation(line: 45, column: 36, scope: !4)
+!31 = !DILocation(line: 45, column: 4, scope: !4)
diff --git a/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.ptx b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..4e426c60e1b24ea0ec9550ecbdada64d09301a6b
--- /dev/null
+++ b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.ptx
@@ -0,0 +1,506 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused__fused_rms_norm_view_1 // -- Begin function triton_red_fused__fused_rms_norm_view_1
+.extern .shared .align 16 .b8 global_smem[];
+                                        // @triton_red_fused__fused_rms_norm_view_1
+.visible .entry triton_red_fused__fused_rms_norm_view_1(
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_1,
+	.param .u32 triton_red_fused__fused_rms_norm_view_1_param_2,
+	.param .u32 triton_red_fused__fused_rms_norm_view_1_param_3,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_4,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_5
+)
+.reqntid 256
+{
+	.reg .pred 	%p<3>;
+	.reg .b16 	%rs<5>;
+	.reg .b32 	%r<48>;
+	.reg .b64 	%rd<6>;
+	.loc	1 18 0                          // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd4, [triton_red_fused__fused_rms_norm_view_1_param_0];
+	ld.param.b64 	%rd5, [triton_red_fused__fused_rms_norm_view_1_param_1];
+$L__tmp0:
+	.loc	1 23 28                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:23:28
+	mov.u32 	%r5, %ctaid.x;
+	.loc	1 23 33                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:23:33
+	shl.b32 	%r6, %r5, 3;
+	.loc	1 24 44                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:44
+	mov.u32 	%r7, %tid.x;
+	and.b32 	%r8, %r7, 224;
+	bfe.u32 	%r9, %r7, 5, 3;
+	and.b32 	%r10, %r7, 7;
+	.loc	1 24 23                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:23
+	or.b32 	%r11, %r9, %r6;
+	or.b32 	%r12, %r6, %r10;
+	.loc	1 26 37                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:26:37
+	shl.b32 	%r13, %r7, 2;
+	and.b32 	%r14, %r13, 124;
+	.loc	1 29 19                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:29:19
+	bfe.s32 	%r15, %r5, 28, 1;
+	shr.u32 	%r16, %r15, 27;
+	add.s32 	%r17, %r11, %r16;
+	shr.u32 	%r18, %r17, 5;
+	.loc	1 28 19                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:28:19
+	and.b32 	%r19, %r17, 33554400;
+	sub.s32 	%r20, %r11, %r19;
+	.loc	1 38 45                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:45
+	shl.b32 	%r21, %r20, 7;
+	.loc	1 38 41                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:41
+	or.b32 	%r22, %r21, %r14;
+	.loc	1 38 50                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:50
+	mad.lo.s32 	%r23, %r18, 12288, %r22;
+	.loc	1 38 34                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:34
+	mad.wide.s32 	%rd1, %r23, 2, %rd4;
+	.loc	1 38 61                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:61
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r3, 0;
+	mov.pred 	%p1, -1;
+	// begin inline asm
+	mov.u32 %r1, %r3;
+	mov.u32 %r2, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	mov.b32 	{%rs1, %rs2}, %r1;
+	mov.b32 	{%rs3, %rs4}, %r2;
+	.loc	1 38 115                        // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:115
+	cvt.f32.bf16 	%r24, %rs1;
+	cvt.f32.bf16 	%r25, %rs2;
+	cvt.f32.bf16 	%r26, %rs3;
+	cvt.f32.bf16 	%r27, %rs4;
+	.loc	1 40 22                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:40:22
+	mul.f32 	%r28, %r25, %r25;
+$L__tmp1:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	fma.rn.f32 	%r29, %r24, %r24, %r28;
+	fma.rn.f32 	%r30, %r26, %r26, %r29;
+	fma.rn.f32 	%r31, %r27, %r27, %r30;
+$L__tmp2:
+	.loc	2 293 36                        // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ]
+	shfl.sync.bfly.b32 	%r32, %r31, 16, 31, -1;
+$L__tmp3:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	add.f32 	%r33, %r31, %r32;
+$L__tmp4:
+	.loc	2 293 36                        // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ]
+	shfl.sync.bfly.b32 	%r34, %r33, 8, 31, -1;
+$L__tmp5:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	add.f32 	%r35, %r33, %r34;
+$L__tmp6:
+	.loc	2 293 36                        // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ]
+	shfl.sync.bfly.b32 	%r36, %r35, 4, 31, -1;
+$L__tmp7:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	add.f32 	%r37, %r35, %r36;
+$L__tmp8:
+	.loc	2 293 36                        // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ]
+	shfl.sync.bfly.b32 	%r38, %r37, 2, 31, -1;
+$L__tmp9:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	add.f32 	%r39, %r37, %r38;
+$L__tmp10:
+	.loc	2 293 36                        // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ]
+	shfl.sync.bfly.b32 	%r40, %r39, 1, 31, -1;
+$L__tmp11:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	add.f32 	%r41, %r39, %r40;
+$L__tmp12:
+	.loc	1 44 28                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:28
+	shr.u32 	%r42, %r8, 3;
+	mov.b32 	%r43, global_smem;
+	add.s32 	%r44, %r43, %r42;
+	st.shared.b32 	[%r44], %r41;
+	bar.sync 	0;
+	shl.b32 	%r45, %r10, 2;
+	add.s32 	%r46, %r43, %r45;
+	ld.shared.b32 	%r4, [%r46];
+	.loc	1 45 25                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:25
+	mad.wide.s32 	%rd3, %r12, 4, %rd5;
+	.loc	1 45 36                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:36
+	and.b32 	%r47, %r7, 248;
+	setp.eq.b32 	%p2, %r47, 0;
+	// begin inline asm
+	@%p2 st.global.b32 [ %rd3 + 0 ], { %r4 };
+	// end inline asm
+	.loc	1 45 4                          // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:4
+	ret;
+$L__tmp13:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 5                                   // DW_FORM_data2
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 339                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x14c DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 113
+.b8 105
+.b8 116
+.b8 120
+.b8 53
+.b8 104
+.b8 119
+.b8 117
+.b8 112
+.b8 107
+.b8 98
+.b8 106
+.b8 109
+.b8 99
+.b8 115
+.b8 111
+.b8 121
+.b8 107
+.b8 113
+.b8 101
+.b8 112
+.b8 122
+.b8 113
+.b8 99
+.b8 55
+.b8 122
+.b8 99
+.b8 120
+.b8 106
+.b8 99
+.b8 98
+.b8 53
+.b8 97
+.b8 99
+.b8 113
+.b8 107
+.b8 105
+.b8 55
+.b8 122
+.b8 99
+.b8 115
+.b8 106
+.b8 105
+.b8 102
+.b8 114
+.b8 110
+.b8 114
+.b8 122
+.b8 99
+.b8 114
+.b8 114
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 113
+.b8 105
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x2a DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 114
+.b8 109
+.b8 115
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 118
+.b8 105
+.b8 101
+.b8 119
+.b8 95
+.b8 49
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x10e:0x48 DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x123:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp12                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 44                                  // DW_AT_call_line
+.b8 25                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x13b:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp12                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.source b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.source
new file mode 100644
index 0000000000000000000000000000000000000000..ff7cf9f915b20d3434f66ead436e51762c5b04b2
--- /dev/null
+++ b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.source
@@ -0,0 +1,167 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0)
+#loc35 = loc(unknown)
+#loc38 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0)
+#loc42 = loc("in_ptr0"(#loc))
+#loc43 = loc("out_ptr0"(#loc))
+#loc44 = loc("xnumel"(#loc))
+#loc45 = loc("r0_numel"(#loc))
+#loc74 = loc("input"(#loc33))
+#loc75 = loc("a"(#loc38))
+#loc76 = loc("b"(#loc38))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 65536 : i32 loc(#loc46)
+    %r0_numel_1 = arith.constant 128 : i32 loc(#loc47)
+    %xoffset = tt.get_program_id x : i32 loc(#loc48)
+    %xoffset_2 = arith.constant 8 : i32 loc(#loc49)
+    %xoffset_3 = arith.constant 8 : i32 loc(#loc49)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc49)
+    %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc50)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc51)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<8x1xi32> loc(#loc52)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<8x1xi32> loc(#loc52)
+    %xmask = arith.constant true loc(#loc53)
+    %xmask_8 = arith.constant dense<true> : tensor<8x128xi1> loc(#loc53)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc54)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc55)
+    %x0 = arith.constant 32 : i32 loc(#loc56)
+    %x0_10 = arith.constant 32 : i32 loc(#loc56)
+    %x0_11 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc56)
+    %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<8x1xi32> loc(#loc56)
+    %x1 = arith.constant 32 : i32 loc(#loc57)
+    %x1_13 = arith.constant 32 : i32 loc(#loc57)
+    %x1_14 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc57)
+    %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<8x1xi32> loc(#loc57)
+    %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc58)
+    %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc58)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc14)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc14)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14)
+    %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc14)
+    %3 = ub.poison : i32 loc(#loc14)
+    %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_19 = %_tmp4_16) -> (tensor<8x128xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc60)
+      %r0_index_20 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc60)
+      %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc61)
+      %r0_mask_21 = arith.cmpi slt, %r0_index_20, %r0_mask : tensor<1x128xi32> loc(#loc61)
+      %tmp0 = arith.constant 128 : i32 loc(#loc62)
+      %tmp0_22 = arith.constant 128 : i32 loc(#loc62)
+      %tmp0_23 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc62)
+      %tmp0_24 = arith.muli %tmp0_23, %x0_12 : tensor<8x1xi32> loc(#loc62)
+      %tmp0_25 = tt.broadcast %r0_index_20 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc63)
+      %tmp0_26 = tt.broadcast %tmp0_24 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc63)
+      %tmp0_27 = arith.addi %tmp0_25, %tmp0_26 : tensor<8x128xi32> loc(#loc63)
+      %tmp0_28 = arith.constant 12288 : i32 loc(#loc64)
+      %tmp0_29 = arith.constant 12288 : i32 loc(#loc64)
+      %tmp0_30 = arith.constant dense<12288> : tensor<8x1xi32> loc(#loc64)
+      %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<8x1xi32> loc(#loc64)
+      %tmp0_32 = tt.broadcast %tmp0_31 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc65)
+      %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<8x128xi32> loc(#loc65)
+      %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc66)
+      %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc66)
+      %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc67)
+      %tmp0_37 = tt.broadcast %r0_mask_21 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc67)
+      %tmp0_38 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc67)
+      %tmp0_39 = arith.truncf %tmp0_38 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc67)
+      %tmp0_40 = tt.load %tmp0_35, %tmp0_37, %tmp0_39 evictionPolicy = evict_first : tensor<8x128x!tt.ptr<bf16>> loc(#loc67)
+      %tmp0_41 = arith.extf %tmp0_40 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc68)
+      %tmp2 = arith.mulf %tmp0_41, %tmp0_41 : tensor<8x128xf32> loc(#loc69)
+      %tmp5 = arith.addf %_tmp4_19, %tmp2 : tensor<8x128xf32> loc(#loc70)
+      %_tmp4_42 = tt.broadcast %r0_mask_21 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc71)
+      %_tmp4_43 = arith.select %_tmp4_42, %tmp5, %_tmp4_19 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc71)
+      scf.yield %_tmp4_43 : tensor<8x128xf32> loc(#loc27)
+    } loc(#loc59)
+    %tmp4 = tt.call @"triton.language.standard.sum__fp32S8_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc72)
+    %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc73)
+    %4 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<8x1x!tt.ptr<f32>> loc(#loc30)
+    %5 = tt.addptr %4, %xindex_7 : tensor<8x1x!tt.ptr<f32>>, tensor<8x1xi32> loc(#loc30)
+    tt.store %5, %tmp4_18 : tensor<8x1x!tt.ptr<f32>> loc(#loc31)
+    tt.return loc(#loc32)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.sum__fp32S8_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x128xf32> loc("input"(#loc33))) -> tensor<8xf32> attributes {noinline = false} {
+    %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc34)
+      tt.reduce.return %2 : f32 loc(#loc34)
+    }) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc34)
+    tt.return %0 : tensor<8xf32> loc(#loc36)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<8xf32> loc(#loc37)
+    tt.return %1 : tensor<8xf32> loc(#loc37)
+  } loc(#loc33)
+  tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc38)), %b: f32 loc("b"(#loc38))) -> f32 attributes {noinline = false} {
+    %0 = arith.addf %a, %b : f32 loc(#loc39)
+    tt.return %0 : f32 loc(#loc40)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : f32 loc(#loc41)
+    tt.return %1 : f32 loc(#loc41)
+  } loc(#loc38)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":25:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":30:43)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:8)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11)
+#loc37 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4)
+#loc39 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc40 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11)
+#loc41 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4)
+#loc46 = loc("xnumel"(#loc1))
+#loc47 = loc("r0_numel"(#loc2))
+#loc48 = loc("xoffset"(#loc3))
+#loc49 = loc("xoffset"(#loc4))
+#loc50 = loc("xindex"(#loc5))
+#loc51 = loc("xindex"(#loc6))
+#loc52 = loc("xindex"(#loc7))
+#loc53 = loc("xmask"(#loc8))
+#loc54 = loc("r0_base"(#loc9))
+#loc55 = loc("r0_base"(#loc10))
+#loc56 = loc("x0"(#loc11))
+#loc57 = loc("x1"(#loc12))
+#loc58 = loc("_tmp4"(#loc13))
+#loc59 = loc("_tmp4"(#loc14))
+#loc60 = loc("r0_index"(#loc15))
+#loc61 = loc("r0_mask"(#loc16))
+#loc62 = loc("tmp0"(#loc17))
+#loc63 = loc("tmp0"(#loc18))
+#loc64 = loc("tmp0"(#loc19))
+#loc65 = loc("tmp0"(#loc20))
+#loc66 = loc("tmp0"(#loc21))
+#loc67 = loc("tmp0"(#loc22))
+#loc68 = loc("tmp0"(#loc23))
+#loc69 = loc("tmp2"(#loc24))
+#loc70 = loc("tmp5"(#loc25))
+#loc71 = loc("_tmp4"(#loc26))
+#loc72 = loc("tmp4"(#loc28))
+#loc73 = loc("tmp4"(#loc29))
diff --git a/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.ttgir b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..1a04917e573a412ea65a59022bd735e3a8366458
--- /dev/null
+++ b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.ttgir
@@ -0,0 +1,108 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 8], order = [0, 1]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0)
+#loc1 = loc(unknown)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25)
+#loc27 = loc("in_ptr0"(#loc))
+#loc28 = loc("out_ptr0"(#loc))
+#loc29 = loc("xnumel"(#loc))
+#loc30 = loc("r0_numel"(#loc))
+#loc49 = loc("tmp4"(#loc21))
+#loc52 = loc(callsite(#loc1 at #loc49))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<128> : tensor<8x1xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<12288> : tensor<8x1xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<32> : tensor<8x1xi32, #blocked> loc(#loc1)
+    %c8_i32 = arith.constant 8 : i32 loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<8x128xbf16, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<8x128xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc31)
+    %xoffset_5 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc32)
+    %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc33)
+    %xindex_6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc33)
+    %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc33)
+    %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xi32, #blocked1> loc(#loc33)
+    %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<8x1xi32, #blocked> loc(#loc34)
+    %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<8x1xi32, #blocked1> loc(#loc34)
+    %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<8x1xi32, #blocked> loc(#loc34)
+    %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<8x1xi32, #blocked1> loc(#loc34)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc35)
+    %r0_base_13 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc35)
+    %x0 = arith.remsi %xindex_11, %cst_2 : tensor<8x1xi32, #blocked> loc(#loc36)
+    %x1 = arith.divsi %xindex_11, %cst_2 : tensor<8x1xi32, #blocked> loc(#loc37)
+    %r0_mask = arith.cmpi slt, %r0_base_13, %cst : tensor<1x128xi32, #blocked> loc(#loc38)
+    %tmp0 = arith.muli %x0, %cst_0 : tensor<8x1xi32, #blocked> loc(#loc39)
+    %tmp0_14 = tt.broadcast %r0_base_13 : tensor<1x128xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc40)
+    %tmp0_15 = tt.broadcast %tmp0 : tensor<8x1xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc40)
+    %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<8x128xi32, #blocked> loc(#loc40)
+    %tmp0_17 = arith.muli %x1, %cst_1 : tensor<8x1xi32, #blocked> loc(#loc41)
+    %tmp0_18 = tt.broadcast %tmp0_17 : tensor<8x1xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc42)
+    %tmp0_19 = arith.addi %tmp0_16, %tmp0_18 : tensor<8x128xi32, #blocked> loc(#loc42)
+    %tmp0_20 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>, #blocked> loc(#loc43)
+    %tmp0_21 = tt.addptr %tmp0_20, %tmp0_19 : tensor<8x128x!tt.ptr<bf16>, #blocked>, tensor<8x128xi32, #blocked> loc(#loc43)
+    %tmp0_22 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked> -> tensor<8x128xi1, #blocked> loc(#loc44)
+    %tmp0_23 = tt.load %tmp0_21, %tmp0_22, %cst_3 evictionPolicy = evict_first : tensor<8x128x!tt.ptr<bf16>, #blocked> loc(#loc44)
+    %tmp0_24 = arith.extf %tmp0_23 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc45)
+    %tmp2 = arith.mulf %tmp0_24, %tmp0_24 : tensor<8x128xf32, #blocked> loc(#loc46)
+    %tmp5 = arith.addf %tmp2, %cst_4 : tensor<8x128xf32, #blocked> loc(#loc47)
+    %_tmp4 = arith.select %tmp0_22, %tmp5, %cst_4 : tensor<8x128xi1, #blocked>, tensor<8x128xf32, #blocked> loc(#loc48)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_27: f32 loc(callsite(#loc1 at #loc49)), %tmp4_28: f32 loc(callsite(#loc1 at #loc49))):
+      %tmp4_29 = arith.addf %tmp4_27, %tmp4_28 : f32 loc(#loc53)
+      tt.reduce.return %tmp4_29 : f32 loc(#loc51)
+    }) : (tensor<8x128xf32, #blocked>) -> tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc51)
+    %tmp4_25 = ttg.convert_layout %tmp4 : tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc50)
+    %tmp4_26 = tt.expand_dims %tmp4_25 {axis = 1 : i32} : tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xf32, #blocked1> loc(#loc50)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<8x1x!tt.ptr<f32>, #blocked1> loc(#loc24)
+    %1 = tt.addptr %0, %xindex_12 : tensor<8x1x!tt.ptr<f32>, #blocked1>, tensor<8x1xi32, #blocked1> loc(#loc24)
+    tt.store %1, %tmp4_26 : tensor<8x1x!tt.ptr<f32>, #blocked1> loc(#loc25)
+    tt.return loc(#loc26)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4)
+#loc31 = loc("xoffset"(#loc2))
+#loc32 = loc("xoffset"(#loc3))
+#loc33 = loc("xindex"(#loc4))
+#loc34 = loc("xindex"(#loc5))
+#loc35 = loc("r0_base"(#loc6))
+#loc36 = loc("x0"(#loc7))
+#loc37 = loc("x1"(#loc8))
+#loc38 = loc("r0_mask"(#loc9))
+#loc39 = loc("tmp0"(#loc10))
+#loc40 = loc("tmp0"(#loc11))
+#loc41 = loc("tmp0"(#loc12))
+#loc42 = loc("tmp0"(#loc13))
+#loc43 = loc("tmp0"(#loc14))
+#loc44 = loc("tmp0"(#loc15))
+#loc45 = loc("tmp0"(#loc16))
+#loc46 = loc("tmp2"(#loc17))
+#loc47 = loc("tmp5"(#loc18))
+#loc48 = loc("_tmp4"(#loc19))
+#loc50 = loc("tmp4"(#loc23))
+#loc51 = loc(callsite(#loc20 at #loc49))
+#loc53 = loc(callsite(#loc22 at #loc51))
diff --git a/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.ttir b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..1adeb6b9b8cc2ffcd18ba5d1da5475760b71c0fd
--- /dev/null
+++ b/triton/CDFI4S2JQK6U4T4QGBDV3BGOLLOBVUWVCTCBGE45PJKLZXDGL2VQ/triton_red_fused__fused_rms_norm_view_1.ttir
@@ -0,0 +1,105 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0)
+#loc2 = loc(unknown)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25)
+#loc29 = loc("in_ptr0"(#loc))
+#loc30 = loc("out_ptr0"(#loc))
+#loc31 = loc("xnumel"(#loc))
+#loc32 = loc("r0_numel"(#loc))
+#loc53 = loc("tmp4"(#loc23))
+#loc56 = loc(callsite(#loc2 at #loc53))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %tmp0 = arith.constant dense<0.000000e+00> : tensor<8x128xbf16> loc(#loc33)
+    %cst = arith.constant dense<12288> : tensor<8x1xi32> loc(#loc2)
+    %cst_0 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc2)
+    %cst_1 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc2)
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc2)
+    %cst_3 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc2)
+    %c8_i32 = arith.constant 8 : i32 loc(#loc2)
+    %xoffset = tt.get_program_id x : i32 loc(#loc34)
+    %xoffset_4 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc35)
+    %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc36)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc37)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<8x1xi32> loc(#loc38)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<8x1xi32> loc(#loc38)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc39)
+    %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc40)
+    %x0 = arith.remsi %xindex_7, %cst_3 : tensor<8x1xi32> loc(#loc41)
+    %x1 = arith.divsi %xindex_7, %cst_3 : tensor<8x1xi32> loc(#loc42)
+    %r0_mask = arith.cmpi slt, %r0_base_8, %cst_1 : tensor<1x128xi32> loc(#loc43)
+    %tmp0_9 = arith.muli %x0, %cst_0 : tensor<8x1xi32> loc(#loc44)
+    %tmp0_10 = tt.broadcast %r0_base_8 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc45)
+    %tmp0_11 = tt.broadcast %tmp0_9 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc45)
+    %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<8x128xi32> loc(#loc45)
+    %tmp0_13 = arith.muli %x1, %cst : tensor<8x1xi32> loc(#loc46)
+    %tmp0_14 = tt.broadcast %tmp0_13 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc47)
+    %tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<8x128xi32> loc(#loc47)
+    %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc48)
+    %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc48)
+    %tmp0_18 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc33)
+    %tmp0_19 = tt.load %tmp0_17, %tmp0_18, %tmp0 evictionPolicy = evict_first : tensor<8x128x!tt.ptr<bf16>> loc(#loc33)
+    %tmp0_20 = arith.extf %tmp0_19 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc49)
+    %tmp2 = arith.mulf %tmp0_20, %tmp0_20 : tensor<8x128xf32> loc(#loc50)
+    %tmp5 = arith.addf %tmp2, %cst_2 : tensor<8x128xf32> loc(#loc51)
+    %_tmp4 = arith.select %tmp0_18, %tmp5, %cst_2 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc52)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_22: f32 loc(callsite(#loc2 at #loc53)), %tmp4_23: f32 loc(callsite(#loc2 at #loc53))):
+      %tmp4_24 = arith.addf %tmp4_22, %tmp4_23 : f32 loc(#loc57)
+      tt.reduce.return %tmp4_24 : f32 loc(#loc55)
+    }) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc55)
+    %tmp4_21 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc54)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<8x1x!tt.ptr<f32>> loc(#loc26)
+    %1 = tt.addptr %0, %xindex_7 : tensor<8x1x!tt.ptr<f32>>, tensor<8x1xi32> loc(#loc26)
+    tt.store %1, %tmp4_21 : tensor<8x1x!tt.ptr<f32>> loc(#loc27)
+    tt.return loc(#loc28)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:27)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4)
+#loc33 = loc("tmp0"(#loc1))
+#loc34 = loc("xoffset"(#loc3))
+#loc35 = loc("xoffset"(#loc4))
+#loc36 = loc("xindex"(#loc5))
+#loc37 = loc("xindex"(#loc6))
+#loc38 = loc("xindex"(#loc7))
+#loc39 = loc("r0_base"(#loc8))
+#loc40 = loc("r0_base"(#loc9))
+#loc41 = loc("x0"(#loc10))
+#loc42 = loc("x1"(#loc11))
+#loc43 = loc("r0_mask"(#loc12))
+#loc44 = loc("tmp0"(#loc13))
+#loc45 = loc("tmp0"(#loc14))
+#loc46 = loc("tmp0"(#loc15))
+#loc47 = loc("tmp0"(#loc16))
+#loc48 = loc("tmp0"(#loc17))
+#loc49 = loc("tmp0"(#loc18))
+#loc50 = loc("tmp2"(#loc19))
+#loc51 = loc("tmp5"(#loc20))
+#loc52 = loc("_tmp4"(#loc21))
+#loc54 = loc("tmp4"(#loc25))
+#loc55 = loc(callsite(#loc22 at #loc53))
+#loc57 = loc(callsite(#loc24 at #loc55))
diff --git a/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/__grp__triton_red_fused__fused_rms_norm_view_0.json b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/__grp__triton_red_fused__fused_rms_norm_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..3d88bd0450098b079e90ecc9b3951d5c6a5e4d5e
--- /dev/null
+++ b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/__grp__triton_red_fused__fused_rms_norm_view_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused__fused_rms_norm_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.source", "triton_red_fused__fused_rms_norm_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.ttir", "triton_red_fused__fused_rms_norm_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.ttgir", "triton_red_fused__fused_rms_norm_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.llir", "triton_red_fused__fused_rms_norm_view_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.ptx", "triton_red_fused__fused_rms_norm_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.cubin", "triton_red_fused__fused_rms_norm_view_0.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.json"}}
\ No newline at end of file
diff --git a/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.cubin b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..b723a7e5a84d80ffa8521c60de48e498cabb7cc8
Binary files /dev/null and b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.cubin differ
diff --git a/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.json b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..c2493755f701964bbb1485f4f3da23a47155b0d8
--- /dev/null
+++ b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.json
@@ -0,0 +1 @@
+{"hash": "148596f4fe2c8bab1adec0d1740cced2c5152fc51d191c7e9c9a8f38a0c27f30", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 64, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm_view_0"}
\ No newline at end of file
diff --git a/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.llir b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..3758792f084c13163ef06cef2e8c29759a22293a
--- /dev/null
+++ b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.llir
@@ -0,0 +1,167 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused__fused_rms_norm_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 {
+  %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %8 = shl i32 %7, 4, !dbg !8
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %10 = and i32 %9, 120, !dbg !9
+  %11 = lshr exact i32 %10, 3, !dbg !9
+  %12 = or disjoint i32 %11, %8, !dbg !10
+  %13 = shl nuw nsw i32 %9, 2, !dbg !11
+  %14 = and i32 %13, 28, !dbg !11
+  %15 = sdiv i32 %12, 32, !dbg !12
+  %16 = mul i32 %15, 32, !dbg !13
+  %.decomposed = sub i32 %12, %16, !dbg !13
+  %17 = shl nsw i32 %.decomposed, 7, !dbg !14
+  %18 = mul i32 %15, 12288, !dbg !15
+  %19 = or disjoint i32 %17, %14
+  %20 = add i32 %19, %18
+  %21 = sext i32 %20 to i64, !dbg !16
+  %22 = getelementptr bfloat, ptr addrspace(1) %0, i64 %21, !dbg !16
+  %23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !17
+  %24 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %22, i64 %23, i1 true) #4, !dbg !17
+  %25 = extractvalue { i32, i32 } %24, 0, !dbg !17
+  %26 = bitcast i32 %25 to <2 x bfloat>, !dbg !17
+  %27 = extractvalue { i32, i32 } %24, 1, !dbg !17
+  %28 = bitcast i32 %27 to <2 x bfloat>, !dbg !17
+  %29 = sext i32 %20 to i64, !dbg !16
+  %30 = getelementptr bfloat, ptr addrspace(1) %0, i64 %29, !dbg !16
+  %31 = getelementptr i8, ptr addrspace(1) %30, i64 64, !dbg !16
+  %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !17
+  %33 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %31, i64 %32, i1 true) #4, !dbg !17
+  %34 = extractvalue { i32, i32 } %33, 0, !dbg !17
+  %35 = bitcast i32 %34 to <2 x bfloat>, !dbg !17
+  %36 = extractvalue { i32, i32 } %33, 1, !dbg !17
+  %37 = bitcast i32 %36 to <2 x bfloat>, !dbg !17
+  %38 = sext i32 %20 to i64, !dbg !16
+  %39 = getelementptr bfloat, ptr addrspace(1) %0, i64 %38, !dbg !16
+  %40 = getelementptr i8, ptr addrspace(1) %39, i64 128, !dbg !16
+  %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !17
+  %42 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %40, i64 %41, i1 true) #4, !dbg !17
+  %43 = extractvalue { i32, i32 } %42, 0, !dbg !17
+  %44 = bitcast i32 %43 to <2 x bfloat>, !dbg !17
+  %45 = extractvalue { i32, i32 } %42, 1, !dbg !17
+  %46 = bitcast i32 %45 to <2 x bfloat>, !dbg !17
+  %47 = sext i32 %20 to i64, !dbg !16
+  %48 = getelementptr bfloat, ptr addrspace(1) %0, i64 %47, !dbg !16
+  %49 = getelementptr i8, ptr addrspace(1) %48, i64 192, !dbg !16
+  %50 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !17
+  %51 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %49, i64 %50, i1 true) #4, !dbg !17
+  %52 = extractvalue { i32, i32 } %51, 0, !dbg !17
+  %53 = bitcast i32 %52 to <2 x bfloat>, !dbg !17
+  %54 = extractvalue { i32, i32 } %51, 1, !dbg !17
+  %55 = bitcast i32 %54 to <2 x bfloat>, !dbg !17
+  %56 = fpext <2 x bfloat> %26 to <2 x float>, !dbg !18
+  %57 = fmul <2 x float> %56, %56, !dbg !19
+  %58 = fpext <2 x bfloat> %35 to <2 x float>, !dbg !18
+  %59 = fmul <2 x float> %58, %58, !dbg !19
+  %60 = fadd <2 x float> %57, %59, !dbg !20
+  %61 = fpext <2 x bfloat> %44 to <2 x float>, !dbg !18
+  %62 = fmul <2 x float> %61, %61, !dbg !19
+  %63 = fadd <2 x float> %60, %62, !dbg !20
+  %64 = fpext <2 x bfloat> %53 to <2 x float>, !dbg !18
+  %65 = fmul <2 x float> %64, %64, !dbg !19
+  %66 = fadd <2 x float> %63, %65, !dbg !20
+  %67 = fpext <2 x bfloat> %28 to <2 x float>, !dbg !18
+  %68 = fmul <2 x float> %67, %67, !dbg !19
+  %69 = fpext <2 x bfloat> %37 to <2 x float>, !dbg !18
+  %70 = fmul <2 x float> %69, %69, !dbg !19
+  %71 = fadd <2 x float> %68, %70, !dbg !20
+  %72 = fpext <2 x bfloat> %46 to <2 x float>, !dbg !18
+  %73 = fmul <2 x float> %72, %72, !dbg !19
+  %74 = fadd <2 x float> %71, %73, !dbg !20
+  %75 = fpext <2 x bfloat> %55 to <2 x float>, !dbg !18
+  %76 = fmul <2 x float> %75, %75, !dbg !19
+  %77 = fadd <2 x float> %74, %76, !dbg !20
+  %78 = and i32 %9, 15, !dbg !9
+  %79 = or disjoint i32 %8, %78, !dbg !10
+  %shift = shufflevector <2 x float> %66, <2 x float> poison, <2 x i32> <i32 1, i32 poison>, !dbg !21
+  %foldExtExtBinop = fadd <2 x float> %66, %shift, !dbg !21
+  %foldExtExtBinop5 = fadd <2 x float> %77, %foldExtExtBinop, !dbg !21
+  %shift7 = shufflevector <2 x float> %77, <2 x float> poison, <2 x i32> <i32 1, i32 poison>, !dbg !21
+  %foldExtExtBinop8 = fadd <2 x float> %shift7, %foldExtExtBinop5, !dbg !21
+  %80 = extractelement <2 x float> %foldExtExtBinop8, i64 0, !dbg !21
+  %81 = bitcast float %80 to i32, !dbg !24
+  %82 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %81, i32 4, i32 31), !dbg !24
+  %83 = bitcast i32 %82 to float, !dbg !24
+  %84 = fadd float %80, %83, !dbg !21
+  %85 = bitcast float %84 to i32, !dbg !24
+  %86 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %85, i32 2, i32 31), !dbg !24
+  %87 = bitcast i32 %86 to float, !dbg !24
+  %88 = fadd float %84, %87, !dbg !21
+  %89 = bitcast float %88 to i32, !dbg !24
+  %90 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %89, i32 1, i32 31), !dbg !24
+  %91 = bitcast i32 %90 to float, !dbg !24
+  %92 = fadd float %88, %91, !dbg !21
+  %93 = lshr exact i32 %10, 1, !dbg !27
+  %94 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %93, !dbg !27
+  store float %92, ptr addrspace(3) %94, align 4, !dbg !27
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !27
+  %95 = shl nuw nsw i32 %78, 2, !dbg !27
+  %96 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %95, !dbg !27
+  %97 = load i32, ptr addrspace(3) %96, align 4, !dbg !27
+  %98 = sext i32 %79 to i64, !dbg !28
+  %99 = getelementptr float, ptr addrspace(1) %1, i64 %98, !dbg !28
+  %100 = and i32 %9, 112, !dbg !29
+  %101 = icmp eq i32 %100, 0, !dbg !29
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %97, ptr addrspace(1) %99, i1 %101) #4, !dbg !29
+  ret void, !dbg !30
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
+
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { convergent nocallback nounwind }
+attributes #4 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm_view_0", linkageName: "triton_red_fused__fused_rms_norm_view_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 23, column: 28, scope: !4)
+!8 = !DILocation(line: 23, column: 33, scope: !4)
+!9 = !DILocation(line: 24, column: 44, scope: !4)
+!10 = !DILocation(line: 24, column: 23, scope: !4)
+!11 = !DILocation(line: 26, column: 37, scope: !4)
+!12 = !DILocation(line: 29, column: 19, scope: !4)
+!13 = !DILocation(line: 28, column: 19, scope: !4)
+!14 = !DILocation(line: 38, column: 45, scope: !4)
+!15 = !DILocation(line: 38, column: 56, scope: !4)
+!16 = !DILocation(line: 38, column: 34, scope: !4)
+!17 = !DILocation(line: 38, column: 61, scope: !4)
+!18 = !DILocation(line: 38, column: 115, scope: !4)
+!19 = !DILocation(line: 40, column: 22, scope: !4)
+!20 = !DILocation(line: 42, column: 23, scope: !4)
+!21 = !DILocation(line: 263, column: 15, scope: !22, inlinedAt: !24)
+!22 = distinct !DILexicalBlockFile(scope: !4, file: !23, discriminator: 0)
+!23 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!24 = !DILocation(line: 293, column: 36, scope: !22, inlinedAt: !25)
+!25 = !DILocation(line: 44, column: 25, scope: !26)
+!26 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0)
+!27 = !DILocation(line: 44, column: 28, scope: !4)
+!28 = !DILocation(line: 45, column: 25, scope: !4)
+!29 = !DILocation(line: 45, column: 36, scope: !4)
+!30 = !DILocation(line: 45, column: 4, scope: !4)
diff --git a/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.ptx b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..7df8aa7762d21a7b9423be449161b2124bc9de79
--- /dev/null
+++ b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.ptx
@@ -0,0 +1,575 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused__fused_rms_norm_view_0 // -- Begin function triton_red_fused__fused_rms_norm_view_0
+.extern .shared .align 16 .b8 global_smem[];
+                                        // @triton_red_fused__fused_rms_norm_view_0
+.visible .entry triton_red_fused__fused_rms_norm_view_0(
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_1,
+	.param .u32 triton_red_fused__fused_rms_norm_view_0_param_2,
+	.param .u32 triton_red_fused__fused_rms_norm_view_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_4,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_5
+)
+.reqntid 128
+{
+	.reg .pred 	%p<3>;
+	.reg .b16 	%rs<17>;
+	.reg .b32 	%r<77>;
+	.reg .b64 	%rd<12>;
+	.loc	1 18 0                          // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd10, [triton_red_fused__fused_rms_norm_view_0_param_0];
+	ld.param.b64 	%rd11, [triton_red_fused__fused_rms_norm_view_0_param_1];
+$L__tmp0:
+	.loc	1 23 28                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:23:28
+	mov.u32 	%r11, %ctaid.x;
+	.loc	1 23 33                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:23:33
+	shl.b32 	%r12, %r11, 4;
+	.loc	1 24 44                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:44
+	mov.u32 	%r13, %tid.x;
+	and.b32 	%r14, %r13, 120;
+	bfe.u32 	%r15, %r13, 3, 4;
+	.loc	1 24 23                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:23
+	or.b32 	%r16, %r15, %r12;
+	.loc	1 26 37                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:26:37
+	shl.b32 	%r17, %r13, 2;
+	and.b32 	%r18, %r17, 28;
+	.loc	1 29 19                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:29:19
+	bfe.s32 	%r19, %r11, 27, 1;
+	shr.u32 	%r20, %r19, 27;
+	add.s32 	%r21, %r16, %r20;
+	shr.u32 	%r22, %r21, 5;
+	.loc	1 28 19                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:28:19
+	and.b32 	%r23, %r21, 33554400;
+	sub.s32 	%r24, %r16, %r23;
+	.loc	1 38 45                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:45
+	shl.b32 	%r25, %r24, 7;
+	or.b32 	%r26, %r25, %r18;
+	mad.lo.s32 	%r27, %r22, 12288, %r26;
+	.loc	1 38 34                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:34
+	mad.wide.s32 	%rd1, %r27, 2, %rd10;
+	.loc	1 38 61                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:61
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r3, 0;
+	mov.pred 	%p1, -1;
+	// begin inline asm
+	mov.u32 %r1, %r3;
+	mov.u32 %r2, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	.loc	1 38 34                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:34
+	add.s64 	%rd3, %rd1, 64;
+	.loc	1 38 61                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:61
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd4, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r4, %r3;
+	mov.u32 %r5, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r4, %r5 }, [ %rd3 + 0 ], %rd4;
+	// end inline asm
+	.loc	1 38 34                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:34
+	add.s64 	%rd5, %rd1, 128;
+	.loc	1 38 61                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:61
+	// begin inline asm
+	mov.u64 %rd6, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd6, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r6, %r3;
+	mov.u32 %r7, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r6, %r7 }, [ %rd5 + 0 ], %rd6;
+	// end inline asm
+	.loc	1 38 34                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:34
+	add.s64 	%rd7, %rd1, 192;
+	.loc	1 38 61                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:61
+	// begin inline asm
+	mov.u64 %rd8, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd8, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r8, %r3;
+	mov.u32 %r9, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r8, %r9 }, [ %rd7 + 0 ], %rd8;
+	// end inline asm
+	.loc	1 38 115                        // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115
+	mov.b32 	{%rs1, %rs2}, %r1;
+	cvt.f32.bf16 	%r28, %rs1;
+	cvt.f32.bf16 	%r29, %rs2;
+	mov.b32 	{%rs3, %rs4}, %r4;
+	cvt.f32.bf16 	%r30, %rs4;
+	cvt.f32.bf16 	%r31, %rs3;
+	.loc	1 40 22                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:40:22
+	mul.f32 	%r32, %r31, %r31;
+	mul.f32 	%r33, %r30, %r30;
+	.loc	1 42 23                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:42:23
+	fma.rn.f32 	%r34, %r29, %r29, %r33;
+	fma.rn.f32 	%r35, %r28, %r28, %r32;
+	.loc	1 38 115                        // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115
+	mov.b32 	{%rs5, %rs6}, %r6;
+	cvt.f32.bf16 	%r36, %rs6;
+	cvt.f32.bf16 	%r37, %rs5;
+	.loc	1 42 23                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:42:23
+	fma.rn.f32 	%r38, %r37, %r37, %r35;
+	fma.rn.f32 	%r39, %r36, %r36, %r34;
+	.loc	1 38 115                        // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115
+	mov.b32 	{%rs7, %rs8}, %r8;
+	cvt.f32.bf16 	%r40, %rs7;
+	cvt.f32.bf16 	%r41, %rs8;
+	.loc	1 42 23                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:42:23
+	fma.rn.f32 	%r42, %r41, %r41, %r39;
+	fma.rn.f32 	%r43, %r40, %r40, %r38;
+	.loc	1 38 115                        // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115
+	mov.b32 	{%rs9, %rs10}, %r2;
+	cvt.f32.bf16 	%r44, %rs9;
+	cvt.f32.bf16 	%r45, %rs10;
+	mov.b32 	{%rs11, %rs12}, %r5;
+	cvt.f32.bf16 	%r46, %rs12;
+	cvt.f32.bf16 	%r47, %rs11;
+	.loc	1 40 22                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:40:22
+	mul.f32 	%r48, %r47, %r47;
+	mul.f32 	%r49, %r46, %r46;
+	.loc	1 42 23                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:42:23
+	fma.rn.f32 	%r50, %r45, %r45, %r49;
+	fma.rn.f32 	%r51, %r44, %r44, %r48;
+	.loc	1 38 115                        // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115
+	mov.b32 	{%rs13, %rs14}, %r7;
+	cvt.f32.bf16 	%r52, %rs14;
+	cvt.f32.bf16 	%r53, %rs13;
+	.loc	1 42 23                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:42:23
+	fma.rn.f32 	%r54, %r53, %r53, %r51;
+	fma.rn.f32 	%r55, %r52, %r52, %r50;
+	.loc	1 38 115                        // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115
+	mov.b32 	{%rs15, %rs16}, %r9;
+	cvt.f32.bf16 	%r56, %rs15;
+	cvt.f32.bf16 	%r57, %rs16;
+	.loc	1 42 23                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:42:23
+	fma.rn.f32 	%r58, %r57, %r57, %r55;
+	fma.rn.f32 	%r59, %r56, %r56, %r54;
+	.loc	1 24 44                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:44
+	and.b32 	%r60, %r13, 15;
+	.loc	1 24 23                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:23
+	or.b32 	%r61, %r12, %r60;
+$L__tmp1:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	add.f32 	%r62, %r43, %r42;
+	add.f32 	%r63, %r59, %r62;
+	add.f32 	%r64, %r58, %r63;
+$L__tmp2:
+	.loc	2 293 36                        // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ]
+	shfl.sync.bfly.b32 	%r65, %r64, 4, 31, -1;
+$L__tmp3:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	add.f32 	%r66, %r64, %r65;
+$L__tmp4:
+	.loc	2 293 36                        // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ]
+	shfl.sync.bfly.b32 	%r67, %r66, 2, 31, -1;
+$L__tmp5:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	add.f32 	%r68, %r66, %r67;
+$L__tmp6:
+	.loc	2 293 36                        // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ]
+	shfl.sync.bfly.b32 	%r69, %r68, 1, 31, -1;
+$L__tmp7:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	add.f32 	%r70, %r68, %r69;
+$L__tmp8:
+	.loc	1 44 28                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:28
+	shr.u32 	%r71, %r14, 1;
+	mov.b32 	%r72, global_smem;
+	add.s32 	%r73, %r72, %r71;
+	st.shared.b32 	[%r73], %r70;
+	bar.sync 	0;
+	shl.b32 	%r74, %r60, 2;
+	add.s32 	%r75, %r72, %r74;
+	ld.shared.b32 	%r10, [%r75];
+	.loc	1 45 25                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:25
+	mad.wide.s32 	%rd9, %r61, 4, %rd11;
+	.loc	1 45 36                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:36
+	and.b32 	%r76, %r13, 112;
+	setp.eq.b32 	%p2, %r76, 0;
+	// begin inline asm
+	@%p2 st.global.b32 [ %rd9 + 0 ], { %r10 };
+	// end inline asm
+	.loc	1 45 4                          // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:4
+	ret;
+$L__tmp9:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 5                                   // DW_FORM_data2
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 339                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x14c DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 119
+.b8 118
+.b8 121
+.b8 116
+.b8 52
+.b8 50
+.b8 55
+.b8 51
+.b8 105
+.b8 117
+.b8 51
+.b8 51
+.b8 109
+.b8 112
+.b8 101
+.b8 101
+.b8 55
+.b8 104
+.b8 98
+.b8 101
+.b8 116
+.b8 53
+.b8 106
+.b8 53
+.b8 101
+.b8 113
+.b8 52
+.b8 52
+.b8 100
+.b8 54
+.b8 102
+.b8 115
+.b8 104
+.b8 103
+.b8 119
+.b8 107
+.b8 121
+.b8 120
+.b8 107
+.b8 110
+.b8 53
+.b8 50
+.b8 103
+.b8 103
+.b8 103
+.b8 107
+.b8 105
+.b8 113
+.b8 104
+.b8 106
+.b8 53
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 119
+.b8 118
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x2a DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 114
+.b8 109
+.b8 115
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 118
+.b8 105
+.b8 101
+.b8 119
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x10e:0x48 DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x123:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp8                           // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 44                                  // DW_AT_call_line
+.b8 25                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x13b:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp8                           // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.source b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..7d4df7810576bc882ca137edfa01aaa181302c79
--- /dev/null
+++ b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.source
@@ -0,0 +1,167 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0)
+#loc35 = loc(unknown)
+#loc38 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0)
+#loc42 = loc("in_ptr0"(#loc))
+#loc43 = loc("out_ptr0"(#loc))
+#loc44 = loc("xnumel"(#loc))
+#loc45 = loc("r0_numel"(#loc))
+#loc74 = loc("input"(#loc33))
+#loc75 = loc("a"(#loc38))
+#loc76 = loc("b"(#loc38))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 8192 : i32 loc(#loc46)
+    %r0_numel_1 = arith.constant 128 : i32 loc(#loc47)
+    %xoffset = tt.get_program_id x : i32 loc(#loc48)
+    %xoffset_2 = arith.constant 16 : i32 loc(#loc49)
+    %xoffset_3 = arith.constant 16 : i32 loc(#loc49)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc49)
+    %xindex = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc50)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<16xi32> -> tensor<16x1xi32> loc(#loc51)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<16x1xi32> loc(#loc52)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<16x1xi32> loc(#loc52)
+    %xmask = arith.constant true loc(#loc53)
+    %xmask_8 = arith.constant dense<true> : tensor<16x32xi1> loc(#loc53)
+    %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc54)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc55)
+    %x0 = arith.constant 32 : i32 loc(#loc56)
+    %x0_10 = arith.constant 32 : i32 loc(#loc56)
+    %x0_11 = arith.constant dense<32> : tensor<16x1xi32> loc(#loc56)
+    %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<16x1xi32> loc(#loc56)
+    %x1 = arith.constant 32 : i32 loc(#loc57)
+    %x1_13 = arith.constant 32 : i32 loc(#loc57)
+    %x1_14 = arith.constant dense<32> : tensor<16x1xi32> loc(#loc57)
+    %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<16x1xi32> loc(#loc57)
+    %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc58)
+    %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<16x32xf32> loc(#loc58)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc14)
+    %c32_i32 = arith.constant 32 : i32 loc(#loc14)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14)
+    %2 = arith.bitcast %c32_i32 : i32 to i32 loc(#loc14)
+    %3 = ub.poison : i32 loc(#loc14)
+    %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_19 = %_tmp4_16) -> (tensor<16x32xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc60)
+      %r0_index_20 = arith.addi %r0_index, %r0_base_9 : tensor<1x32xi32> loc(#loc60)
+      %r0_mask = arith.constant dense<128> : tensor<1x32xi32> loc(#loc61)
+      %r0_mask_21 = arith.cmpi slt, %r0_index_20, %r0_mask : tensor<1x32xi32> loc(#loc61)
+      %tmp0 = arith.constant 128 : i32 loc(#loc62)
+      %tmp0_22 = arith.constant 128 : i32 loc(#loc62)
+      %tmp0_23 = arith.constant dense<128> : tensor<16x1xi32> loc(#loc62)
+      %tmp0_24 = arith.muli %tmp0_23, %x0_12 : tensor<16x1xi32> loc(#loc62)
+      %tmp0_25 = tt.broadcast %r0_index_20 : tensor<1x32xi32> -> tensor<16x32xi32> loc(#loc63)
+      %tmp0_26 = tt.broadcast %tmp0_24 : tensor<16x1xi32> -> tensor<16x32xi32> loc(#loc63)
+      %tmp0_27 = arith.addi %tmp0_25, %tmp0_26 : tensor<16x32xi32> loc(#loc63)
+      %tmp0_28 = arith.constant 12288 : i32 loc(#loc64)
+      %tmp0_29 = arith.constant 12288 : i32 loc(#loc64)
+      %tmp0_30 = arith.constant dense<12288> : tensor<16x1xi32> loc(#loc64)
+      %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<16x1xi32> loc(#loc64)
+      %tmp0_32 = tt.broadcast %tmp0_31 : tensor<16x1xi32> -> tensor<16x32xi32> loc(#loc65)
+      %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<16x32xi32> loc(#loc65)
+      %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<16x32x!tt.ptr<bf16>> loc(#loc66)
+      %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<16x32x!tt.ptr<bf16>>, tensor<16x32xi32> loc(#loc66)
+      %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc67)
+      %tmp0_37 = tt.broadcast %r0_mask_21 : tensor<1x32xi1> -> tensor<16x32xi1> loc(#loc67)
+      %tmp0_38 = arith.constant dense<0.000000e+00> : tensor<16x32xf32> loc(#loc67)
+      %tmp0_39 = arith.truncf %tmp0_38 : tensor<16x32xf32> to tensor<16x32xbf16> loc(#loc67)
+      %tmp0_40 = tt.load %tmp0_35, %tmp0_37, %tmp0_39 evictionPolicy = evict_first : tensor<16x32x!tt.ptr<bf16>> loc(#loc67)
+      %tmp0_41 = arith.extf %tmp0_40 : tensor<16x32xbf16> to tensor<16x32xf32> loc(#loc68)
+      %tmp2 = arith.mulf %tmp0_41, %tmp0_41 : tensor<16x32xf32> loc(#loc69)
+      %tmp5 = arith.addf %_tmp4_19, %tmp2 : tensor<16x32xf32> loc(#loc70)
+      %_tmp4_42 = tt.broadcast %r0_mask_21 : tensor<1x32xi1> -> tensor<16x32xi1> loc(#loc71)
+      %_tmp4_43 = arith.select %_tmp4_42, %tmp5, %_tmp4_19 : tensor<16x32xi1>, tensor<16x32xf32> loc(#loc71)
+      scf.yield %_tmp4_43 : tensor<16x32xf32> loc(#loc27)
+    } loc(#loc59)
+    %tmp4 = tt.call @"triton.language.standard.sum__fp32S16_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<16x32xf32>) -> tensor<16xf32> loc(#loc72)
+    %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<16xf32> -> tensor<16x1xf32> loc(#loc73)
+    %4 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<16x1x!tt.ptr<f32>> loc(#loc30)
+    %5 = tt.addptr %4, %xindex_7 : tensor<16x1x!tt.ptr<f32>>, tensor<16x1xi32> loc(#loc30)
+    tt.store %5, %tmp4_18 : tensor<16x1x!tt.ptr<f32>> loc(#loc31)
+    tt.return loc(#loc32)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.sum__fp32S16_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<16x32xf32> loc("input"(#loc33))) -> tensor<16xf32> attributes {noinline = false} {
+    %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc34)
+      tt.reduce.return %2 : f32 loc(#loc34)
+    }) : (tensor<16x32xf32>) -> tensor<16xf32> loc(#loc34)
+    tt.return %0 : tensor<16xf32> loc(#loc36)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<16xf32> loc(#loc37)
+    tt.return %1 : tensor<16xf32> loc(#loc37)
+  } loc(#loc33)
+  tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc38)), %b: f32 loc("b"(#loc38))) -> f32 attributes {noinline = false} {
+    %0 = arith.addf %a, %b : f32 loc(#loc39)
+    tt.return %0 : f32 loc(#loc40)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : f32 loc(#loc41)
+    tt.return %1 : f32 loc(#loc41)
+  } loc(#loc38)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":25:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":30:43)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:8)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11)
+#loc37 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4)
+#loc39 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc40 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11)
+#loc41 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4)
+#loc46 = loc("xnumel"(#loc1))
+#loc47 = loc("r0_numel"(#loc2))
+#loc48 = loc("xoffset"(#loc3))
+#loc49 = loc("xoffset"(#loc4))
+#loc50 = loc("xindex"(#loc5))
+#loc51 = loc("xindex"(#loc6))
+#loc52 = loc("xindex"(#loc7))
+#loc53 = loc("xmask"(#loc8))
+#loc54 = loc("r0_base"(#loc9))
+#loc55 = loc("r0_base"(#loc10))
+#loc56 = loc("x0"(#loc11))
+#loc57 = loc("x1"(#loc12))
+#loc58 = loc("_tmp4"(#loc13))
+#loc59 = loc("_tmp4"(#loc14))
+#loc60 = loc("r0_index"(#loc15))
+#loc61 = loc("r0_mask"(#loc16))
+#loc62 = loc("tmp0"(#loc17))
+#loc63 = loc("tmp0"(#loc18))
+#loc64 = loc("tmp0"(#loc19))
+#loc65 = loc("tmp0"(#loc20))
+#loc66 = loc("tmp0"(#loc21))
+#loc67 = loc("tmp0"(#loc22))
+#loc68 = loc("tmp0"(#loc23))
+#loc69 = loc("tmp2"(#loc24))
+#loc70 = loc("tmp5"(#loc25))
+#loc71 = loc("_tmp4"(#loc26))
+#loc72 = loc("tmp4"(#loc28))
+#loc73 = loc("tmp4"(#loc29))
diff --git a/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.ttgir b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..c9700d21ed54bc8b623b0cc4efb423bce24f12fc
--- /dev/null
+++ b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.ttgir
@@ -0,0 +1,121 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0)
+#loc1 = loc(unknown)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25)
+#loc30 = loc("in_ptr0"(#loc))
+#loc31 = loc("out_ptr0"(#loc))
+#loc32 = loc("xnumel"(#loc))
+#loc33 = loc("r0_numel"(#loc))
+#loc54 = loc("tmp4"(#loc24))
+#loc57 = loc(callsite(#loc1 at #loc54))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<128> : tensor<1x32xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<128> : tensor<16x1xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<12288> : tensor<16x1xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<32> : tensor<16x1xi32, #blocked> loc(#loc1)
+    %c16_i32 = arith.constant 16 : i32 loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<16x32xbf16, #blocked> loc(#loc1)
+    %c32_i32 = arith.constant 32 : i32 loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<16x32xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc34)
+    %xoffset_5 = arith.muli %xoffset, %c16_i32 : i32 loc(#loc35)
+    %xindex = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc36)
+    %xindex_6 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc36)
+    %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi32, #blocked> loc(#loc36)
+    %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<16x1xi32, #blocked1> loc(#loc36)
+    %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<16x1xi32, #blocked> loc(#loc37)
+    %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<16x1xi32, #blocked1> loc(#loc37)
+    %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<16x1xi32, #blocked> loc(#loc37)
+    %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<16x1xi32, #blocked1> loc(#loc37)
+    %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc38)
+    %r0_base_13 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> loc(#loc38)
+    %x0 = arith.remsi %xindex_11, %cst_2 : tensor<16x1xi32, #blocked> loc(#loc39)
+    %x1 = arith.divsi %xindex_11, %cst_2 : tensor<16x1xi32, #blocked> loc(#loc40)
+    %tmp0 = arith.muli %x0, %cst_0 : tensor<16x1xi32, #blocked> loc(#loc41)
+    %tmp0_14 = tt.broadcast %tmp0 : tensor<16x1xi32, #blocked> -> tensor<16x32xi32, #blocked> loc(#loc42)
+    %tmp0_15 = arith.muli %x1, %cst_1 : tensor<16x1xi32, #blocked> loc(#loc43)
+    %tmp0_16 = tt.broadcast %tmp0_15 : tensor<16x1xi32, #blocked> -> tensor<16x32xi32, #blocked> loc(#loc44)
+    %tmp0_17 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<16x32x!tt.ptr<bf16>, #blocked> loc(#loc45)
+    %_tmp4 = scf.for %_tmp4_20 = %c0_i32 to %c128_i32 step %c32_i32 iter_args(%arg5 = %cst_4) -> (tensor<16x32xf32, #blocked>)  : i32 {
+      %r0_index = tt.splat %_tmp4_20 : i32 -> tensor<1x32xi32, #blocked> loc(#loc47)
+      %r0_index_21 = arith.addi %r0_index, %r0_base_13 : tensor<1x32xi32, #blocked> loc(#loc47)
+      %r0_mask = arith.cmpi slt, %r0_index_21, %cst : tensor<1x32xi32, #blocked> loc(#loc48)
+      %tmp0_22 = tt.broadcast %r0_index_21 : tensor<1x32xi32, #blocked> -> tensor<16x32xi32, #blocked> loc(#loc42)
+      %tmp0_23 = arith.addi %tmp0_22, %tmp0_14 : tensor<16x32xi32, #blocked> loc(#loc42)
+      %tmp0_24 = arith.addi %tmp0_23, %tmp0_16 : tensor<16x32xi32, #blocked> loc(#loc44)
+      %tmp0_25 = tt.addptr %tmp0_17, %tmp0_24 : tensor<16x32x!tt.ptr<bf16>, #blocked>, tensor<16x32xi32, #blocked> loc(#loc45)
+      %tmp0_26 = tt.broadcast %r0_mask : tensor<1x32xi1, #blocked> -> tensor<16x32xi1, #blocked> loc(#loc49)
+      %tmp0_27 = tt.load %tmp0_25, %tmp0_26, %cst_3 evictionPolicy = evict_first : tensor<16x32x!tt.ptr<bf16>, #blocked> loc(#loc49)
+      %tmp0_28 = arith.extf %tmp0_27 : tensor<16x32xbf16, #blocked> to tensor<16x32xf32, #blocked> loc(#loc50)
+      %tmp2 = arith.mulf %tmp0_28, %tmp0_28 : tensor<16x32xf32, #blocked> loc(#loc51)
+      %tmp5 = arith.addf %arg5, %tmp2 : tensor<16x32xf32, #blocked> loc(#loc52)
+      %_tmp4_29 = arith.select %tmp0_26, %tmp5, %arg5 : tensor<16x32xi1, #blocked>, tensor<16x32xf32, #blocked> loc(#loc53)
+      scf.yield %_tmp4_29 : tensor<16x32xf32, #blocked> loc(#loc22)
+    } loc(#loc46)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_20: f32 loc(callsite(#loc1 at #loc54)), %tmp4_21: f32 loc(callsite(#loc1 at #loc54))):
+      %tmp4_22 = arith.addf %tmp4_20, %tmp4_21 : f32 loc(#loc58)
+      tt.reduce.return %tmp4_22 : f32 loc(#loc56)
+    }) : (tensor<16x32xf32, #blocked>) -> tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc56)
+    %tmp4_18 = ttg.convert_layout %tmp4 : tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc55)
+    %tmp4_19 = tt.expand_dims %tmp4_18 {axis = 1 : i32} : tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<16x1xf32, #blocked1> loc(#loc55)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<16x1x!tt.ptr<f32>, #blocked1> loc(#loc27)
+    %1 = tt.addptr %0, %xindex_12 : tensor<16x1x!tt.ptr<f32>, #blocked1>, tensor<16x1xi32, #blocked1> loc(#loc27)
+    tt.store %1, %tmp4_19 : tensor<16x1x!tt.ptr<f32>, #blocked1> loc(#loc28)
+    tt.return loc(#loc29)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:8)
+#loc23 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4)
+#loc34 = loc("xoffset"(#loc2))
+#loc35 = loc("xoffset"(#loc3))
+#loc36 = loc("xindex"(#loc4))
+#loc37 = loc("xindex"(#loc5))
+#loc38 = loc("r0_base"(#loc6))
+#loc39 = loc("x0"(#loc7))
+#loc40 = loc("x1"(#loc8))
+#loc41 = loc("tmp0"(#loc9))
+#loc42 = loc("tmp0"(#loc10))
+#loc43 = loc("tmp0"(#loc11))
+#loc44 = loc("tmp0"(#loc12))
+#loc45 = loc("tmp0"(#loc13))
+#loc46 = loc("_tmp4"(#loc14))
+#loc47 = loc("r0_index"(#loc15))
+#loc48 = loc("r0_mask"(#loc16))
+#loc49 = loc("tmp0"(#loc17))
+#loc50 = loc("tmp0"(#loc18))
+#loc51 = loc("tmp2"(#loc19))
+#loc52 = loc("tmp5"(#loc20))
+#loc53 = loc("_tmp4"(#loc21))
+#loc55 = loc("tmp4"(#loc26))
+#loc56 = loc(callsite(#loc23 at #loc54))
+#loc58 = loc(callsite(#loc25 at #loc56))
diff --git a/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.ttir b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..5c4affefef87b57008d4da1392fc66ccbe413873
--- /dev/null
+++ b/triton/CSCZN5H6FSF2WGW6YDIXIDGO2LCRKL6FDUMRY7U4TKHTRIGCP4YA/triton_red_fused__fused_rms_norm_view_0.ttir
@@ -0,0 +1,118 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0)
+#loc1 = loc(unknown)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25)
+#loc32 = loc("in_ptr0"(#loc))
+#loc33 = loc("out_ptr0"(#loc))
+#loc34 = loc("xnumel"(#loc))
+#loc35 = loc("r0_numel"(#loc))
+#loc58 = loc("tmp4"(#loc26))
+#loc61 = loc(callsite(#loc1 at #loc58))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<16x32xbf16> loc(#loc1)
+    %c32_i32 = arith.constant 32 : i32 loc(#loc2)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc2)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc2)
+    %cst_0 = arith.constant dense<12288> : tensor<16x1xi32> loc(#loc1)
+    %cst_1 = arith.constant dense<128> : tensor<16x1xi32> loc(#loc1)
+    %cst_2 = arith.constant dense<128> : tensor<1x32xi32> loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<16x32xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<32> : tensor<16x1xi32> loc(#loc1)
+    %c16_i32 = arith.constant 16 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc36)
+    %xoffset_5 = arith.muli %xoffset, %c16_i32 : i32 loc(#loc37)
+    %xindex = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc38)
+    %xindex_6 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<16xi32> -> tensor<16x1xi32> loc(#loc39)
+    %xindex_7 = tt.splat %xoffset_5 : i32 -> tensor<16x1xi32> loc(#loc40)
+    %xindex_8 = arith.addi %xindex_7, %xindex_6 : tensor<16x1xi32> loc(#loc40)
+    %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc41)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc42)
+    %x0 = arith.remsi %xindex_8, %cst_4 : tensor<16x1xi32> loc(#loc43)
+    %x1 = arith.divsi %xindex_8, %cst_4 : tensor<16x1xi32> loc(#loc44)
+    %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c32_i32 iter_args(%_tmp4_11 = %cst_3) -> (tensor<16x32xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc46)
+      %r0_index_12 = arith.addi %r0_index, %r0_base_9 : tensor<1x32xi32> loc(#loc46)
+      %r0_mask = arith.cmpi slt, %r0_index_12, %cst_2 : tensor<1x32xi32> loc(#loc47)
+      %tmp0 = arith.muli %x0, %cst_1 : tensor<16x1xi32> loc(#loc48)
+      %tmp0_13 = tt.broadcast %r0_index_12 : tensor<1x32xi32> -> tensor<16x32xi32> loc(#loc49)
+      %tmp0_14 = tt.broadcast %tmp0 : tensor<16x1xi32> -> tensor<16x32xi32> loc(#loc49)
+      %tmp0_15 = arith.addi %tmp0_13, %tmp0_14 : tensor<16x32xi32> loc(#loc49)
+      %tmp0_16 = arith.muli %x1, %cst_0 : tensor<16x1xi32> loc(#loc50)
+      %tmp0_17 = tt.broadcast %tmp0_16 : tensor<16x1xi32> -> tensor<16x32xi32> loc(#loc51)
+      %tmp0_18 = arith.addi %tmp0_15, %tmp0_17 : tensor<16x32xi32> loc(#loc51)
+      %tmp0_19 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<16x32x!tt.ptr<bf16>> loc(#loc52)
+      %tmp0_20 = tt.addptr %tmp0_19, %tmp0_18 : tensor<16x32x!tt.ptr<bf16>>, tensor<16x32xi32> loc(#loc52)
+      %tmp0_21 = tt.broadcast %r0_mask : tensor<1x32xi1> -> tensor<16x32xi1> loc(#loc53)
+      %tmp0_22 = tt.load %tmp0_20, %tmp0_21, %cst evictionPolicy = evict_first : tensor<16x32x!tt.ptr<bf16>> loc(#loc53)
+      %tmp0_23 = arith.extf %tmp0_22 : tensor<16x32xbf16> to tensor<16x32xf32> loc(#loc54)
+      %tmp2 = arith.mulf %tmp0_23, %tmp0_23 : tensor<16x32xf32> loc(#loc55)
+      %tmp5 = arith.addf %_tmp4_11, %tmp2 : tensor<16x32xf32> loc(#loc56)
+      %_tmp4_24 = arith.select %tmp0_21, %tmp5, %_tmp4_11 : tensor<16x32xi1>, tensor<16x32xf32> loc(#loc57)
+      scf.yield %_tmp4_24 : tensor<16x32xf32> loc(#loc24)
+    } loc(#loc45)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_11: f32 loc(callsite(#loc1 at #loc58)), %tmp4_12: f32 loc(callsite(#loc1 at #loc58))):
+      %tmp4_13 = arith.addf %tmp4_11, %tmp4_12 : f32 loc(#loc62)
+      tt.reduce.return %tmp4_13 : f32 loc(#loc60)
+    }) : (tensor<16x32xf32>) -> tensor<16xf32> loc(#loc60)
+    %tmp4_10 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<16xf32> -> tensor<16x1xf32> loc(#loc59)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<16x1x!tt.ptr<f32>> loc(#loc29)
+    %1 = tt.addptr %0, %xindex_8 : tensor<16x1x!tt.ptr<f32>>, tensor<16x1xi32> loc(#loc29)
+    tt.store %1, %tmp4_10 : tensor<16x1x!tt.ptr<f32>> loc(#loc30)
+    tt.return loc(#loc31)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":32:43)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:27)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":33:31)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:8)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc27 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4)
+#loc36 = loc("xoffset"(#loc3))
+#loc37 = loc("xoffset"(#loc4))
+#loc38 = loc("xindex"(#loc5))
+#loc39 = loc("xindex"(#loc6))
+#loc40 = loc("xindex"(#loc7))
+#loc41 = loc("r0_base"(#loc8))
+#loc42 = loc("r0_base"(#loc9))
+#loc43 = loc("x0"(#loc10))
+#loc44 = loc("x1"(#loc11))
+#loc45 = loc("_tmp4"(#loc2))
+#loc46 = loc("r0_index"(#loc12))
+#loc47 = loc("r0_mask"(#loc13))
+#loc48 = loc("tmp0"(#loc14))
+#loc49 = loc("tmp0"(#loc15))
+#loc50 = loc("tmp0"(#loc16))
+#loc51 = loc("tmp0"(#loc17))
+#loc52 = loc("tmp0"(#loc18))
+#loc53 = loc("tmp0"(#loc19))
+#loc54 = loc("tmp0"(#loc20))
+#loc55 = loc("tmp2"(#loc21))
+#loc56 = loc("tmp5"(#loc22))
+#loc57 = loc("_tmp4"(#loc23))
+#loc59 = loc("tmp4"(#loc28))
+#loc60 = loc(callsite(#loc25 at #loc58))
+#loc62 = loc(callsite(#loc27 at #loc60))
diff --git a/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/__grp__triton_red_fused_add_mul_native_layer_norm_1.json b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/__grp__triton_red_fused_add_mul_native_layer_norm_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..91a69b37b4ccd50c04e860f4ec28f38680470589
--- /dev/null
+++ b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/__grp__triton_red_fused_add_mul_native_layer_norm_1.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused_add_mul_native_layer_norm_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.source", "triton_red_fused_add_mul_native_layer_norm_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.ttir", "triton_red_fused_add_mul_native_layer_norm_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.ttgir", "triton_red_fused_add_mul_native_layer_norm_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.llir", "triton_red_fused_add_mul_native_layer_norm_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.ptx", "triton_red_fused_add_mul_native_layer_norm_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.cubin", "triton_red_fused_add_mul_native_layer_norm_1.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.json"}}
\ No newline at end of file
diff --git a/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.cubin b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..a0ca56e1a5a8f43f1f466021f9afeeaec0bdd343
Binary files /dev/null and b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.cubin differ
diff --git a/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.json b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..b70a39a781f007bdcca5603ec4f9a88fc00926fe
--- /dev/null
+++ b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.json
@@ -0,0 +1 @@
+{"hash": "1614d3038e63e28f92bf3f4ace7c6aa7ea5cf21d6e2067b6deca2b80ac367d58", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 192, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_add_mul_native_layer_norm_1"}
\ No newline at end of file
diff --git a/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.llir b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.llir
new file mode 100644
index 0000000000000000000000000000000000000000..3739a86fa49e0d35f289cbdc95b65bf354b3384e
--- /dev/null
+++ b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.llir
@@ -0,0 +1,547 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external addrspace(3) global [0 x i8], align 16
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused_add_mul_native_layer_norm_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !5 {
+__nv_rsqrtf.exit:
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8
+  %9 = icmp samesign ult i32 %8, 256, !dbg !9
+  %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %11 = and i32 %10, 511, !dbg !10
+  %12 = and i32 %10, 31, !dbg !10
+  %13 = lshr i32 %11, 5, !dbg !10
+  %14 = shl nuw nsw i32 %10, 3, !dbg !10
+  %15 = and i32 %14, 4088, !dbg !10
+  %16 = shl i32 %8, 12, !dbg !11
+  %17 = or disjoint i32 %15, %16, !dbg !12
+  %18 = sext i32 %17 to i64, !dbg !13
+  %19 = getelementptr bfloat, ptr addrspace(1) %0, i64 %18, !dbg !13
+  %20 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14
+  %21 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %19, i64 %20, i1 %9) #6, !dbg !14
+  %22 = extractvalue { i32, i32, i32, i32 } %21, 0, !dbg !14
+  %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !14
+  %24 = extractvalue { i32, i32, i32, i32 } %21, 1, !dbg !14
+  %25 = bitcast i32 %24 to <2 x bfloat>, !dbg !14
+  %26 = extractvalue { i32, i32, i32, i32 } %21, 2, !dbg !14
+  %27 = bitcast i32 %26 to <2 x bfloat>, !dbg !14
+  %28 = extractvalue { i32, i32, i32, i32 } %21, 3, !dbg !14
+  %29 = bitcast i32 %28 to <2 x bfloat>, !dbg !14
+  %30 = extractelement <2 x bfloat> %23, i64 0, !dbg !14
+  %31 = extractelement <2 x bfloat> %23, i64 1, !dbg !14
+  %32 = extractelement <2 x bfloat> %25, i64 0, !dbg !14
+  %33 = extractelement <2 x bfloat> %25, i64 1, !dbg !14
+  %34 = extractelement <2 x bfloat> %27, i64 0, !dbg !14
+  %35 = extractelement <2 x bfloat> %27, i64 1, !dbg !14
+  %36 = extractelement <2 x bfloat> %29, i64 0, !dbg !14
+  %37 = extractelement <2 x bfloat> %29, i64 1, !dbg !14
+  %38 = fpext bfloat %30 to float, !dbg !15
+  %39 = fpext bfloat %31 to float, !dbg !15
+  %40 = fpext bfloat %32 to float, !dbg !15
+  %41 = fpext bfloat %33 to float, !dbg !15
+  %42 = fpext bfloat %34 to float, !dbg !15
+  %43 = fpext bfloat %35 to float, !dbg !15
+  %44 = fpext bfloat %36 to float, !dbg !15
+  %45 = fpext bfloat %37 to float, !dbg !15
+  %46 = select i1 %9, float %38, float 0.000000e+00, !dbg !16
+  %47 = select i1 %9, float %39, float 0.000000e+00, !dbg !16
+  %48 = select i1 %9, float %40, float 0.000000e+00, !dbg !16
+  %49 = select i1 %9, float %41, float 0.000000e+00, !dbg !16
+  %50 = select i1 %9, float %42, float 0.000000e+00, !dbg !16
+  %51 = select i1 %9, float %43, float 0.000000e+00, !dbg !16
+  %52 = select i1 %9, float %44, float 0.000000e+00, !dbg !16
+  %53 = select i1 %9, float %45, float 0.000000e+00, !dbg !16
+  %54 = select i1 %9, float 1.000000e+00, float 0.000000e+00, !dbg !17
+  %55 = fsub float %47, %46, !dbg !18
+  %56 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !24
+  %57 = fcmp oeq float %56, 0.000000e+00, !dbg !25
+  %58 = tail call float @llvm.nvvm.div.full(float %54, float %56), !dbg !26
+  %59 = select i1 %57, float 0.000000e+00, float %58, !dbg !27
+  %60 = fmul float %59, %55, !dbg !28
+  %61 = fadd float %46, %60, !dbg !29
+  %62 = fmul float %55, %55, !dbg !30
+  %63 = fmul float %54, %62, !dbg !31
+  %64 = fmul float %59, %63, !dbg !32
+  %65 = fadd float %64, 0.000000e+00, !dbg !33
+  %66 = fsub float %48, %61, !dbg !18
+  %67 = select i1 %9, float 3.000000e+00, float 0.000000e+00, !dbg !24
+  %68 = fcmp oeq float %67, 0.000000e+00, !dbg !25
+  %69 = tail call float @llvm.nvvm.div.full(float %54, float %67), !dbg !26
+  %70 = select i1 %68, float 0.000000e+00, float %69, !dbg !27
+  %71 = fmul float %70, %66, !dbg !28
+  %72 = fadd float %61, %71, !dbg !29
+  %73 = fmul float %66, %66, !dbg !30
+  %74 = fmul float %56, %73, !dbg !31
+  %75 = fmul float %70, %74, !dbg !32
+  %76 = fadd float %65, %75, !dbg !33
+  %77 = fsub float %49, %72, !dbg !18
+  %78 = select i1 %9, float 4.000000e+00, float 0.000000e+00, !dbg !24
+  %79 = fcmp oeq float %78, 0.000000e+00, !dbg !25
+  %80 = tail call float @llvm.nvvm.div.full(float %54, float %78), !dbg !26
+  %81 = select i1 %79, float 0.000000e+00, float %80, !dbg !27
+  %82 = fmul float %81, %77, !dbg !28
+  %83 = fadd float %72, %82, !dbg !29
+  %84 = fmul float %77, %77, !dbg !30
+  %85 = fmul float %67, %84, !dbg !31
+  %86 = fmul float %81, %85, !dbg !32
+  %87 = fadd float %76, %86, !dbg !33
+  %88 = fsub float %50, %83, !dbg !18
+  %89 = select i1 %9, float 5.000000e+00, float 0.000000e+00, !dbg !24
+  %90 = fcmp oeq float %89, 0.000000e+00, !dbg !25
+  %91 = tail call float @llvm.nvvm.div.full(float %54, float %89), !dbg !26
+  %92 = select i1 %90, float 0.000000e+00, float %91, !dbg !27
+  %93 = fmul float %92, %88, !dbg !28
+  %94 = fadd float %83, %93, !dbg !29
+  %95 = fmul float %88, %88, !dbg !30
+  %96 = fmul float %78, %95, !dbg !31
+  %97 = fmul float %92, %96, !dbg !32
+  %98 = fadd float %87, %97, !dbg !33
+  %99 = fsub float %51, %94, !dbg !18
+  %100 = select i1 %9, float 6.000000e+00, float 0.000000e+00, !dbg !24
+  %101 = fcmp oeq float %100, 0.000000e+00, !dbg !25
+  %102 = tail call float @llvm.nvvm.div.full(float %54, float %100), !dbg !26
+  %103 = select i1 %101, float 0.000000e+00, float %102, !dbg !27
+  %104 = fmul float %103, %99, !dbg !28
+  %105 = fadd float %94, %104, !dbg !29
+  %106 = fmul float %99, %99, !dbg !30
+  %107 = fmul float %89, %106, !dbg !31
+  %108 = fmul float %103, %107, !dbg !32
+  %109 = fadd float %98, %108, !dbg !33
+  %110 = fsub float %52, %105, !dbg !18
+  %111 = select i1 %9, float 7.000000e+00, float 0.000000e+00, !dbg !24
+  %112 = fcmp oeq float %111, 0.000000e+00, !dbg !25
+  %113 = tail call float @llvm.nvvm.div.full(float %54, float %111), !dbg !26
+  %114 = select i1 %112, float 0.000000e+00, float %113, !dbg !27
+  %115 = fmul float %114, %110, !dbg !28
+  %116 = fadd float %105, %115, !dbg !29
+  %117 = fmul float %110, %110, !dbg !30
+  %118 = fmul float %100, %117, !dbg !31
+  %119 = fmul float %114, %118, !dbg !32
+  %120 = fadd float %109, %119, !dbg !33
+  %121 = fsub float %53, %116, !dbg !18
+  %122 = select i1 %9, float 8.000000e+00, float 0.000000e+00, !dbg !24
+  %123 = fcmp oeq float %122, 0.000000e+00, !dbg !25
+  %124 = tail call float @llvm.nvvm.div.full(float %54, float %122), !dbg !26
+  %125 = select i1 %123, float 0.000000e+00, float %124, !dbg !27
+  %126 = fmul float %125, %121, !dbg !28
+  %127 = fadd float %116, %126, !dbg !29
+  %128 = fmul float %121, %121, !dbg !30
+  %129 = fmul float %111, %128, !dbg !31
+  %130 = fmul float %125, %129, !dbg !32
+  %131 = fadd float %120, %130, !dbg !33
+  %132 = bitcast float %127 to i32, !dbg !21
+  %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 16, i32 31), !dbg !21
+  %134 = bitcast i32 %133 to float, !dbg !21
+  %135 = bitcast float %131 to i32, !dbg !21
+  %136 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %135, i32 16, i32 31), !dbg !21
+  %137 = bitcast i32 %136 to float, !dbg !21
+  %138 = bitcast float %122 to i32, !dbg !21
+  %139 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %138, i32 16, i32 31), !dbg !21
+  %140 = bitcast i32 %139 to float, !dbg !21
+  %141 = fsub float %134, %127, !dbg !18
+  %142 = fadd float %122, %140, !dbg !24
+  %143 = fcmp oeq float %142, 0.000000e+00, !dbg !25
+  %144 = tail call float @llvm.nvvm.div.full(float %140, float %142), !dbg !26
+  %145 = select i1 %143, float 0.000000e+00, float %144, !dbg !27
+  %146 = fmul float %145, %141, !dbg !28
+  %147 = fadd float %127, %146, !dbg !29
+  %148 = fadd float %131, %137, !dbg !34
+  %149 = fmul float %141, %141, !dbg !30
+  %150 = fmul float %122, %149, !dbg !31
+  %151 = fmul float %145, %150, !dbg !32
+  %152 = fadd float %148, %151, !dbg !33
+  %153 = bitcast float %147 to i32, !dbg !21
+  %154 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 8, i32 31), !dbg !21
+  %155 = bitcast i32 %154 to float, !dbg !21
+  %156 = bitcast float %152 to i32, !dbg !21
+  %157 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %156, i32 8, i32 31), !dbg !21
+  %158 = bitcast i32 %157 to float, !dbg !21
+  %159 = bitcast float %142 to i32, !dbg !21
+  %160 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %159, i32 8, i32 31), !dbg !21
+  %161 = bitcast i32 %160 to float, !dbg !21
+  %162 = fsub float %155, %147, !dbg !18
+  %163 = fadd float %142, %161, !dbg !24
+  %164 = fcmp oeq float %163, 0.000000e+00, !dbg !25
+  %165 = tail call float @llvm.nvvm.div.full(float %161, float %163), !dbg !26
+  %166 = select i1 %164, float 0.000000e+00, float %165, !dbg !27
+  %167 = fmul float %166, %162, !dbg !28
+  %168 = fadd float %147, %167, !dbg !29
+  %169 = fadd float %152, %158, !dbg !34
+  %170 = fmul float %162, %162, !dbg !30
+  %171 = fmul float %142, %170, !dbg !31
+  %172 = fmul float %166, %171, !dbg !32
+  %173 = fadd float %169, %172, !dbg !33
+  %174 = bitcast float %168 to i32, !dbg !21
+  %175 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %174, i32 4, i32 31), !dbg !21
+  %176 = bitcast i32 %175 to float, !dbg !21
+  %177 = bitcast float %173 to i32, !dbg !21
+  %178 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %177, i32 4, i32 31), !dbg !21
+  %179 = bitcast i32 %178 to float, !dbg !21
+  %180 = bitcast float %163 to i32, !dbg !21
+  %181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %180, i32 4, i32 31), !dbg !21
+  %182 = bitcast i32 %181 to float, !dbg !21
+  %183 = fsub float %176, %168, !dbg !18
+  %184 = fadd float %163, %182, !dbg !24
+  %185 = fcmp oeq float %184, 0.000000e+00, !dbg !25
+  %186 = tail call float @llvm.nvvm.div.full(float %182, float %184), !dbg !26
+  %187 = select i1 %185, float 0.000000e+00, float %186, !dbg !27
+  %188 = fmul float %187, %183, !dbg !28
+  %189 = fadd float %168, %188, !dbg !29
+  %190 = fadd float %173, %179, !dbg !34
+  %191 = fmul float %183, %183, !dbg !30
+  %192 = fmul float %163, %191, !dbg !31
+  %193 = fmul float %187, %192, !dbg !32
+  %194 = fadd float %190, %193, !dbg !33
+  %195 = bitcast float %189 to i32, !dbg !21
+  %196 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %195, i32 2, i32 31), !dbg !21
+  %197 = bitcast i32 %196 to float, !dbg !21
+  %198 = bitcast float %194 to i32, !dbg !21
+  %199 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %198, i32 2, i32 31), !dbg !21
+  %200 = bitcast i32 %199 to float, !dbg !21
+  %201 = bitcast float %184 to i32, !dbg !21
+  %202 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %201, i32 2, i32 31), !dbg !21
+  %203 = bitcast i32 %202 to float, !dbg !21
+  %204 = fsub float %197, %189, !dbg !18
+  %205 = fadd float %184, %203, !dbg !24
+  %206 = fcmp oeq float %205, 0.000000e+00, !dbg !25
+  %207 = tail call float @llvm.nvvm.div.full(float %203, float %205), !dbg !26
+  %208 = select i1 %206, float 0.000000e+00, float %207, !dbg !27
+  %209 = fmul float %208, %204, !dbg !28
+  %210 = fadd float %189, %209, !dbg !29
+  %211 = fadd float %194, %200, !dbg !34
+  %212 = fmul float %204, %204, !dbg !30
+  %213 = fmul float %184, %212, !dbg !31
+  %214 = fmul float %208, %213, !dbg !32
+  %215 = fadd float %211, %214, !dbg !33
+  %216 = bitcast float %210 to i32, !dbg !21
+  %217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 1, i32 31), !dbg !21
+  %218 = bitcast i32 %217 to float, !dbg !21
+  %219 = bitcast float %215 to i32, !dbg !21
+  %220 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %219, i32 1, i32 31), !dbg !21
+  %221 = bitcast i32 %220 to float, !dbg !21
+  %222 = bitcast float %205 to i32, !dbg !21
+  %223 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %222, i32 1, i32 31), !dbg !21
+  %224 = bitcast i32 %223 to float, !dbg !21
+  %225 = fsub float %218, %210, !dbg !18
+  %226 = fadd float %205, %224, !dbg !24
+  %227 = fcmp oeq float %226, 0.000000e+00, !dbg !25
+  %228 = tail call float @llvm.nvvm.div.full(float %224, float %226), !dbg !26
+  %229 = select i1 %227, float 0.000000e+00, float %228, !dbg !27
+  %230 = fmul float %229, %225, !dbg !28
+  %231 = fadd float %210, %230, !dbg !29
+  %232 = fadd float %215, %221, !dbg !34
+  %233 = fmul float %225, %225, !dbg !30
+  %234 = fmul float %205, %233, !dbg !31
+  %235 = fmul float %229, %234, !dbg !32
+  %236 = fadd float %232, %235, !dbg !33
+  %237 = icmp eq i32 %12, 0, !dbg !21
+  %238 = getelementptr float, ptr addrspace(3) @global_smem, i32 %13, !dbg !21
+  %239 = bitcast float %231 to <1 x i32>, !dbg !21
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %238, <1 x i32> %239, i1 %237) #6, !dbg !21
+  %240 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %13, !dbg !21
+  %241 = bitcast float %236 to <1 x i32>, !dbg !21
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %240, <1 x i32> %241, i1 %237) #6, !dbg !21
+  %242 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %13, !dbg !21
+  %243 = bitcast float %226 to <1 x i32>, !dbg !21
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %242, <1 x i32> %243, i1 %237) #6, !dbg !21
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21
+  %244 = icmp samesign ult i32 %11, 16, !dbg !21
+  %245 = getelementptr float, ptr addrspace(3) @global_smem, i32 %11, !dbg !21
+  %246 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %245, i1 %244) #6, !dbg !21
+  %247 = bitcast i32 %246 to float, !dbg !21
+  %248 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %11, !dbg !21
+  %249 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %248, i1 %244) #6, !dbg !21
+  %250 = bitcast i32 %249 to float, !dbg !21
+  %251 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %11, !dbg !21
+  %252 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %251, i1 %244) #6, !dbg !21
+  %253 = bitcast i32 %252 to float, !dbg !21
+  %254 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %246, i32 8, i32 31), !dbg !21
+  %255 = bitcast i32 %254 to float, !dbg !21
+  %256 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %249, i32 8, i32 31), !dbg !21
+  %257 = bitcast i32 %256 to float, !dbg !21
+  %258 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %252, i32 8, i32 31), !dbg !21
+  %259 = bitcast i32 %258 to float, !dbg !21
+  %260 = fsub float %255, %247, !dbg !18
+  %261 = fadd float %253, %259, !dbg !24
+  %262 = fcmp oeq float %261, 0.000000e+00, !dbg !25
+  %263 = tail call float @llvm.nvvm.div.full(float %259, float %261), !dbg !26
+  %264 = select i1 %262, float 0.000000e+00, float %263, !dbg !27
+  %265 = fmul float %260, %264, !dbg !28
+  %266 = fadd float %265, %247, !dbg !29
+  %267 = fadd float %250, %257, !dbg !34
+  %268 = fmul float %260, %260, !dbg !30
+  %269 = fmul float %268, %253, !dbg !31
+  %270 = fmul float %269, %264, !dbg !32
+  %271 = fadd float %267, %270, !dbg !33
+  %272 = bitcast float %266 to i32, !dbg !21
+  %273 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 4, i32 31), !dbg !21
+  %274 = bitcast i32 %273 to float, !dbg !21
+  %275 = bitcast float %271 to i32, !dbg !21
+  %276 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %275, i32 4, i32 31), !dbg !21
+  %277 = bitcast i32 %276 to float, !dbg !21
+  %278 = bitcast float %261 to i32, !dbg !21
+  %279 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %278, i32 4, i32 31), !dbg !21
+  %280 = bitcast i32 %279 to float, !dbg !21
+  %281 = fsub float %274, %266, !dbg !18
+  %282 = fadd float %261, %280, !dbg !24
+  %283 = fcmp oeq float %282, 0.000000e+00, !dbg !25
+  %284 = tail call float @llvm.nvvm.div.full(float %280, float %282), !dbg !26
+  %285 = select i1 %283, float 0.000000e+00, float %284, !dbg !27
+  %286 = fmul float %281, %285, !dbg !28
+  %287 = fadd float %266, %286, !dbg !29
+  %288 = fadd float %271, %277, !dbg !34
+  %289 = fmul float %281, %281, !dbg !30
+  %290 = fmul float %261, %289, !dbg !31
+  %291 = fmul float %285, %290, !dbg !32
+  %292 = fadd float %288, %291, !dbg !33
+  %293 = bitcast float %287 to i32, !dbg !21
+  %294 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %293, i32 2, i32 31), !dbg !21
+  %295 = bitcast i32 %294 to float, !dbg !21
+  %296 = bitcast float %292 to i32, !dbg !21
+  %297 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %296, i32 2, i32 31), !dbg !21
+  %298 = bitcast i32 %297 to float, !dbg !21
+  %299 = bitcast float %282 to i32, !dbg !21
+  %300 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %299, i32 2, i32 31), !dbg !21
+  %301 = bitcast i32 %300 to float, !dbg !21
+  %302 = fsub float %295, %287, !dbg !18
+  %303 = fadd float %282, %301, !dbg !24
+  %304 = fcmp oeq float %303, 0.000000e+00, !dbg !25
+  %305 = tail call float @llvm.nvvm.div.full(float %301, float %303), !dbg !26
+  %306 = select i1 %304, float 0.000000e+00, float %305, !dbg !27
+  %307 = fmul float %302, %306, !dbg !28
+  %308 = fadd float %287, %307, !dbg !29
+  %309 = fadd float %292, %298, !dbg !34
+  %310 = fmul float %302, %302, !dbg !30
+  %311 = fmul float %282, %310, !dbg !31
+  %312 = fmul float %306, %311, !dbg !32
+  %313 = fadd float %309, %312, !dbg !33
+  %314 = bitcast float %308 to i32, !dbg !21
+  %315 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %314, i32 1, i32 31), !dbg !21
+  %316 = bitcast i32 %315 to float, !dbg !21
+  %317 = bitcast float %313 to i32, !dbg !21
+  %318 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %317, i32 1, i32 31), !dbg !21
+  %319 = bitcast i32 %318 to float, !dbg !21
+  %320 = bitcast float %303 to i32, !dbg !21
+  %321 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %320, i32 1, i32 31), !dbg !21
+  %322 = bitcast i32 %321 to float, !dbg !21
+  %323 = fsub float %316, %308, !dbg !18
+  %324 = fadd float %303, %322, !dbg !24
+  %325 = fcmp oeq float %324, 0.000000e+00, !dbg !25
+  %326 = tail call float @llvm.nvvm.div.full(float %322, float %324), !dbg !26
+  %327 = select i1 %325, float 0.000000e+00, float %326, !dbg !27
+  %328 = fmul float %323, %327, !dbg !28
+  %329 = fadd float %308, %328, !dbg !29
+  %330 = fadd float %313, %319, !dbg !34
+  %331 = fmul float %323, %323, !dbg !30
+  %332 = fmul float %303, %331, !dbg !31
+  %333 = fmul float %327, %332, !dbg !32
+  %334 = fadd float %330, %333, !dbg !33
+  %335 = and i32 %10, 15, !dbg !21
+  %336 = icmp eq i32 %335, 0, !dbg !21
+  %337 = and i1 %244, %336, !dbg !21
+  %338 = bitcast float %329 to <1 x i32>, !dbg !21
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %245, <1 x i32> %338, i1 %337) #6, !dbg !21
+  %339 = bitcast float %334 to <1 x i32>, !dbg !21
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %248, <1 x i32> %339, i1 %337) #6, !dbg !21
+  %340 = bitcast float %324 to <1 x i32>, !dbg !21
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %251, <1 x i32> %340, i1 %337) #6, !dbg !21
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21
+  %341 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !21
+  %342 = load float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !21
+  %343 = zext nneg i32 %15 to i64, !dbg !35
+  %344 = getelementptr bfloat, ptr addrspace(1) %1, i64 %343, !dbg !35
+  %345 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !36
+  %346 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %344, i64 %345, i1 true) #6, !dbg !36
+  %347 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !37
+  %348 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %19, i64 %347, i1 %9) #6, !dbg !37
+  %349 = getelementptr bfloat, ptr addrspace(1) %2, i64 %343, !dbg !38
+  %350 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !39
+  %351 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %349, i64 %350, i1 true) #6, !dbg !39
+  %352 = tail call float @llvm.nvvm.div.full(float %342, float 4.096000e+03), !dbg !40
+  %353 = fadd float %352, 0x3EB0C6F7A0000000, !dbg !41
+  %354 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
+  %355 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
+  %356 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
+  %357 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
+  %358 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
+  %359 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
+  %360 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
+  %361 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
+  %.not.i19 = icmp eq i32 %361, 0, !dbg !42
+  br i1 %.not.i19, label %364, label %362, !dbg !42
+
+362:                                              ; preds = %__nv_rsqrtf.exit
+  %363 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %353), !dbg !42
+  br label %__nv_rsqrtf.exit21, !dbg !42
+
+364:                                              ; preds = %__nv_rsqrtf.exit
+  %365 = tail call float @llvm.nvvm.rsqrt.approx.f(float %353), !dbg !42
+  br label %__nv_rsqrtf.exit21, !dbg !42
+
+__nv_rsqrtf.exit21:                               ; preds = %362, %364
+  %.0.i20 = phi float [ %363, %362 ], [ %365, %364 ], !dbg !42
+  %366 = extractvalue { i32, i32, i32, i32 } %348, 3, !dbg !37
+  %367 = bitcast i32 %366 to <2 x bfloat>, !dbg !37
+  %368 = extractvalue { i32, i32, i32, i32 } %348, 2, !dbg !37
+  %369 = bitcast i32 %368 to <2 x bfloat>, !dbg !37
+  %370 = extractvalue { i32, i32, i32, i32 } %348, 1, !dbg !37
+  %371 = bitcast i32 %370 to <2 x bfloat>, !dbg !37
+  %372 = extractvalue { i32, i32, i32, i32 } %348, 0, !dbg !37
+  %373 = bitcast i32 %372 to <2 x bfloat>, !dbg !37
+  %374 = extractvalue { i32, i32, i32, i32 } %346, 3, !dbg !36
+  %375 = bitcast i32 %374 to <2 x bfloat>, !dbg !36
+  %376 = extractvalue { i32, i32, i32, i32 } %346, 2, !dbg !36
+  %377 = bitcast i32 %376 to <2 x bfloat>, !dbg !36
+  %378 = extractvalue { i32, i32, i32, i32 } %346, 1, !dbg !36
+  %379 = bitcast i32 %378 to <2 x bfloat>, !dbg !36
+  %380 = extractvalue { i32, i32, i32, i32 } %346, 0, !dbg !36
+  %381 = bitcast i32 %380 to <2 x bfloat>, !dbg !36
+  %382 = extractvalue { i32, i32, i32, i32 } %351, 3, !dbg !39
+  %383 = bitcast i32 %382 to <2 x bfloat>, !dbg !39
+  %384 = extractvalue { i32, i32, i32, i32 } %351, 2, !dbg !39
+  %385 = bitcast i32 %384 to <2 x bfloat>, !dbg !39
+  %386 = extractvalue { i32, i32, i32, i32 } %351, 1, !dbg !39
+  %387 = bitcast i32 %386 to <2 x bfloat>, !dbg !39
+  %388 = extractvalue { i32, i32, i32, i32 } %351, 0, !dbg !39
+  %389 = bitcast i32 %388 to <2 x bfloat>, !dbg !39
+  %390 = getelementptr bfloat, ptr addrspace(1) %3, i64 %18, !dbg !43
+  %391 = fpext <2 x bfloat> %373 to <2 x float>, !dbg !44
+  %392 = insertelement <2 x float> poison, float %341, i64 0, !dbg !45
+  %393 = shufflevector <2 x float> %392, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !45
+  %394 = fsub <2 x float> %391, %393, !dbg !45
+  %395 = fpext <2 x bfloat> %381 to <2 x float>, !dbg !46
+  %396 = fadd <2 x float> %395, splat (float 1.000000e+00), !dbg !47
+  %397 = fpext <2 x bfloat> %389 to <2 x float>, !dbg !48
+  %398 = insertelement <2 x float> poison, float %.0.i20, i64 0, !dbg !49
+  %399 = shufflevector <2 x float> %398, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !49
+  %400 = fmul <2 x float> %394, %399, !dbg !49
+  %401 = fmul <2 x float> %396, %400, !dbg !50
+  %402 = fadd <2 x float> %401, %397, !dbg !51
+  %403 = fptrunc <2 x float> %402 to <2 x bfloat>, !dbg !52
+  %404 = fpext <2 x bfloat> %371 to <2 x float>, !dbg !44
+  %405 = fsub <2 x float> %404, %393, !dbg !45
+  %406 = fpext <2 x bfloat> %379 to <2 x float>, !dbg !46
+  %407 = fadd <2 x float> %406, splat (float 1.000000e+00), !dbg !47
+  %408 = fpext <2 x bfloat> %387 to <2 x float>, !dbg !48
+  %409 = fmul <2 x float> %405, %399, !dbg !49
+  %410 = fmul <2 x float> %407, %409, !dbg !50
+  %411 = fadd <2 x float> %410, %408, !dbg !51
+  %412 = fptrunc <2 x float> %411 to <2 x bfloat>, !dbg !52
+  %413 = fpext <2 x bfloat> %369 to <2 x float>, !dbg !44
+  %414 = fsub <2 x float> %413, %393, !dbg !45
+  %415 = fpext <2 x bfloat> %377 to <2 x float>, !dbg !46
+  %416 = fadd <2 x float> %415, splat (float 1.000000e+00), !dbg !47
+  %417 = fpext <2 x bfloat> %385 to <2 x float>, !dbg !48
+  %418 = fmul <2 x float> %414, %399, !dbg !49
+  %419 = fmul <2 x float> %416, %418, !dbg !50
+  %420 = fadd <2 x float> %419, %417, !dbg !51
+  %421 = fptrunc <2 x float> %420 to <2 x bfloat>, !dbg !52
+  %422 = fpext <2 x bfloat> %367 to <2 x float>, !dbg !44
+  %423 = fsub <2 x float> %422, %393, !dbg !45
+  %424 = fpext <2 x bfloat> %375 to <2 x float>, !dbg !46
+  %425 = fadd <2 x float> %424, splat (float 1.000000e+00), !dbg !47
+  %426 = fpext <2 x bfloat> %383 to <2 x float>, !dbg !48
+  %427 = fmul <2 x float> %423, %399, !dbg !49
+  %428 = fmul <2 x float> %425, %427, !dbg !50
+  %429 = fadd <2 x float> %428, %426, !dbg !51
+  %430 = fptrunc <2 x float> %429 to <2 x bfloat>, !dbg !52
+  %431 = bitcast <2 x bfloat> %403 to i32, !dbg !52
+  %432 = bitcast <2 x bfloat> %412 to i32, !dbg !52
+  %433 = bitcast <2 x bfloat> %421 to i32, !dbg !52
+  %434 = bitcast <2 x bfloat> %430 to i32, !dbg !52
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %431, i32 %432, i32 %433, i32 %434, ptr addrspace(1) %390, i1 %9) #6, !dbg !52
+  ret void, !dbg !53
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #2
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #2
+
+attributes #0 = { nounwind "nvvm.reqntid"="512" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #4 = { convergent nocallback nounwind }
+attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!5 = distinct !DISubprogram(name: "triton_red_fused_add_mul_native_layer_norm_1", linkageName: "triton_red_fused_add_mul_native_layer_norm_1", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 23, column: 28, scope: !5)
+!9 = !DILocation(line: 25, column: 21, scope: !5)
+!10 = !DILocation(line: 26, column: 37, scope: !5)
+!11 = !DILocation(line: 38, column: 46, scope: !5)
+!12 = !DILocation(line: 38, column: 41, scope: !5)
+!13 = !DILocation(line: 38, column: 34, scope: !5)
+!14 = !DILocation(line: 38, column: 51, scope: !5)
+!15 = !DILocation(line: 38, column: 112, scope: !5)
+!16 = !DILocation(line: 44, column: 62, scope: !5)
+!17 = !DILocation(line: 46, column: 66, scope: !5)
+!18 = !DILocation(line: 231, column: 21, scope: !19, inlinedAt: !21)
+!19 = distinct !DILexicalBlockFile(scope: !5, file: !20, discriminator: 0)
+!20 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime")
+!21 = !DILocation(line: 243, column: 46, scope: !19, inlinedAt: !22)
+!22 = !DILocation(line: 47, column: 79, scope: !23)
+!23 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0)
+!24 = !DILocation(line: 232, column: 28, scope: !19, inlinedAt: !21)
+!25 = !DILocation(line: 233, column: 39, scope: !19, inlinedAt: !21)
+!26 = !DILocation(line: 233, column: 60, scope: !19, inlinedAt: !21)
+!27 = !DILocation(line: 233, column: 49, scope: !19, inlinedAt: !21)
+!28 = !DILocation(line: 235, column: 25, scope: !19, inlinedAt: !21)
+!29 = !DILocation(line: 235, column: 17, scope: !19, inlinedAt: !21)
+!30 = !DILocation(line: 236, column: 30, scope: !19, inlinedAt: !21)
+!31 = !DILocation(line: 236, column: 38, scope: !19, inlinedAt: !21)
+!32 = !DILocation(line: 236, column: 49, scope: !19, inlinedAt: !21)
+!33 = !DILocation(line: 236, column: 22, scope: !19, inlinedAt: !21)
+!34 = !DILocation(line: 236, column: 15, scope: !19, inlinedAt: !21)
+!35 = !DILocation(line: 57, column: 34, scope: !5)
+!36 = !DILocation(line: 57, column: 41, scope: !5)
+!37 = !DILocation(line: 58, column: 52, scope: !5)
+!38 = !DILocation(line: 59, column: 35, scope: !5)
+!39 = !DILocation(line: 59, column: 42, scope: !5)
+!40 = !DILocation(line: 65, column: 24, scope: !5)
+!41 = !DILocation(line: 67, column: 24, scope: !5)
+!42 = !DILocation(line: 68, column: 32, scope: !5)
+!43 = !DILocation(line: 73, column: 29, scope: !5)
+!44 = !DILocation(line: 58, column: 114, scope: !5)
+!45 = !DILocation(line: 63, column: 24, scope: !5)
+!46 = !DILocation(line: 57, column: 94, scope: !5)
+!47 = !DILocation(line: 61, column: 23, scope: !5)
+!48 = !DILocation(line: 59, column: 95, scope: !5)
+!49 = !DILocation(line: 69, column: 24, scope: !5)
+!50 = !DILocation(line: 71, column: 24, scope: !5)
+!51 = !DILocation(line: 72, column: 24, scope: !5)
+!52 = !DILocation(line: 73, column: 53, scope: !5)
+!53 = !DILocation(line: 51, column: 4, scope: !5)
diff --git a/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.ptx b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..e10915bf74d890b97640fea658418fe43ef03302
--- /dev/null
+++ b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.ptx
@@ -0,0 +1,1032 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused_add_mul_native_layer_norm_1 // -- Begin function triton_red_fused_add_mul_native_layer_norm_1
+.extern .shared .align 16 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
+                                        // @triton_red_fused_add_mul_native_layer_norm_1
+.visible .entry triton_red_fused_add_mul_native_layer_norm_1(
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_1_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_1_param_1,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_1_param_2,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_1_param_3,
+	.param .u32 triton_red_fused_add_mul_native_layer_norm_1_param_4,
+	.param .u32 triton_red_fused_add_mul_native_layer_norm_1_param_5,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_1_param_6,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_1_param_7
+)
+.reqntid 512
+{
+	.reg .pred 	%p<23>;
+	.reg .b16 	%rs<33>;
+	.reg .b32 	%r<287>;
+	.reg .b64 	%rd<15>;
+	.loc	1 18 0                          // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:18:0
+
+// %bb.0:                               // %__nv_rsqrtf.exit
+	ld.param.b64 	%rd9, [triton_red_fused_add_mul_native_layer_norm_1_param_0];
+	ld.param.b64 	%rd10, [triton_red_fused_add_mul_native_layer_norm_1_param_1];
+$L__tmp0:
+	.loc	1 23 28                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:23:28
+	mov.u32 	%r37, %ctaid.x;
+	.loc	1 25 21                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:25:21
+	setp.lt.u32 	%p1, %r37, 256;
+	ld.param.b64 	%rd11, [triton_red_fused_add_mul_native_layer_norm_1_param_2];
+	ld.param.b64 	%rd12, [triton_red_fused_add_mul_native_layer_norm_1_param_3];
+	.loc	1 26 37                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:26:37
+	mov.u32 	%r38, %tid.x;
+	and.b32 	%r39, %r38, 511;
+	and.b32 	%r40, %r38, 31;
+	shl.b32 	%r41, %r38, 3;
+	and.b32 	%r42, %r41, 4088;
+	.loc	1 38 46                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:46
+	shl.b32 	%r43, %r37, 12;
+	.loc	1 38 41                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:41
+	or.b32 	%r44, %r42, %r43;
+	.loc	1 38 34                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:34
+	mul.wide.s32 	%rd13, %r44, 2;
+	add.s64 	%rd1, %rd9, %rd13;
+	.loc	1 38 51                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:51
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r5, 0;
+	// begin inline asm
+	mov.u32 %r1, %r5;
+	mov.u32 %r2, %r5;
+	mov.u32 %r3, %r5;
+	mov.u32 %r4, %r5;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	mov.b32 	{%rs1, %rs2}, %r1;
+	mov.b32 	{%rs3, %rs4}, %r2;
+	mov.b32 	{%rs5, %rs6}, %r3;
+	mov.b32 	{%rs7, %rs8}, %r4;
+	.loc	1 38 112                        // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:38:112
+	cvt.f32.bf16 	%r45, %rs1;
+	cvt.f32.bf16 	%r46, %rs2;
+	cvt.f32.bf16 	%r47, %rs3;
+	cvt.f32.bf16 	%r48, %rs4;
+	cvt.f32.bf16 	%r49, %rs5;
+	cvt.f32.bf16 	%r50, %rs6;
+	cvt.f32.bf16 	%r51, %rs7;
+	cvt.f32.bf16 	%r52, %rs8;
+	.loc	1 44 62                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:44:62
+	selp.f32 	%r53, %r45, 0f00000000, %p1;
+	selp.f32 	%r54, %r46, 0f00000000, %p1;
+	selp.f32 	%r55, %r47, 0f00000000, %p1;
+	selp.f32 	%r56, %r48, 0f00000000, %p1;
+	selp.f32 	%r57, %r49, 0f00000000, %p1;
+	selp.f32 	%r58, %r50, 0f00000000, %p1;
+	selp.f32 	%r59, %r51, 0f00000000, %p1;
+	selp.f32 	%r60, %r52, 0f00000000, %p1;
+	.loc	1 46 66                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:46:66
+	selp.f32 	%r61, 0f3F800000, 0f00000000, %p1;
+$L__tmp1:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r62, %r54, %r53;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r63, 0f40000000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p6, %r63, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r64, %r61, %r63;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r65, 0f00000000, %r64, %p6;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r66, %r65, %r62, %r53;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r67, %r62, %r62;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r68, %r61, %r67;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r69, %r65, %r68, 0f00000000;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r70, %r55, %r66;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r71, 0f40400000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p7, %r71, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r72, %r61, %r71;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r73, 0f00000000, %r72, %p7;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r74, %r73, %r70, %r66;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r75, %r70, %r70;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r76, %r63, %r75;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r77, %r73, %r76, %r69;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r78, %r56, %r74;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r79, 0f40800000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p8, %r79, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r80, %r61, %r79;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r81, 0f00000000, %r80, %p8;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r82, %r81, %r78, %r74;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r83, %r78, %r78;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r84, %r71, %r83;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r85, %r81, %r84, %r77;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r86, %r57, %r82;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r87, 0f40A00000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p9, %r87, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r88, %r61, %r87;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r89, 0f00000000, %r88, %p9;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r90, %r89, %r86, %r82;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r91, %r86, %r86;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r92, %r79, %r91;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r93, %r89, %r92, %r85;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r94, %r58, %r90;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r95, 0f40C00000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p10, %r95, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r96, %r61, %r95;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r97, 0f00000000, %r96, %p10;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r98, %r97, %r94, %r90;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r99, %r94, %r94;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r100, %r87, %r99;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r101, %r97, %r100, %r93;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r102, %r59, %r98;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r103, 0f40E00000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p11, %r103, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r104, %r61, %r103;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r105, 0f00000000, %r104, %p11;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r106, %r105, %r102, %r98;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r107, %r102, %r102;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r108, %r95, %r107;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r109, %r105, %r108, %r101;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r110, %r60, %r106;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r111, 0f41000000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p12, %r111, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r112, %r61, %r111;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r113, 0f00000000, %r112, %p12;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r114, %r113, %r110, %r106;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r115, %r110, %r110;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r116, %r103, %r115;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r117, %r113, %r116, %r109;
+$L__tmp2:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ]
+	shfl.sync.bfly.b32 	%r118, %r114, 16, 31, -1;
+	shfl.sync.bfly.b32 	%r119, %r117, 16, 31, -1;
+	shfl.sync.bfly.b32 	%r120, %r111, 16, 31, -1;
+$L__tmp3:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r121, %r118, %r114;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r122, %r111, %r120;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p13, %r122, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r123, %r120, %r122;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r124, 0f00000000, %r123, %p13;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r125, %r124, %r121, %r114;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r126, %r117, %r119;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r127, %r121, %r121;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r128, %r111, %r127;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r129, %r124, %r128, %r126;
+$L__tmp4:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ]
+	shfl.sync.bfly.b32 	%r130, %r125, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r131, %r129, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r132, %r122, 8, 31, -1;
+$L__tmp5:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r133, %r130, %r125;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r134, %r122, %r132;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p14, %r134, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r135, %r132, %r134;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r136, 0f00000000, %r135, %p14;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r137, %r136, %r133, %r125;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r138, %r129, %r131;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r139, %r133, %r133;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r140, %r122, %r139;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r141, %r136, %r140, %r138;
+$L__tmp6:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ]
+	shfl.sync.bfly.b32 	%r142, %r137, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r143, %r141, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r144, %r134, 4, 31, -1;
+$L__tmp7:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r145, %r142, %r137;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r146, %r134, %r144;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p15, %r146, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r147, %r144, %r146;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r148, 0f00000000, %r147, %p15;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r149, %r148, %r145, %r137;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r150, %r141, %r143;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r151, %r145, %r145;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r152, %r134, %r151;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r153, %r148, %r152, %r150;
+$L__tmp8:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ]
+	shfl.sync.bfly.b32 	%r154, %r149, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r155, %r153, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r156, %r146, 2, 31, -1;
+$L__tmp9:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r157, %r154, %r149;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r158, %r146, %r156;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p16, %r158, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r159, %r156, %r158;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r160, 0f00000000, %r159, %p16;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r161, %r160, %r157, %r149;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r162, %r153, %r155;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r163, %r157, %r157;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r164, %r146, %r163;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r165, %r160, %r164, %r162;
+$L__tmp10:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ]
+	shfl.sync.bfly.b32 	%r166, %r161, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r167, %r165, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r168, %r158, 1, 31, -1;
+$L__tmp11:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r169, %r166, %r161;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r11, %r158, %r168;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p17, %r11, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r170, %r168, %r11;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r171, 0f00000000, %r170, %p17;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r7, %r171, %r169, %r161;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r172, %r165, %r167;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r173, %r169, %r169;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r174, %r158, %r173;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r9, %r171, %r174, %r172;
+$L__tmp12:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ]
+	setp.eq.b32 	%p2, %r40, 0;
+	shr.u32 	%r175, %r38, 3;
+	and.b32 	%r176, %r175, 60;
+	mov.b32 	%r177, global_smem;
+	add.s32 	%r6, %r177, %r176;
+	// begin inline asm
+	@%p2 st.shared.b32 [ %r6 + 0 ], %r7;
+	// end inline asm
+	add.s32 	%r8, %r6, 64;
+	// begin inline asm
+	@%p2 st.shared.b32 [ %r8 + 0 ], %r9;
+	// end inline asm
+	add.s32 	%r10, %r6, 128;
+	// begin inline asm
+	@%p2 st.shared.b32 [ %r10 + 0 ], %r11;
+	// end inline asm
+	bar.sync 	0;
+	setp.lt.u32 	%p3, %r39, 16;
+	shl.b32 	%r178, %r39, 2;
+	add.s32 	%r13, %r177, %r178;
+	// begin inline asm
+	@%p3 ld.shared.b32 %r12, [ %r13 + 0 ];
+	// end inline asm
+	add.s32 	%r15, %r13, 64;
+	// begin inline asm
+	@%p3 ld.shared.b32 %r14, [ %r15 + 0 ];
+	// end inline asm
+	add.s32 	%r17, %r13, 128;
+	// begin inline asm
+	@%p3 ld.shared.b32 %r16, [ %r17 + 0 ];
+	// end inline asm
+	shfl.sync.bfly.b32 	%r179, %r12, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r180, %r14, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r181, %r16, 8, 31, -1;
+$L__tmp13:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r182, %r179, %r12;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r183, %r16, %r181;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p18, %r183, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r184, %r181, %r183;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r185, 0f00000000, %r184, %p18;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r186, %r182, %r185, %r12;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r187, %r14, %r180;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r188, %r182, %r182;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r189, %r188, %r16;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r190, %r189, %r185, %r187;
+$L__tmp14:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ]
+	shfl.sync.bfly.b32 	%r191, %r186, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r192, %r190, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r193, %r183, 4, 31, -1;
+$L__tmp15:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r194, %r191, %r186;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r195, %r183, %r193;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p19, %r195, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r196, %r193, %r195;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r197, 0f00000000, %r196, %p19;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r198, %r194, %r197, %r186;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r199, %r190, %r192;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r200, %r194, %r194;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r201, %r183, %r200;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r202, %r197, %r201, %r199;
+$L__tmp16:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ]
+	shfl.sync.bfly.b32 	%r203, %r198, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r204, %r202, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r205, %r195, 2, 31, -1;
+$L__tmp17:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r206, %r203, %r198;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r207, %r195, %r205;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p20, %r207, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r208, %r205, %r207;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r209, 0f00000000, %r208, %p20;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r210, %r206, %r209, %r198;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r211, %r202, %r204;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r212, %r206, %r206;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r213, %r195, %r212;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r214, %r209, %r213, %r211;
+$L__tmp18:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ]
+	shfl.sync.bfly.b32 	%r215, %r210, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r216, %r214, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r217, %r207, 1, 31, -1;
+$L__tmp19:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	sub.f32 	%r218, %r215, %r210;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r20, %r207, %r217;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	setp.eq.f32 	%p21, %r20, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	div.full.f32 	%r219, %r217, %r20;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	selp.f32 	%r220, 0f00000000, %r219, %p21;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r18, %r218, %r220, %r210;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	add.f32 	%r221, %r214, %r216;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r222, %r218, %r218;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	mul.f32 	%r223, %r207, %r222;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ] ]
+	fma.rn.f32 	%r19, %r220, %r223, %r221;
+$L__tmp20:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:47:79 ]
+	and.b32 	%r224, %r38, 15;
+	setp.eq.b32 	%p22, %r224, 0;
+	and.pred 	%p4, %p3, %p22;
+	// begin inline asm
+	@%p4 st.shared.b32 [ %r13 + 0 ], %r18;
+	// end inline asm
+	// begin inline asm
+	@%p4 st.shared.b32 [ %r15 + 0 ], %r19;
+	// end inline asm
+	// begin inline asm
+	@%p4 st.shared.b32 [ %r17 + 0 ], %r20;
+	// end inline asm
+	bar.sync 	0;
+	ld.shared.b32 	%r225, [global_smem];
+	ld.shared.b32 	%r226, [global_smem+64];
+$L__tmp21:
+	.loc	1 57 34                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:34
+	mul.wide.u32 	%rd14, %r42, 2;
+	add.s64 	%rd3, %rd10, %rd14;
+	.loc	1 57 41                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:41
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0;
+	// end inline asm
+	mov.pred 	%p5, -1;
+	// begin inline asm
+	mov.u32 %r21, %r5;
+	mov.u32 %r22, %r5;
+	mov.u32 %r23, %r5;
+	mov.u32 %r24, %r5;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r21, %r22, %r23, %r24 }, [ %rd3 + 0 ], %rd4;
+	// end inline asm
+	.loc	1 58 52                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:52
+	// begin inline asm
+	mov.u64 %rd5, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd5, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r25, %r5;
+	mov.u32 %r26, %r5;
+	mov.u32 %r27, %r5;
+	mov.u32 %r28, %r5;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r25, %r26, %r27, %r28 }, [ %rd1 + 0 ], %rd5;
+	// end inline asm
+	.loc	1 59 35                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:35
+	add.s64 	%rd6, %rd11, %rd14;
+	.loc	1 59 42                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:42
+	// begin inline asm
+	mov.u64 %rd7, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd7, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r29, %r5;
+	mov.u32 %r30, %r5;
+	mov.u32 %r31, %r5;
+	mov.u32 %r32, %r5;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r29, %r30, %r31, %r32 }, [ %rd6 + 0 ], %rd7;
+	// end inline asm
+	mov.b32 	%r227, 0f45800000;
+	.loc	1 65 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:65:24
+	div.full.f32 	%r228, %r226, %r227;
+	.loc	1 67 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:67:24
+	add.f32 	%r229, %r228, 0f358637BD;
+	.loc	1 68 32                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:68:32
+	rsqrt.approx.ftz.f32 	%r230, %r229;
+	.loc	1 73 29                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:73:29
+	add.s64 	%rd8, %rd12, %rd13;
+	.loc	1 58 114                        // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:114
+	mov.b32 	{%rs9, %rs10}, %r25;
+	cvt.f32.bf16 	%r231, %rs10;
+	cvt.f32.bf16 	%r232, %rs9;
+	.loc	1 63 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:63:24
+	sub.f32 	%r233, %r232, %r225;
+	sub.f32 	%r234, %r231, %r225;
+	.loc	1 57 94                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:94
+	mov.b32 	{%rs11, %rs12}, %r21;
+	cvt.f32.bf16 	%r235, %rs11;
+	cvt.f32.bf16 	%r236, %rs12;
+	.loc	1 61 23                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:61:23
+	add.f32 	%r237, %r236, 0f3F800000;
+	add.f32 	%r238, %r235, 0f3F800000;
+	.loc	1 59 95                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:95
+	mov.b32 	{%rs13, %rs14}, %r29;
+	cvt.f32.bf16 	%r239, %rs14;
+	cvt.f32.bf16 	%r240, %rs13;
+	.loc	1 69 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:69:24
+	mul.f32 	%r241, %r234, %r230;
+	mul.f32 	%r242, %r233, %r230;
+	.loc	1 72 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:72:24
+	fma.rn.f32 	%r243, %r238, %r242, %r240;
+	fma.rn.f32 	%r244, %r237, %r241, %r239;
+	.loc	1 73 53                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:73:53
+	cvt.rn.bf16x2.f32 	%r33, %r244, %r243;
+	.loc	1 58 114                        // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:114
+	mov.b32 	{%rs15, %rs16}, %r26;
+	cvt.f32.bf16 	%r245, %rs16;
+	cvt.f32.bf16 	%r246, %rs15;
+	.loc	1 63 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:63:24
+	sub.f32 	%r247, %r246, %r225;
+	sub.f32 	%r248, %r245, %r225;
+	.loc	1 57 94                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:94
+	mov.b32 	{%rs17, %rs18}, %r22;
+	cvt.f32.bf16 	%r249, %rs17;
+	cvt.f32.bf16 	%r250, %rs18;
+	.loc	1 61 23                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:61:23
+	add.f32 	%r251, %r250, 0f3F800000;
+	add.f32 	%r252, %r249, 0f3F800000;
+	.loc	1 59 95                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:95
+	mov.b32 	{%rs19, %rs20}, %r30;
+	cvt.f32.bf16 	%r253, %rs20;
+	cvt.f32.bf16 	%r254, %rs19;
+	.loc	1 69 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:69:24
+	mul.f32 	%r255, %r248, %r230;
+	mul.f32 	%r256, %r247, %r230;
+	.loc	1 72 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:72:24
+	fma.rn.f32 	%r257, %r252, %r256, %r254;
+	fma.rn.f32 	%r258, %r251, %r255, %r253;
+	.loc	1 73 53                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:73:53
+	cvt.rn.bf16x2.f32 	%r34, %r258, %r257;
+	.loc	1 58 114                        // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:114
+	mov.b32 	{%rs21, %rs22}, %r27;
+	cvt.f32.bf16 	%r259, %rs22;
+	cvt.f32.bf16 	%r260, %rs21;
+	.loc	1 63 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:63:24
+	sub.f32 	%r261, %r260, %r225;
+	sub.f32 	%r262, %r259, %r225;
+	.loc	1 57 94                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:94
+	mov.b32 	{%rs23, %rs24}, %r23;
+	cvt.f32.bf16 	%r263, %rs23;
+	cvt.f32.bf16 	%r264, %rs24;
+	.loc	1 61 23                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:61:23
+	add.f32 	%r265, %r264, 0f3F800000;
+	add.f32 	%r266, %r263, 0f3F800000;
+	.loc	1 59 95                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:95
+	mov.b32 	{%rs25, %rs26}, %r31;
+	cvt.f32.bf16 	%r267, %rs26;
+	cvt.f32.bf16 	%r268, %rs25;
+	.loc	1 69 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:69:24
+	mul.f32 	%r269, %r262, %r230;
+	mul.f32 	%r270, %r261, %r230;
+	.loc	1 72 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:72:24
+	fma.rn.f32 	%r271, %r266, %r270, %r268;
+	fma.rn.f32 	%r272, %r265, %r269, %r267;
+	.loc	1 73 53                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:73:53
+	cvt.rn.bf16x2.f32 	%r35, %r272, %r271;
+	.loc	1 58 114                        // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:58:114
+	mov.b32 	{%rs27, %rs28}, %r28;
+	cvt.f32.bf16 	%r273, %rs28;
+	cvt.f32.bf16 	%r274, %rs27;
+	.loc	1 63 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:63:24
+	sub.f32 	%r275, %r274, %r225;
+	sub.f32 	%r276, %r273, %r225;
+	.loc	1 57 94                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:57:94
+	mov.b32 	{%rs29, %rs30}, %r24;
+	cvt.f32.bf16 	%r277, %rs29;
+	cvt.f32.bf16 	%r278, %rs30;
+	.loc	1 61 23                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:61:23
+	add.f32 	%r279, %r278, 0f3F800000;
+	add.f32 	%r280, %r277, 0f3F800000;
+	.loc	1 59 95                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:59:95
+	mov.b32 	{%rs31, %rs32}, %r32;
+	cvt.f32.bf16 	%r281, %rs32;
+	cvt.f32.bf16 	%r282, %rs31;
+	.loc	1 69 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:69:24
+	mul.f32 	%r283, %r276, %r230;
+	mul.f32 	%r284, %r275, %r230;
+	.loc	1 72 24                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:72:24
+	fma.rn.f32 	%r285, %r280, %r284, %r282;
+	fma.rn.f32 	%r286, %r279, %r283, %r281;
+	.loc	1 73 53                         // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:73:53
+	cvt.rn.bf16x2.f32 	%r36, %r286, %r285;
+	// begin inline asm
+	@%p1 st.global.v4.b32 [ %rd8 + 0 ], { %r33, %r34, %r35, %r36 };
+	// end inline asm
+	.loc	1 51 4                          // cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py:51:4
+	ret;
+$L__tmp22:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 343                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x150 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 97
+.b8 118
+.b8 111
+.b8 97
+.b8 122
+.b8 54
+.b8 101
+.b8 55
+.b8 107
+.b8 98
+.b8 107
+.b8 53
+.b8 119
+.b8 113
+.b8 50
+.b8 110
+.b8 55
+.b8 118
+.b8 122
+.b8 54
+.b8 114
+.b8 120
+.b8 104
+.b8 99
+.b8 114
+.b8 119
+.b8 100
+.b8 117
+.b8 50
+.b8 116
+.b8 114
+.b8 97
+.b8 122
+.b8 101
+.b8 120
+.b8 117
+.b8 98
+.b8 100
+.b8 113
+.b8 53
+.b8 113
+.b8 119
+.b8 121
+.b8 118
+.b8 50
+.b8 97
+.b8 106
+.b8 109
+.b8 98
+.b8 107
+.b8 122
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 97
+.b8 118
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x2f DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 97
+.b8 100
+.b8 100
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 110
+.b8 97
+.b8 116
+.b8 105
+.b8 118
+.b8 101
+.b8 95
+.b8 108
+.b8 97
+.b8 121
+.b8 101
+.b8 114
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 49
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x113:0x47 DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x128:0x31 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp21                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 47                                  // DW_AT_call_line
+.b8 79                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x140:0x18 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp20                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 243                                 // DW_AT_call_line
+.b8 46                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.source b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.source
new file mode 100644
index 0000000000000000000000000000000000000000..c27c9fc7765e7194273ba6079eb492e76752a0cd
--- /dev/null
+++ b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.source
@@ -0,0 +1,420 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":18:0)
+#loc72 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":216:0)
+#loc85 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":133:0)
+#loc89 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":242:0)
+#loc91 = loc(unknown)
+#loc94 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":230:0)
+#loc109 = loc("in_ptr0"(#loc))
+#loc110 = loc("in_ptr1"(#loc))
+#loc111 = loc("in_ptr2"(#loc))
+#loc112 = loc("out_ptr2"(#loc))
+#loc113 = loc("xnumel"(#loc))
+#loc114 = loc("r0_numel"(#loc))
+#loc171 = loc("value"(#loc72))
+#loc172 = loc("mean"(#loc72))
+#loc173 = loc("m2"(#loc72))
+#loc174 = loc("weight"(#loc72))
+#loc175 = loc("first_iteration"(#loc72))
+#loc185 = loc("input"(#loc85))
+#loc186 = loc("mean"(#loc89))
+#loc187 = loc("m2"(#loc89))
+#loc188 = loc("weight"(#loc89))
+#loc189 = loc("mean_1"(#loc94))
+#loc190 = loc("m2_1"(#loc94))
+#loc191 = loc("weight_1"(#loc94))
+#loc192 = loc("mean_2"(#loc94))
+#loc193 = loc("m2_2"(#loc94))
+#loc194 = loc("weight_2"(#loc94))
+#loc201 = loc("new_mean"(#loc171))
+module {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 256 : i32 loc(#loc115)
+    %r0_numel_1 = arith.constant 4096 : i32 loc(#loc116)
+    %xoffset = tt.get_program_id x : i32 loc(#loc117)
+    %xoffset_2 = arith.constant 1 : i32 loc(#loc118)
+    %xoffset_3 = arith.constant 1 : i32 loc(#loc118)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc118)
+    %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc119)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc120)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc121)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc121)
+    %xmask = arith.constant dense<256> : tensor<1x1xi32> loc(#loc122)
+    %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc122)
+    %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32> loc(#loc123)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32> -> tensor<1x4096xi32> loc(#loc124)
+    %tmp3_mean = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc125)
+    %tmp3_m2 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc126)
+    %tmp3_weight = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc127)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc14)
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc14)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14)
+    %2 = arith.bitcast %c4096_i32 : i32 to i32 loc(#loc14)
+    %3 = ub.poison : i32 loc(#loc14)
+    %tmp3_weight_10:3 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%tmp3_mean_13 = %tmp3_mean, %tmp3_m2_14 = %tmp3_m2, %tmp3_weight_15 = %tmp3_weight) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4096xi32> loc(#loc129)
+      %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x4096xi32> loc(#loc129)
+      %r0_mask = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc130)
+      %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x4096xi32> loc(#loc130)
+      %tmp0 = arith.constant 4096 : i32 loc(#loc131)
+      %tmp0_18 = arith.constant 4096 : i32 loc(#loc131)
+      %tmp0_19 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc131)
+      %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc131)
+      %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc132)
+      %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x4096xi32> loc(#loc132)
+      %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc133)
+      %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc133)
+      %tmp0_25 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc134)
+      %tmp0_26 = arith.andi %r0_mask_17, %tmp0_25 : tensor<1x4096xi1> loc(#loc134)
+      %tmp0_27 = arith.constant 0.000000e+00 : f32 loc(#loc135)
+      %tmp0_28 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc135)
+      %tmp0_29 = arith.truncf %tmp0_28 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc135)
+      %tmp0_30 = tt.load %tmp0_24, %tmp0_26, %tmp0_29 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc135)
+      %tmp0_31 = arith.extf %tmp0_30 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc136)
+      %c0_i32_32 = arith.constant 0 : i32 loc(#loc23)
+      %9 = arith.cmpi eq, %r0_offset, %c0_i32_32 : i32 loc(#loc23)
+      %10:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_u1__(%tmp0_31, %tmp3_mean_13, %tmp3_m2_14, %tmp3_weight_15, %9) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>, i1) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) loc(#loc24)
+      %tmp3_mean_33 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc137)
+      %tmp3_mean_34 = arith.andi %r0_mask_17, %tmp3_mean_33 : tensor<1x4096xi1> loc(#loc137)
+      %tmp3_mean_35 = arith.select %tmp3_mean_34, %10#0, %tmp3_mean_13 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc138)
+      %tmp3_m2_36 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc139)
+      %tmp3_m2_37 = arith.andi %r0_mask_17, %tmp3_m2_36 : tensor<1x4096xi1> loc(#loc139)
+      %tmp3_m2_38 = arith.select %tmp3_m2_37, %10#1, %tmp3_m2_14 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc140)
+      %tmp3_weight_39 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc141)
+      %tmp3_weight_40 = arith.andi %r0_mask_17, %tmp3_weight_39 : tensor<1x4096xi1> loc(#loc141)
+      %tmp3_weight_41 = arith.select %tmp3_weight_40, %10#2, %tmp3_weight_15 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc142)
+      scf.yield %tmp3_mean_35, %tmp3_m2_38, %tmp3_weight_41 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc31)
+    } loc(#loc207)
+    %4:3 = tt.call @"torch._inductor.runtime.triton_helpers.welford__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S__(3,)cconstexpr_1_"(%tmp3_weight_10#0, %tmp3_weight_10#1, %tmp3_weight_10#2) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc32)
+    %tmp3 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc143)
+    %tmp7 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc144)
+    %tmp8 = tt.expand_dims %4#2 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc145)
+    %c0_i32_11 = arith.constant 0 : i32 loc(#loc36)
+    %c4096_i32_12 = arith.constant 4096 : i32 loc(#loc36)
+    %5 = arith.bitcast %c0_i32_11 : i32 to i32 loc(#loc36)
+    %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc36)
+    %7 = arith.bitcast %c4096_i32_12 : i32 to i32 loc(#loc36)
+    %8 = ub.poison : i32 loc(#loc36)
+    scf.for %r0_offset = %5 to %6 step %7  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4096xi32> loc(#loc146)
+      %r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x4096xi32> loc(#loc146)
+      %r0_mask = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc147)
+      %r0_mask_14 = arith.cmpi slt, %r0_index_13, %r0_mask : tensor<1x4096xi32> loc(#loc147)
+      %tmp9 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc148)
+      %tmp9_15 = tt.addptr %tmp9, %r0_index_13 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc148)
+      %tmp9_16 = arith.constant 0.000000e+00 : f32 loc(#loc149)
+      %tmp9_17 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc149)
+      %tmp9_18 = arith.truncf %tmp9_17 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc149)
+      %tmp9_19 = tt.load %tmp9_15, %r0_mask_14, %tmp9_18 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc149)
+      %tmp9_20 = arith.extf %tmp9_19 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc150)
+      %tmp12 = arith.constant 4096 : i32 loc(#loc151)
+      %tmp12_21 = arith.constant 4096 : i32 loc(#loc151)
+      %tmp12_22 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc151)
+      %tmp12_23 = arith.muli %tmp12_22, %xindex_7 : tensor<1x1xi32> loc(#loc151)
+      %tmp12_24 = tt.broadcast %tmp12_23 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc152)
+      %tmp12_25 = arith.addi %r0_index_13, %tmp12_24 : tensor<1x4096xi32> loc(#loc152)
+      %tmp12_26 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc153)
+      %tmp12_27 = tt.addptr %tmp12_26, %tmp12_25 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc153)
+      %tmp12_28 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc154)
+      %tmp12_29 = arith.andi %r0_mask_14, %tmp12_28 : tensor<1x4096xi1> loc(#loc154)
+      %tmp12_30 = arith.constant 0.000000e+00 : f32 loc(#loc155)
+      %tmp12_31 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc155)
+      %tmp12_32 = arith.truncf %tmp12_31 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc155)
+      %tmp12_33 = tt.load %tmp12_27, %tmp12_29, %tmp12_32 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>> loc(#loc155)
+      %tmp12_34 = arith.extf %tmp12_33 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc156)
+      %tmp23 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc157)
+      %tmp23_35 = tt.addptr %tmp23, %r0_index_13 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc157)
+      %tmp23_36 = arith.constant 0.000000e+00 : f32 loc(#loc158)
+      %tmp23_37 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc158)
+      %tmp23_38 = arith.truncf %tmp23_37 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc158)
+      %tmp23_39 = tt.load %tmp23_35, %r0_mask_14, %tmp23_38 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc158)
+      %tmp23_40 = arith.extf %tmp23_39 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc159)
+      %tmp10 = arith.constant 1.000000e+00 : f32 loc(#loc160)
+      %tmp11 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc161)
+      %tmp11_41 = arith.addf %tmp9_20, %tmp11 : tensor<1x4096xf32> loc(#loc161)
+      %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc162)
+      %tmp14_42 = arith.subf %tmp12_34, %tmp14 : tensor<1x4096xf32> loc(#loc162)
+      %tmp15 = arith.constant 4.096000e+03 : f32 loc(#loc163)
+      %tmp16 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc164)
+      %tmp16_43 = arith.divf %tmp7, %tmp16 : tensor<1x1xf32> loc(#loc164)
+      %tmp17 = arith.constant 9.99999997E-7 : f32 loc(#loc165)
+      %tmp18 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc166)
+      %tmp18_44 = arith.addf %tmp16_43, %tmp18 : tensor<1x1xf32> loc(#loc166)
+      %tmp19 = tt.extern_elementwise %tmp18_44 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc167)
+      %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc168)
+      %tmp20_45 = arith.mulf %tmp14_42, %tmp20 : tensor<1x4096xf32> loc(#loc168)
+      %tmp22 = arith.mulf %tmp11_41, %tmp20_45 : tensor<1x4096xf32> loc(#loc169)
+      %tmp24 = arith.addf %tmp22, %tmp23_40 : tensor<1x4096xf32> loc(#loc170)
+      %c4096_i32_46 = arith.constant 4096 : i32 loc(#loc62)
+      %c4096_i32_47 = arith.constant 4096 : i32 loc(#loc62)
+      %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc62)
+      %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc62)
+      %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc63)
+      %11 = arith.addi %r0_index_13, %10 : tensor<1x4096xi32> loc(#loc63)
+      %12 = tt.splat %out_ptr2 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc64)
+      %13 = tt.addptr %12, %11 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc64)
+      %14 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc65)
+      %15 = arith.andi %r0_mask_14, %14 : tensor<1x4096xi1> loc(#loc65)
+      %16 = arith.truncf %tmp24 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc66)
+      tt.store %13, %16, %15 : tensor<1x4096x!tt.ptr<bf16>> loc(#loc66)
+    } loc(#loc36)
+    tt.return loc(#loc67)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() -> tensor<1x4096xf32> attributes {noinline = false} {
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc69)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc69)
+    tt.return %cst_0 : tensor<1x4096xf32> loc(#loc70)
+  ^bb1:  // no predecessors
+    %0 = ub.poison : tensor<1x4096xf32> loc(#loc71)
+    tt.return %0 : tensor<1x4096xf32> loc(#loc71)
+  } loc(#loc68)
+  tt.func private @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_u1__(%new_mean: tensor<1x4096xf32> loc("new_mean"(#loc171)), %mean: tensor<1x4096xf32> loc("mean"(#loc72)), %m2: tensor<1x4096xf32> loc("m2"(#loc72)), %weight: tensor<1x4096xf32> loc("weight"(#loc72)), %first_iteration: i1 loc("first_iteration"(#loc72))) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) attributes {noinline = false} {
+    %0:3 = scf.if %first_iteration -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) {
+      %new_weight = arith.constant 1.000000e+00 : f32 loc(#loc176)
+      %new_weight_0 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc202)
+      %new_m2 = tt.call @triton.language.standard.zeros_like__fp32S1_4096S__(%m2) : (tensor<1x4096xf32>) -> tensor<1x4096xf32> loc(#loc203)
+      scf.yield %new_m2, %new_mean, %new_weight_0 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc203)
+    } else {
+      %delta = arith.subf %new_mean, %mean : tensor<1x4096xf32> loc(#loc178)
+      %new_weight = arith.constant 1 : i32 loc(#loc179)
+      %new_weight_0 = arith.constant 1.000000e+00 : f32 loc(#loc179)
+      %new_weight_1 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc179)
+      %new_weight_2 = arith.addf %weight, %new_weight_1 : tensor<1x4096xf32> loc(#loc204)
+      %new_mean_3 = arith.divf %delta, %new_weight_2 : tensor<1x4096xf32> loc(#loc180)
+      %new_mean_4 = arith.addf %mean, %new_mean_3 : tensor<1x4096xf32> loc(#loc205)
+      %new_m2 = arith.subf %new_mean, %new_mean_4 : tensor<1x4096xf32> loc(#loc182)
+      %new_m2_5 = arith.mulf %delta, %new_m2 : tensor<1x4096xf32> loc(#loc183)
+      %new_m2_6 = arith.addf %m2, %new_m2_5 : tensor<1x4096xf32> loc(#loc206)
+      scf.yield %new_m2_6, %new_mean_4, %new_weight_2 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc184)
+    } loc(#loc73)
+    tt.return %0#1, %0#0, %0#2 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc83)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1x4096xf32> loc(#loc84)
+    %2 = ub.poison : tensor<1x4096xf32> loc(#loc84)
+    %3 = ub.poison : tensor<1x4096xf32> loc(#loc84)
+    tt.return %1, %2, %3 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc84)
+  } loc(#loc72)
+  tt.func private @triton.language.standard.zeros_like__fp32S1_4096S__(%input: tensor<1x4096xf32> loc("input"(#loc85))) -> tensor<1x4096xf32> attributes {noinline = false} {
+    %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc86)
+    tt.return %0 : tensor<1x4096xf32> loc(#loc87)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1x4096xf32> loc(#loc88)
+    tt.return %1 : tensor<1x4096xf32> loc(#loc88)
+  } loc(#loc85)
+  tt.func private @"torch._inductor.runtime.triton_helpers.welford__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S__(3,)cconstexpr_1_"(%mean: tensor<1x4096xf32> loc("mean"(#loc89)), %m2: tensor<1x4096xf32> loc("m2"(#loc89)), %weight: tensor<1x4096xf32> loc("weight"(#loc89))) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} {
+    %0:3 = "tt.reduce"(%mean, %m2, %weight) <{axis = 1 : i32}> ({
+    ^bb0(%arg3: f32 loc(unknown), %arg4: f32 loc(unknown), %arg5: f32 loc(unknown), %arg6: f32 loc(unknown), %arg7: f32 loc(unknown), %arg8: f32 loc(unknown)):
+      %4:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (f32, f32, f32, f32, f32, f32) -> (f32, f32, f32) loc(#loc90)
+      tt.reduce.return %4#0, %4#1, %4#2 : f32, f32, f32 loc(#loc90)
+    }) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc90)
+    tt.return %0#0, %0#1, %0#2 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc92)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1xf32> loc(#loc93)
+    %2 = ub.poison : tensor<1xf32> loc(#loc93)
+    %3 = ub.poison : tensor<1xf32> loc(#loc93)
+    tt.return %1, %2, %3 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc93)
+  } loc(#loc89)
+  tt.func private @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%mean_1: f32 loc("mean_1"(#loc94)), %m2_1: f32 loc("m2_1"(#loc94)), %weight_1: f32 loc("weight_1"(#loc94)), %mean_2: f32 loc("mean_2"(#loc94)), %m2_2: f32 loc("m2_2"(#loc94)), %weight_2: f32 loc("weight_2"(#loc94))) -> (f32, f32, f32) attributes {noinline = false} {
+    %delta = arith.subf %mean_2, %mean_1 : f32 loc(#loc195)
+    %new_weight = arith.addf %weight_1, %weight_2 : f32 loc(#loc196)
+    %w2_over_w = arith.constant 0.000000e+00 : f32 loc(#loc197)
+    %w2_over_w_0 = arith.cmpf oeq, %new_weight, %w2_over_w : f32 loc(#loc197)
+    %w2_over_w_1 = arith.divf %weight_2, %new_weight : f32 loc(#loc198)
+    %w2_over_w_2 = arith.constant 0.000000e+00 : f32 loc(#loc199)
+    %w2_over_w_3 = arith.constant 0.000000e+00 : f32 loc(#loc199)
+    %w2_over_w_4 = arith.select %w2_over_w_0, %w2_over_w_3, %w2_over_w_1 : f32 loc(#loc199)
+    %0 = arith.mulf %delta, %w2_over_w_4 : f32 loc(#loc100)
+    %1 = arith.addf %mean_1, %0 : f32 loc(#loc101)
+    %2 = arith.addf %m2_1, %m2_2 : f32 loc(#loc102)
+    %3 = arith.mulf %delta, %delta : f32 loc(#loc103)
+    %4 = arith.mulf %3, %weight_1 : f32 loc(#loc104)
+    %5 = arith.mulf %4, %w2_over_w_4 : f32 loc(#loc105)
+    %6 = arith.addf %2, %5 : f32 loc(#loc106)
+    tt.return %1, %6, %new_weight : f32, f32, f32 loc(#loc107)
+  ^bb1:  // no predecessors
+    %7 = ub.poison : f32 loc(#loc108)
+    %8 = ub.poison : f32 loc(#loc108)
+    %9 = ub.poison : f32 loc(#loc108)
+    tt.return %7, %8, %9 : f32, f32, f32 loc(#loc108)
+  } loc(#loc94)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":25:21)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":29:45)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":30:43)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":31:47)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:46)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:34)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:61)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:51)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:112)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":42:62)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":42:51)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":44:39)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":44:62)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":45:37)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":45:58)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":46:41)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":46:66)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":46:8)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":47:79)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":48:16)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":49:16)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":50:16)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":51:43)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":52:31)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":53:29)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:34)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:41)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:94)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:47)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:42)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:35)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:62)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:52)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:114)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:35)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:42)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:95)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":60:16)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":61:23)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":63:24)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":64:16)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":65:24)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":66:16)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":67:24)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":68:32)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":69:24)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":71:24)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":72:24)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:41)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:36)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:29)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:63)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:53)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":51:4)
+#loc68 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":120:0)
+#loc69 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:31)
+#loc70 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:11)
+#loc71 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:4)
+#loc73 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7)
+#loc74 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":218:46)
+#loc75 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31)
+#loc76 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24)
+#loc77 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30)
+#loc78 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34)
+#loc79 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26)
+#loc80 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39)
+#loc81 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31)
+#loc82 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22)
+#loc83 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:11)
+#loc84 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:4)
+#loc86 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:30)
+#loc87 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:11)
+#loc88 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:4)
+#loc90 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc92 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:11)
+#loc93 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:4)
+#loc95 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc96 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc97 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc98 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc99 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc100 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc101 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc102 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc103 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc104 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc105 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc106 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc107 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:11)
+#loc108 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:4)
+#loc115 = loc("xnumel"(#loc1))
+#loc116 = loc("r0_numel"(#loc2))
+#loc117 = loc("xoffset"(#loc3))
+#loc118 = loc("xoffset"(#loc4))
+#loc119 = loc("xindex"(#loc5))
+#loc120 = loc("xindex"(#loc6))
+#loc121 = loc("xindex"(#loc7))
+#loc122 = loc("xmask"(#loc8))
+#loc123 = loc("r0_base"(#loc9))
+#loc124 = loc("r0_base"(#loc10))
+#loc125 = loc("tmp3_mean"(#loc11))
+#loc126 = loc("tmp3_m2"(#loc12))
+#loc127 = loc("tmp3_weight"(#loc13))
+#loc128 = loc("tmp3_mean"(#loc14))
+#loc129 = loc("r0_index"(#loc15))
+#loc130 = loc("r0_mask"(#loc16))
+#loc131 = loc("tmp0"(#loc17))
+#loc132 = loc("tmp0"(#loc18))
+#loc133 = loc("tmp0"(#loc19))
+#loc134 = loc("tmp0"(#loc20))
+#loc135 = loc("tmp0"(#loc21))
+#loc136 = loc("tmp0"(#loc22))
+#loc137 = loc("tmp3_mean"(#loc25))
+#loc138 = loc("tmp3_mean"(#loc26))
+#loc139 = loc("tmp3_m2"(#loc27))
+#loc140 = loc("tmp3_m2"(#loc28))
+#loc141 = loc("tmp3_weight"(#loc29))
+#loc142 = loc("tmp3_weight"(#loc30))
+#loc143 = loc("tmp3"(#loc33))
+#loc144 = loc("tmp7"(#loc34))
+#loc145 = loc("tmp8"(#loc35))
+#loc146 = loc("r0_index"(#loc37))
+#loc147 = loc("r0_mask"(#loc38))
+#loc148 = loc("tmp9"(#loc39))
+#loc149 = loc("tmp9"(#loc40))
+#loc150 = loc("tmp9"(#loc41))
+#loc151 = loc("tmp12"(#loc42))
+#loc152 = loc("tmp12"(#loc43))
+#loc153 = loc("tmp12"(#loc44))
+#loc154 = loc("tmp12"(#loc45))
+#loc155 = loc("tmp12"(#loc46))
+#loc156 = loc("tmp12"(#loc47))
+#loc157 = loc("tmp23"(#loc48))
+#loc158 = loc("tmp23"(#loc49))
+#loc159 = loc("tmp23"(#loc50))
+#loc160 = loc("tmp10"(#loc51))
+#loc161 = loc("tmp11"(#loc52))
+#loc162 = loc("tmp14"(#loc53))
+#loc163 = loc("tmp15"(#loc54))
+#loc164 = loc("tmp16"(#loc55))
+#loc165 = loc("tmp17"(#loc56))
+#loc166 = loc("tmp18"(#loc57))
+#loc167 = loc("tmp19"(#loc58))
+#loc168 = loc("tmp20"(#loc59))
+#loc169 = loc("tmp22"(#loc60))
+#loc170 = loc("tmp24"(#loc61))
+#loc176 = loc("new_weight"(#loc74))
+#loc177 = loc("new_m2"(#loc75))
+#loc178 = loc("delta"(#loc76))
+#loc179 = loc("new_weight"(#loc77))
+#loc180 = loc("new_mean"(#loc78))
+#loc181 = loc("new_mean"(#loc79))
+#loc182 = loc("new_m2"(#loc80))
+#loc183 = loc("new_m2"(#loc81))
+#loc184 = loc("new_m2"(#loc82))
+#loc195 = loc("delta"(#loc95))
+#loc196 = loc("new_weight"(#loc96))
+#loc197 = loc("w2_over_w"(#loc97))
+#loc198 = loc("w2_over_w"(#loc98))
+#loc199 = loc("w2_over_w"(#loc99))
+#loc200 = loc("tmp3_m2"(#loc128))
+#loc202 = loc("new_weight"(#loc176))
+#loc203 = loc("new_m2"(#loc177))
+#loc204 = loc("new_weight"(#loc179))
+#loc205 = loc("new_mean"(#loc181))
+#loc206 = loc("new_m2"(#loc184))
+#loc207 = loc("tmp3_weight"(#loc200))
diff --git a/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.ttgir b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..c13382babf543129791f4fd5f0e7fcc3ced3b72b
--- /dev/null
+++ b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.ttgir
@@ -0,0 +1,179 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":18:0)
+#loc1 = loc(unknown)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":47:79)
+#loc49 = loc("in_ptr0"(#loc))
+#loc50 = loc("in_ptr1"(#loc))
+#loc51 = loc("in_ptr2"(#loc))
+#loc52 = loc("out_ptr2"(#loc))
+#loc53 = loc("xnumel"(#loc))
+#loc54 = loc("r0_numel"(#loc))
+#loc68 = loc(callsite(#loc1 at #loc15))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4096> : tensor<1x4096xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xbf16, #blocked> loc(#loc1)
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
+    %c256_i32 = arith.constant 256 : i32 loc(#loc1)
+    %cst_1 = arith.constant 0.000000e+00 : f32 loc(#loc1)
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<4.096000e+03> : tensor<1x1xf32, #blocked> loc(#loc1)
+    %cst_5 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc55)
+    %xmask = arith.cmpi slt, %xoffset, %c256_i32 : i32 loc(#loc56)
+    %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc57)
+    %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4096xi32, #blocked> loc(#loc57)
+    %r0_mask = arith.cmpi slt, %r0_base_6, %cst : tensor<1x4096xi32, #blocked> loc(#loc58)
+    %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc59)
+    %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x4096xi32, #blocked> loc(#loc92)
+    %tmp0_8 = arith.addi %r0_base_6, %tmp0_7 : tensor<1x4096xi32, #blocked> loc(#loc60)
+    %tmp0_9 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc61)
+    %tmp0_10 = tt.addptr %tmp0_9, %tmp0_8 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc61)
+    %tmp0_11 = tt.splat %xmask : i1 -> tensor<1x4096xi1, #blocked> loc(#loc93)
+    %tmp0_12 = arith.andi %r0_mask, %tmp0_11 : tensor<1x4096xi1, #blocked> loc(#loc62)
+    %tmp0_13 = tt.load %tmp0_10, %tmp0_12, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc63)
+    %tmp0_14 = arith.extf %tmp0_13 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc64)
+    %tmp3_mean = arith.select %tmp0_12, %tmp0_14, %cst_2 : tensor<1x4096xi1, #blocked>, tensor<1x4096xf32, #blocked> loc(#loc65)
+    %tmp3_weight = arith.select %tmp0_12, %cst_5, %cst_2 : tensor<1x4096xi1, #blocked>, tensor<1x4096xf32, #blocked> loc(#loc66)
+    %0:3 = "tt.reduce"(%tmp3_mean, %cst_2, %tmp3_weight) <{axis = 1 : i32}> ({
+    ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc15)), %arg7: f32 loc(callsite(#loc1 at #loc15)), %arg8: f32 loc(callsite(#loc1 at #loc15)), %arg9: f32 loc(callsite(#loc1 at #loc15)), %arg10: f32 loc(callsite(#loc1 at #loc15)), %arg11: f32 loc(callsite(#loc1 at #loc15))):
+      %delta = arith.subf %arg9, %arg6 : f32 loc(#loc94)
+      %new_weight = arith.addf %arg8, %arg11 : f32 loc(#loc95)
+      %w2_over_w = arith.cmpf oeq, %new_weight, %cst_1 : f32 loc(#loc96)
+      %w2_over_w_24 = arith.divf %arg11, %new_weight : f32 loc(#loc97)
+      %w2_over_w_25 = arith.select %w2_over_w, %cst_1, %w2_over_w_24 : f32 loc(#loc98)
+      %4 = arith.mulf %delta, %w2_over_w_25 : f32 loc(#loc99)
+      %5 = arith.addf %arg6, %4 : f32 loc(#loc100)
+      %6 = arith.addf %arg7, %arg10 : f32 loc(#loc101)
+      %7 = arith.mulf %delta, %delta : f32 loc(#loc102)
+      %8 = arith.mulf %7, %arg8 : f32 loc(#loc103)
+      %9 = arith.mulf %8, %w2_over_w_25 : f32 loc(#loc104)
+      %10 = arith.addf %6, %9 : f32 loc(#loc105)
+      tt.reduce.return %5, %10, %new_weight : f32, f32, f32 loc(#loc67)
+    }) : (tensor<1x4096xf32, #blocked>, tensor<1x4096xf32, #blocked>, tensor<1x4096xf32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc67)
+    %tmp3 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc74)
+    %tmp7 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc75)
+    %tmp9 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc76)
+    %tmp9_15 = tt.addptr %tmp9, %r0_base_6 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc76)
+    %tmp9_16 = tt.load %tmp9_15, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc77)
+    %tmp9_17 = arith.extf %tmp9_16 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc78)
+    %tmp12 = tt.load %tmp0_10, %tmp0_12, %cst_0 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc79)
+    %tmp12_18 = arith.extf %tmp12 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc80)
+    %tmp23 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc81)
+    %tmp23_19 = tt.addptr %tmp23, %r0_base_6 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc81)
+    %tmp23_20 = tt.load %tmp23_19, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc82)
+    %tmp23_21 = arith.extf %tmp23_20 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc83)
+    %tmp11 = arith.addf %tmp9_17, %cst_5 : tensor<1x4096xf32, #blocked> loc(#loc84)
+    %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32, #blocked> -> tensor<1x4096xf32, #blocked> loc(#loc85)
+    %tmp14_22 = arith.subf %tmp12_18, %tmp14 : tensor<1x4096xf32, #blocked> loc(#loc85)
+    %tmp16 = arith.divf %tmp7, %cst_4 : tensor<1x1xf32, #blocked> loc(#loc86)
+    %tmp18 = arith.addf %tmp16, %cst_3 : tensor<1x1xf32, #blocked> loc(#loc87)
+    %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked>) -> tensor<1x1xf32, #blocked> loc(#loc88)
+    %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32, #blocked> -> tensor<1x4096xf32, #blocked> loc(#loc89)
+    %tmp20_23 = arith.mulf %tmp14_22, %tmp20 : tensor<1x4096xf32, #blocked> loc(#loc89)
+    %tmp22 = arith.mulf %tmp11, %tmp20_23 : tensor<1x4096xf32, #blocked> loc(#loc90)
+    %tmp24 = arith.addf %tmp22, %tmp23_21 : tensor<1x4096xf32, #blocked> loc(#loc91)
+    %1 = tt.splat %out_ptr2 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc46)
+    %2 = tt.addptr %1, %tmp0_8 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc46)
+    %3 = arith.truncf %tmp24 : tensor<1x4096xf32, #blocked> to tensor<1x4096xbf16, #blocked> loc(#loc47)
+    tt.store %2, %3, %tmp0_12 : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc47)
+    tt.return loc(#loc48)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":25:21)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":26:37)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":34:29)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:46)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:41)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:34)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:61)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:51)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:112)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":44:62)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":46:66)
+#loc14 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc16 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc17 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc18 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc19 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc21 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc23 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":48:16)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":49:16)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:34)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:41)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:94)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:52)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:114)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:35)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:42)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:95)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":61:23)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":63:24)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":65:24)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":67:24)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":68:32)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":69:24)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":71:24)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":72:24)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:29)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:53)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":51:4)
+#loc55 = loc("xoffset"(#loc2))
+#loc56 = loc("xmask"(#loc3))
+#loc57 = loc("r0_base"(#loc4))
+#loc58 = loc("r0_mask"(#loc5))
+#loc59 = loc("tmp0"(#loc6))
+#loc60 = loc("tmp0"(#loc7))
+#loc61 = loc("tmp0"(#loc8))
+#loc62 = loc("tmp0"(#loc9))
+#loc63 = loc("tmp0"(#loc10))
+#loc64 = loc("tmp0"(#loc11))
+#loc65 = loc("tmp3_mean"(#loc12))
+#loc66 = loc("tmp3_weight"(#loc13))
+#loc67 = loc(callsite(#loc14 at #loc15))
+#loc69 = loc("delta"(#loc16))
+#loc70 = loc("new_weight"(#loc17))
+#loc71 = loc("w2_over_w"(#loc18))
+#loc72 = loc("w2_over_w"(#loc19))
+#loc73 = loc("w2_over_w"(#loc20))
+#loc74 = loc("tmp3"(#loc28))
+#loc75 = loc("tmp7"(#loc29))
+#loc76 = loc("tmp9"(#loc30))
+#loc77 = loc("tmp9"(#loc31))
+#loc78 = loc("tmp9"(#loc32))
+#loc79 = loc("tmp12"(#loc33))
+#loc80 = loc("tmp12"(#loc34))
+#loc81 = loc("tmp23"(#loc35))
+#loc82 = loc("tmp23"(#loc36))
+#loc83 = loc("tmp23"(#loc37))
+#loc84 = loc("tmp11"(#loc38))
+#loc85 = loc("tmp14"(#loc39))
+#loc86 = loc("tmp16"(#loc40))
+#loc87 = loc("tmp18"(#loc41))
+#loc88 = loc("tmp19"(#loc42))
+#loc89 = loc("tmp20"(#loc43))
+#loc90 = loc("tmp22"(#loc44))
+#loc91 = loc("tmp24"(#loc45))
+#loc92 = loc(fused[#loc60, #loc59])
+#loc93 = loc(fused[#loc62, #loc56])
+#loc94 = loc(callsite(#loc69 at #loc67))
+#loc95 = loc(callsite(#loc70 at #loc67))
+#loc96 = loc(callsite(#loc71 at #loc67))
+#loc97 = loc(callsite(#loc72 at #loc67))
+#loc98 = loc(callsite(#loc73 at #loc67))
+#loc99 = loc(callsite(#loc21 at #loc67))
+#loc100 = loc(callsite(#loc22 at #loc67))
+#loc101 = loc(callsite(#loc23 at #loc67))
+#loc102 = loc(callsite(#loc24 at #loc67))
+#loc103 = loc(callsite(#loc25 at #loc67))
+#loc104 = loc(callsite(#loc26 at #loc67))
+#loc105 = loc(callsite(#loc27 at #loc67))
diff --git a/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.ttir b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..6a8f45fd734360fe0014f0077bcc158fdd7be25a
--- /dev/null
+++ b/triton/CYKNGA4OMPRI7EV7H5FM47DKU7VFZ4Q5NYQGPNW6ZIVYBLBWPVMA/triton_red_fused_add_mul_native_layer_norm_1.ttir
@@ -0,0 +1,180 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":18:0)
+#loc1 = loc(unknown)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":47:79)
+#loc50 = loc("in_ptr0"(#loc))
+#loc51 = loc("in_ptr1"(#loc))
+#loc52 = loc("in_ptr2"(#loc))
+#loc53 = loc("out_ptr2"(#loc))
+#loc54 = loc("xnumel"(#loc))
+#loc55 = loc("r0_numel"(#loc))
+#loc57 = loc(callsite(#loc1 at #loc3))
+module {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
+    %xmask = arith.constant 256 : i32 loc(#loc56)
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc57)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc1)
+    %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x4096xbf16> loc(#loc1)
+    %cst_2 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc1)
+    %cst_3 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc1)
+    %cst_5 = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc58)
+    %xmask_6 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc56)
+    %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32> loc(#loc59)
+    %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32> -> tensor<1x4096xi32> loc(#loc60)
+    %r0_mask = arith.cmpi slt, %r0_base_7, %cst_5 : tensor<1x4096xi32> loc(#loc61)
+    %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc62)
+    %tmp0_8 = tt.splat %tmp0 : i32 -> tensor<1x4096xi32> loc(#loc94)
+    %tmp0_9 = arith.addi %r0_base_7, %tmp0_8 : tensor<1x4096xi32> loc(#loc63)
+    %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc64)
+    %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc64)
+    %tmp0_12 = tt.splat %xmask_6 : i1 -> tensor<1x4096xi1> loc(#loc95)
+    %tmp0_13 = arith.andi %r0_mask, %tmp0_12 : tensor<1x4096xi1> loc(#loc65)
+    %tmp0_14 = tt.load %tmp0_11, %tmp0_13, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc66)
+    %tmp0_15 = arith.extf %tmp0_14 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc67)
+    %tmp3_mean = arith.select %tmp0_13, %tmp0_15, %cst_0 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc68)
+    %tmp3_weight = arith.select %tmp0_13, %cst_4, %cst_0 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc69)
+    %0:3 = "tt.reduce"(%tmp3_mean, %cst_0, %tmp3_weight) <{axis = 1 : i32}> ({
+    ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc3)), %arg7: f32 loc(callsite(#loc1 at #loc3)), %arg8: f32 loc(callsite(#loc1 at #loc3)), %arg9: f32 loc(callsite(#loc1 at #loc3)), %arg10: f32 loc(callsite(#loc1 at #loc3)), %arg11: f32 loc(callsite(#loc1 at #loc3))):
+      %delta = arith.subf %arg9, %arg6 : f32 loc(#loc96)
+      %new_weight = arith.addf %arg8, %arg11 : f32 loc(#loc97)
+      %w2_over_w = arith.cmpf oeq, %new_weight, %cst : f32 loc(#loc98)
+      %w2_over_w_25 = arith.divf %arg11, %new_weight : f32 loc(#loc99)
+      %w2_over_w_26 = arith.select %w2_over_w, %cst, %w2_over_w_25 : f32 loc(#loc100)
+      %4 = arith.mulf %delta, %w2_over_w_26 : f32 loc(#loc101)
+      %5 = arith.addf %arg6, %4 : f32 loc(#loc102)
+      %6 = arith.addf %arg7, %arg10 : f32 loc(#loc103)
+      %7 = arith.mulf %delta, %delta : f32 loc(#loc104)
+      %8 = arith.mulf %7, %arg8 : f32 loc(#loc105)
+      %9 = arith.mulf %8, %w2_over_w_26 : f32 loc(#loc106)
+      %10 = arith.addf %6, %9 : f32 loc(#loc107)
+      tt.reduce.return %5, %10, %new_weight : f32, f32, f32 loc(#loc70)
+    }) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc70)
+    %tmp3 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc76)
+    %tmp7 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc77)
+    %tmp9 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc78)
+    %tmp9_16 = tt.addptr %tmp9, %r0_base_7 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc78)
+    %tmp9_17 = tt.load %tmp9_16, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc79)
+    %tmp9_18 = arith.extf %tmp9_17 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc80)
+    %tmp12 = tt.load %tmp0_11, %tmp0_13, %cst_1 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>> loc(#loc81)
+    %tmp12_19 = arith.extf %tmp12 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc82)
+    %tmp23 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc83)
+    %tmp23_20 = tt.addptr %tmp23, %r0_base_7 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc83)
+    %tmp23_21 = tt.load %tmp23_20, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc84)
+    %tmp23_22 = arith.extf %tmp23_21 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc85)
+    %tmp11 = arith.addf %tmp9_18, %cst_4 : tensor<1x4096xf32> loc(#loc86)
+    %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc87)
+    %tmp14_23 = arith.subf %tmp12_19, %tmp14 : tensor<1x4096xf32> loc(#loc87)
+    %tmp16 = arith.divf %tmp7, %cst_3 : tensor<1x1xf32> loc(#loc88)
+    %tmp18 = arith.addf %tmp16, %cst_2 : tensor<1x1xf32> loc(#loc89)
+    %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc90)
+    %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc91)
+    %tmp20_24 = arith.mulf %tmp14_23, %tmp20 : tensor<1x4096xf32> loc(#loc91)
+    %tmp22 = arith.mulf %tmp11, %tmp20_24 : tensor<1x4096xf32> loc(#loc92)
+    %tmp24 = arith.addf %tmp22, %tmp23_22 : tensor<1x4096xf32> loc(#loc93)
+    %1 = tt.splat %out_ptr2 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc47)
+    %2 = tt.addptr %1, %tmp0_9 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc47)
+    %3 = arith.truncf %tmp24 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc48)
+    tt.store %2, %3, %tmp0_13 : tensor<1x4096x!tt.ptr<bf16>> loc(#loc48)
+    tt.return loc(#loc49)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":25:21)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":23:28)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":26:27)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":34:29)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:41)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:34)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:61)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:51)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":38:112)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":44:62)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":46:66)
+#loc16 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc17 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc18 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc19 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc21 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc23 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc28 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":48:16)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":49:16)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:34)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:41)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":57:94)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:52)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":58:114)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:35)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:42)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":59:95)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":61:23)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":63:24)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":65:24)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":67:24)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":68:32)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":69:24)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":71:24)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":72:24)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:29)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":73:53)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/av/cavoaz6e7kbk5wq2n7vz6rxhcrwdu2trazexubdq5qwyv2ajmbkz.py":51:4)
+#loc56 = loc("xmask"(#loc2))
+#loc58 = loc("xoffset"(#loc4))
+#loc59 = loc("r0_base"(#loc5))
+#loc60 = loc("r0_base"(#loc6))
+#loc61 = loc("r0_mask"(#loc7))
+#loc62 = loc("tmp0"(#loc8))
+#loc63 = loc("tmp0"(#loc9))
+#loc64 = loc("tmp0"(#loc10))
+#loc65 = loc("tmp0"(#loc11))
+#loc66 = loc("tmp0"(#loc12))
+#loc67 = loc("tmp0"(#loc13))
+#loc68 = loc("tmp3_mean"(#loc14))
+#loc69 = loc("tmp3_weight"(#loc15))
+#loc70 = loc(callsite(#loc16 at #loc3))
+#loc71 = loc("delta"(#loc17))
+#loc72 = loc("new_weight"(#loc18))
+#loc73 = loc("w2_over_w"(#loc19))
+#loc74 = loc("w2_over_w"(#loc20))
+#loc75 = loc("w2_over_w"(#loc21))
+#loc76 = loc("tmp3"(#loc29))
+#loc77 = loc("tmp7"(#loc30))
+#loc78 = loc("tmp9"(#loc31))
+#loc79 = loc("tmp9"(#loc32))
+#loc80 = loc("tmp9"(#loc33))
+#loc81 = loc("tmp12"(#loc34))
+#loc82 = loc("tmp12"(#loc35))
+#loc83 = loc("tmp23"(#loc36))
+#loc84 = loc("tmp23"(#loc37))
+#loc85 = loc("tmp23"(#loc38))
+#loc86 = loc("tmp11"(#loc39))
+#loc87 = loc("tmp14"(#loc40))
+#loc88 = loc("tmp16"(#loc41))
+#loc89 = loc("tmp18"(#loc42))
+#loc90 = loc("tmp19"(#loc43))
+#loc91 = loc("tmp20"(#loc44))
+#loc92 = loc("tmp22"(#loc45))
+#loc93 = loc("tmp24"(#loc46))
+#loc94 = loc(fused[#loc63, #loc62])
+#loc95 = loc(fused[#loc65, #loc56])
+#loc96 = loc(callsite(#loc71 at #loc70))
+#loc97 = loc(callsite(#loc72 at #loc70))
+#loc98 = loc(callsite(#loc73 at #loc70))
+#loc99 = loc(callsite(#loc74 at #loc70))
+#loc100 = loc(callsite(#loc75 at #loc70))
+#loc101 = loc(callsite(#loc22 at #loc70))
+#loc102 = loc(callsite(#loc23 at #loc70))
+#loc103 = loc(callsite(#loc24 at #loc70))
+#loc104 = loc(callsite(#loc25 at #loc70))
+#loc105 = loc(callsite(#loc26 at #loc70))
+#loc106 = loc(callsite(#loc27 at #loc70))
+#loc107 = loc(callsite(#loc28 at #loc70))
diff --git a/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/__grp__triton_red_fused_add_mul_native_layer_norm_0.json b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/__grp__triton_red_fused_add_mul_native_layer_norm_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..f268f3763c21c386f0eb939b260d95dd168739d0
--- /dev/null
+++ b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/__grp__triton_red_fused_add_mul_native_layer_norm_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused_add_mul_native_layer_norm_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.source", "triton_red_fused_add_mul_native_layer_norm_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.ttir", "triton_red_fused_add_mul_native_layer_norm_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.ttgir", "triton_red_fused_add_mul_native_layer_norm_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.llir", "triton_red_fused_add_mul_native_layer_norm_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.ptx", "triton_red_fused_add_mul_native_layer_norm_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.cubin", "triton_red_fused_add_mul_native_layer_norm_0.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.json"}}
\ No newline at end of file
diff --git a/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.cubin b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..af3203563a16f6782f82d431acd4d7a46ad7e8a4
Binary files /dev/null and b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.cubin differ
diff --git a/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.json b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..455cfa8ed72cf59c4db3a3e504679a7e88a65112
--- /dev/null
+++ b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.json
@@ -0,0 +1 @@
+{"hash": "1ce4869555d6e3fa5041ee363bf7d41f87484b63ba45978158c5d6328925821c", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 192, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_add_mul_native_layer_norm_0"}
\ No newline at end of file
diff --git a/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.llir b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..badd6098f71f776f3006457a8edf489897186498
--- /dev/null
+++ b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.llir
@@ -0,0 +1,620 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external addrspace(3) global [0 x i8], align 16
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused_add_mul_native_layer_norm_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 {
+__nv_rsqrtf.exit:
+  %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8
+  %12 = icmp samesign ult i32 %11, 2048, !dbg !9
+  %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %14 = shl nuw nsw i32 %13, 2, !dbg !10
+  %15 = and i32 %14, 2044, !dbg !10
+  %16 = shl i32 %11, 12, !dbg !11
+  %17 = zext nneg i32 %15 to i64, !dbg !12
+  %18 = sext i32 %16 to i64, !dbg !12
+  %19 = or disjoint i64 %17, %18, !dbg !13
+  %20 = getelementptr bfloat, ptr addrspace(1) %0, i64 %19, !dbg !14
+  %21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !15
+  %22 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %20, i64 %21, i1 %12) #6, !dbg !15
+  %23 = getelementptr bfloat, ptr addrspace(1) %1, i64 %17, !dbg !16
+  %24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17
+  %25 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %23, i64 %24, i1 true) #6, !dbg !17
+  %26 = getelementptr bfloat, ptr addrspace(1) %2, i64 %19, !dbg !18
+  %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !19
+  %28 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %26, i64 %27, i1 %12) #6, !dbg !19
+  %29 = extractvalue { i32, i32 } %25, 1, !dbg !17
+  %30 = bitcast i32 %29 to <2 x bfloat>, !dbg !17
+  %31 = extractvalue { i32, i32 } %28, 1, !dbg !19
+  %32 = bitcast i32 %31 to <2 x bfloat>, !dbg !19
+  %33 = extractvalue { i32, i32 } %22, 1, !dbg !15
+  %34 = bitcast i32 %33 to <2 x bfloat>, !dbg !15
+  %35 = extractvalue { i32, i32 } %25, 0, !dbg !17
+  %36 = bitcast i32 %35 to <2 x bfloat>, !dbg !17
+  %37 = extractvalue { i32, i32 } %28, 0, !dbg !19
+  %38 = bitcast i32 %37 to <2 x bfloat>, !dbg !19
+  %39 = extractvalue { i32, i32 } %22, 0, !dbg !15
+  %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !15
+  %41 = getelementptr bfloat, ptr addrspace(1) %5, i64 %19, !dbg !20
+  %42 = fpext <2 x bfloat> %36 to <2 x float>, !dbg !21
+  %43 = fpext <2 x bfloat> %38 to <2 x float>, !dbg !22
+  %44 = fmul <2 x float> %42, %43, !dbg !23
+  %45 = fpext <2 x bfloat> %40 to <2 x float>, !dbg !24
+  %46 = fadd <2 x float> %44, %45, !dbg !25
+  %47 = extractelement <2 x float> %46, i64 0, !dbg !26
+  %48 = select i1 %12, float %47, float 0.000000e+00, !dbg !26
+  %49 = extractelement <2 x float> %46, i64 1, !dbg !26
+  %50 = select i1 %12, float %49, float 0.000000e+00, !dbg !26
+  %51 = fptrunc <2 x float> %46 to <2 x bfloat>, !dbg !27
+  %52 = fpext <2 x bfloat> %30 to <2 x float>, !dbg !21
+  %53 = fpext <2 x bfloat> %32 to <2 x float>, !dbg !22
+  %54 = fmul <2 x float> %52, %53, !dbg !23
+  %55 = fpext <2 x bfloat> %34 to <2 x float>, !dbg !24
+  %56 = fadd <2 x float> %54, %55, !dbg !25
+  %57 = extractelement <2 x float> %56, i64 0, !dbg !26
+  %58 = select i1 %12, float %57, float 0.000000e+00, !dbg !26
+  %59 = extractelement <2 x float> %56, i64 1, !dbg !26
+  %60 = select i1 %12, float %59, float 0.000000e+00, !dbg !26
+  %61 = fptrunc <2 x float> %56 to <2 x bfloat>, !dbg !27
+  %62 = bitcast <2 x bfloat> %51 to i32, !dbg !27
+  %63 = bitcast <2 x bfloat> %61 to i32, !dbg !27
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %62, i32 %63, ptr addrspace(1) %41, i1 %12) #6, !dbg !27
+  %64 = or disjoint i64 %17, 2048, !dbg !28
+  %65 = or disjoint i64 %64, %18, !dbg !13
+  %66 = getelementptr bfloat, ptr addrspace(1) %0, i64 %65, !dbg !14
+  %67 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !15
+  %68 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %66, i64 %67, i1 %12) #6, !dbg !15
+  %69 = extractvalue { i32, i32 } %68, 0, !dbg !15
+  %70 = bitcast i32 %69 to <2 x bfloat>, !dbg !15
+  %71 = extractvalue { i32, i32 } %68, 1, !dbg !15
+  %72 = bitcast i32 %71 to <2 x bfloat>, !dbg !15
+  %73 = getelementptr bfloat, ptr addrspace(1) %1, i64 %64, !dbg !16
+  %74 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17
+  %75 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %73, i64 %74, i1 true) #6, !dbg !17
+  %76 = extractvalue { i32, i32 } %75, 0, !dbg !17
+  %77 = bitcast i32 %76 to <2 x bfloat>, !dbg !17
+  %78 = extractvalue { i32, i32 } %75, 1, !dbg !17
+  %79 = bitcast i32 %78 to <2 x bfloat>, !dbg !17
+  %80 = getelementptr bfloat, ptr addrspace(1) %2, i64 %65, !dbg !18
+  %81 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !19
+  %82 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %80, i64 %81, i1 %12) #6, !dbg !19
+  %83 = extractvalue { i32, i32 } %82, 0, !dbg !19
+  %84 = bitcast i32 %83 to <2 x bfloat>, !dbg !19
+  %85 = extractvalue { i32, i32 } %82, 1, !dbg !19
+  %86 = bitcast i32 %85 to <2 x bfloat>, !dbg !19
+  %87 = select i1 %12, float 2.000000e+00, float 1.000000e+00, !dbg !29
+  %88 = select i1 %12, float 2.000000e+00, float 0.000000e+00, !dbg !29
+  %89 = select i1 %12, float 2.000000e+00, float 0.000000e+00, !dbg !29
+  %90 = select i1 %12, float 2.000000e+00, float 0.000000e+00, !dbg !29
+  %91 = select i1 %12, float 2.000000e+00, float 0.000000e+00, !dbg !29
+  %92 = getelementptr bfloat, ptr addrspace(1) %5, i64 %65, !dbg !20
+  %93 = fpext <2 x bfloat> %70 to <2 x float>, !dbg !24
+  %94 = fpext <2 x bfloat> %77 to <2 x float>, !dbg !21
+  %95 = fpext <2 x bfloat> %84 to <2 x float>, !dbg !22
+  %96 = fmul <2 x float> %94, %95, !dbg !23
+  %97 = fadd <2 x float> %96, %93, !dbg !25
+  %98 = extractelement <2 x float> %97, i64 0, !dbg !30
+  %99 = fsub float %98, %48, !dbg !35
+  %100 = tail call float @llvm.nvvm.div.full(float %99, float %87), !dbg !36
+  %101 = fadd float %48, %100, !dbg !37
+  %102 = fsub float %98, %101, !dbg !30
+  %103 = fmul float %99, %102, !dbg !38
+  %104 = fadd float %103, 0.000000e+00, !dbg !39
+  %105 = extractelement <2 x float> %97, i64 1, !dbg !30
+  %106 = fsub float %105, %50, !dbg !35
+  %107 = tail call float @llvm.nvvm.div.full(float %106, float %87), !dbg !36
+  %108 = fadd float %50, %107, !dbg !37
+  %109 = fsub float %105, %108, !dbg !30
+  %110 = fmul float %106, %109, !dbg !38
+  %111 = fadd float %110, 0.000000e+00, !dbg !39
+  %112 = select i1 %12, float %101, float 0.000000e+00, !dbg !26
+  %113 = select i1 %12, float %108, float 0.000000e+00, !dbg !26
+  %114 = fptrunc <2 x float> %97 to <2 x bfloat>, !dbg !27
+  %115 = fpext <2 x bfloat> %72 to <2 x float>, !dbg !24
+  %116 = fpext <2 x bfloat> %79 to <2 x float>, !dbg !21
+  %117 = fpext <2 x bfloat> %86 to <2 x float>, !dbg !22
+  %118 = fmul <2 x float> %116, %117, !dbg !23
+  %119 = fadd <2 x float> %118, %115, !dbg !25
+  %120 = extractelement <2 x float> %119, i64 0, !dbg !30
+  %121 = fsub float %120, %58, !dbg !35
+  %122 = tail call float @llvm.nvvm.div.full(float %121, float %87), !dbg !36
+  %123 = fadd float %58, %122, !dbg !37
+  %124 = fsub float %120, %123, !dbg !30
+  %125 = fmul float %121, %124, !dbg !38
+  %126 = fadd float %125, 0.000000e+00, !dbg !39
+  %127 = extractelement <2 x float> %119, i64 1, !dbg !30
+  %128 = fsub float %127, %60, !dbg !35
+  %129 = tail call float @llvm.nvvm.div.full(float %128, float %87), !dbg !36
+  %130 = fadd float %60, %129, !dbg !37
+  %131 = fsub float %127, %130, !dbg !30
+  %132 = fmul float %128, %131, !dbg !38
+  %133 = fadd float %132, 0.000000e+00, !dbg !39
+  %134 = select i1 %12, float %123, float 0.000000e+00, !dbg !26
+  %135 = select i1 %12, float %130, float 0.000000e+00, !dbg !26
+  %136 = select i1 %12, float %126, float 0.000000e+00, !dbg !40
+  %137 = select i1 %12, float %133, float 0.000000e+00, !dbg !40
+  %138 = fptrunc <2 x float> %119 to <2 x bfloat>, !dbg !27
+  %139 = bitcast <2 x bfloat> %114 to i32, !dbg !27
+  %140 = bitcast <2 x bfloat> %138 to i32, !dbg !27
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %139, i32 %140, ptr addrspace(1) %92, i1 %12) #6, !dbg !27
+  %141 = and i32 %13, 511, !dbg !10
+  %142 = and i32 %13, 31, !dbg !10
+  %143 = lshr i32 %141, 5, !dbg !10
+  %144 = fsub float %113, %112, !dbg !41
+  %145 = select i1 %12, float 4.000000e+00, float 0.000000e+00, !dbg !44
+  %146 = fcmp oeq float %145, 0.000000e+00, !dbg !45
+  %147 = tail call float @llvm.nvvm.div.full(float %89, float %145), !dbg !46
+  %148 = select i1 %146, float 0.000000e+00, float %147, !dbg !47
+  %149 = fmul float %144, %148, !dbg !48
+  %150 = fadd float %112, %149, !dbg !49
+  %151 = fadd float %104, %111, !dbg !50
+  %152 = select i1 %12, float %151, float 0.000000e+00, !dbg !50
+  %153 = fmul float %144, %144, !dbg !51
+  %154 = fmul float %153, %88, !dbg !52
+  %155 = fmul float %154, %148, !dbg !53
+  %156 = fadd float %152, %155, !dbg !54
+  %157 = fsub float %134, %150, !dbg !41
+  %158 = select i1 %12, float 6.000000e+00, float 0.000000e+00, !dbg !44
+  %159 = fcmp oeq float %158, 0.000000e+00, !dbg !45
+  %160 = tail call float @llvm.nvvm.div.full(float %90, float %158), !dbg !46
+  %161 = select i1 %159, float 0.000000e+00, float %160, !dbg !47
+  %162 = fmul float %161, %157, !dbg !48
+  %163 = fadd float %150, %162, !dbg !49
+  %164 = fadd float %136, %156, !dbg !50
+  %165 = fmul float %157, %157, !dbg !51
+  %166 = fmul float %145, %165, !dbg !52
+  %167 = fmul float %161, %166, !dbg !53
+  %168 = fadd float %164, %167, !dbg !54
+  %169 = fsub float %135, %163, !dbg !41
+  %170 = select i1 %12, float 8.000000e+00, float 0.000000e+00, !dbg !44
+  %171 = fcmp oeq float %170, 0.000000e+00, !dbg !45
+  %172 = tail call float @llvm.nvvm.div.full(float %91, float %170), !dbg !46
+  %173 = select i1 %171, float 0.000000e+00, float %172, !dbg !47
+  %174 = fmul float %173, %169, !dbg !48
+  %175 = fadd float %163, %174, !dbg !49
+  %176 = fadd float %137, %168, !dbg !50
+  %177 = fmul float %169, %169, !dbg !51
+  %178 = fmul float %158, %177, !dbg !52
+  %179 = fmul float %173, %178, !dbg !53
+  %180 = fadd float %176, %179, !dbg !54
+  %181 = bitcast float %175 to i32, !dbg !42
+  %182 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %181, i32 16, i32 31), !dbg !42
+  %183 = bitcast i32 %182 to float, !dbg !42
+  %184 = bitcast float %180 to i32, !dbg !42
+  %185 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %184, i32 16, i32 31), !dbg !42
+  %186 = bitcast i32 %185 to float, !dbg !42
+  %187 = bitcast float %170 to i32, !dbg !42
+  %188 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %187, i32 16, i32 31), !dbg !42
+  %189 = bitcast i32 %188 to float, !dbg !42
+  %190 = fsub float %183, %175, !dbg !41
+  %191 = fadd float %170, %189, !dbg !44
+  %192 = fcmp oeq float %191, 0.000000e+00, !dbg !45
+  %193 = tail call float @llvm.nvvm.div.full(float %189, float %191), !dbg !46
+  %194 = select i1 %192, float 0.000000e+00, float %193, !dbg !47
+  %195 = fmul float %194, %190, !dbg !48
+  %196 = fadd float %175, %195, !dbg !49
+  %197 = fadd float %180, %186, !dbg !50
+  %198 = fmul float %190, %190, !dbg !51
+  %199 = fmul float %170, %198, !dbg !52
+  %200 = fmul float %194, %199, !dbg !53
+  %201 = fadd float %197, %200, !dbg !54
+  %202 = bitcast float %196 to i32, !dbg !42
+  %203 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %202, i32 8, i32 31), !dbg !42
+  %204 = bitcast i32 %203 to float, !dbg !42
+  %205 = bitcast float %201 to i32, !dbg !42
+  %206 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %205, i32 8, i32 31), !dbg !42
+  %207 = bitcast i32 %206 to float, !dbg !42
+  %208 = bitcast float %191 to i32, !dbg !42
+  %209 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %208, i32 8, i32 31), !dbg !42
+  %210 = bitcast i32 %209 to float, !dbg !42
+  %211 = fsub float %204, %196, !dbg !41
+  %212 = fadd float %191, %210, !dbg !44
+  %213 = fcmp oeq float %212, 0.000000e+00, !dbg !45
+  %214 = tail call float @llvm.nvvm.div.full(float %210, float %212), !dbg !46
+  %215 = select i1 %213, float 0.000000e+00, float %214, !dbg !47
+  %216 = fmul float %211, %215, !dbg !48
+  %217 = fadd float %196, %216, !dbg !49
+  %218 = fadd float %201, %207, !dbg !50
+  %219 = fmul float %211, %211, !dbg !51
+  %220 = fmul float %191, %219, !dbg !52
+  %221 = fmul float %215, %220, !dbg !53
+  %222 = fadd float %218, %221, !dbg !54
+  %223 = bitcast float %217 to i32, !dbg !42
+  %224 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %223, i32 4, i32 31), !dbg !42
+  %225 = bitcast i32 %224 to float, !dbg !42
+  %226 = bitcast float %222 to i32, !dbg !42
+  %227 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %226, i32 4, i32 31), !dbg !42
+  %228 = bitcast i32 %227 to float, !dbg !42
+  %229 = bitcast float %212 to i32, !dbg !42
+  %230 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %229, i32 4, i32 31), !dbg !42
+  %231 = bitcast i32 %230 to float, !dbg !42
+  %232 = fsub float %225, %217, !dbg !41
+  %233 = fadd float %212, %231, !dbg !44
+  %234 = fcmp oeq float %233, 0.000000e+00, !dbg !45
+  %235 = tail call float @llvm.nvvm.div.full(float %231, float %233), !dbg !46
+  %236 = select i1 %234, float 0.000000e+00, float %235, !dbg !47
+  %237 = fmul float %232, %236, !dbg !48
+  %238 = fadd float %217, %237, !dbg !49
+  %239 = fadd float %222, %228, !dbg !50
+  %240 = fmul float %232, %232, !dbg !51
+  %241 = fmul float %212, %240, !dbg !52
+  %242 = fmul float %236, %241, !dbg !53
+  %243 = fadd float %239, %242, !dbg !54
+  %244 = bitcast float %238 to i32, !dbg !42
+  %245 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %244, i32 2, i32 31), !dbg !42
+  %246 = bitcast i32 %245 to float, !dbg !42
+  %247 = bitcast float %243 to i32, !dbg !42
+  %248 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %247, i32 2, i32 31), !dbg !42
+  %249 = bitcast i32 %248 to float, !dbg !42
+  %250 = bitcast float %233 to i32, !dbg !42
+  %251 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %250, i32 2, i32 31), !dbg !42
+  %252 = bitcast i32 %251 to float, !dbg !42
+  %253 = fsub float %246, %238, !dbg !41
+  %254 = fadd float %233, %252, !dbg !44
+  %255 = fcmp oeq float %254, 0.000000e+00, !dbg !45
+  %256 = tail call float @llvm.nvvm.div.full(float %252, float %254), !dbg !46
+  %257 = select i1 %255, float 0.000000e+00, float %256, !dbg !47
+  %258 = fmul float %253, %257, !dbg !48
+  %259 = fadd float %238, %258, !dbg !49
+  %260 = fadd float %243, %249, !dbg !50
+  %261 = fmul float %253, %253, !dbg !51
+  %262 = fmul float %233, %261, !dbg !52
+  %263 = fmul float %257, %262, !dbg !53
+  %264 = fadd float %260, %263, !dbg !54
+  %265 = bitcast float %259 to i32, !dbg !42
+  %266 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %265, i32 1, i32 31), !dbg !42
+  %267 = bitcast i32 %266 to float, !dbg !42
+  %268 = bitcast float %264 to i32, !dbg !42
+  %269 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %268, i32 1, i32 31), !dbg !42
+  %270 = bitcast i32 %269 to float, !dbg !42
+  %271 = bitcast float %254 to i32, !dbg !42
+  %272 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %271, i32 1, i32 31), !dbg !42
+  %273 = bitcast i32 %272 to float, !dbg !42
+  %274 = fsub float %267, %259, !dbg !41
+  %275 = fadd float %254, %273, !dbg !44
+  %276 = fcmp oeq float %275, 0.000000e+00, !dbg !45
+  %277 = tail call float @llvm.nvvm.div.full(float %273, float %275), !dbg !46
+  %278 = select i1 %276, float 0.000000e+00, float %277, !dbg !47
+  %279 = fmul float %274, %278, !dbg !48
+  %280 = fadd float %259, %279, !dbg !49
+  %281 = fadd float %264, %270, !dbg !50
+  %282 = fmul float %274, %274, !dbg !51
+  %283 = fmul float %254, %282, !dbg !52
+  %284 = fmul float %278, %283, !dbg !53
+  %285 = fadd float %281, %284, !dbg !54
+  %286 = icmp eq i32 %142, 0, !dbg !42
+  %287 = getelementptr float, ptr addrspace(3) @global_smem, i32 %143, !dbg !42
+  %288 = bitcast float %280 to <1 x i32>, !dbg !42
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %287, <1 x i32> %288, i1 %286) #6, !dbg !42
+  %289 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %143, !dbg !42
+  %290 = bitcast float %285 to <1 x i32>, !dbg !42
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %289, <1 x i32> %290, i1 %286) #6, !dbg !42
+  %291 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %143, !dbg !42
+  %292 = bitcast float %275 to <1 x i32>, !dbg !42
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %291, <1 x i32> %292, i1 %286) #6, !dbg !42
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !42
+  %293 = icmp samesign ult i32 %141, 16, !dbg !42
+  %294 = getelementptr float, ptr addrspace(3) @global_smem, i32 %141, !dbg !42
+  %295 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %294, i1 %293) #6, !dbg !42
+  %296 = bitcast i32 %295 to float, !dbg !42
+  %297 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %141, !dbg !42
+  %298 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %297, i1 %293) #6, !dbg !42
+  %299 = bitcast i32 %298 to float, !dbg !42
+  %300 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %141, !dbg !42
+  %301 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %300, i1 %293) #6, !dbg !42
+  %302 = bitcast i32 %301 to float, !dbg !42
+  %303 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %295, i32 8, i32 31), !dbg !42
+  %304 = bitcast i32 %303 to float, !dbg !42
+  %305 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %298, i32 8, i32 31), !dbg !42
+  %306 = bitcast i32 %305 to float, !dbg !42
+  %307 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %301, i32 8, i32 31), !dbg !42
+  %308 = bitcast i32 %307 to float, !dbg !42
+  %309 = fsub float %304, %296, !dbg !41
+  %310 = fadd float %302, %308, !dbg !44
+  %311 = fcmp oeq float %310, 0.000000e+00, !dbg !45
+  %312 = tail call float @llvm.nvvm.div.full(float %308, float %310), !dbg !46
+  %313 = select i1 %311, float 0.000000e+00, float %312, !dbg !47
+  %314 = fmul float %309, %313, !dbg !48
+  %315 = fadd float %314, %296, !dbg !49
+  %316 = fadd float %299, %306, !dbg !50
+  %317 = fmul float %309, %309, !dbg !51
+  %318 = fmul float %317, %302, !dbg !52
+  %319 = fmul float %318, %313, !dbg !53
+  %320 = fadd float %316, %319, !dbg !54
+  %321 = bitcast float %315 to i32, !dbg !42
+  %322 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %321, i32 4, i32 31), !dbg !42
+  %323 = bitcast i32 %322 to float, !dbg !42
+  %324 = bitcast float %320 to i32, !dbg !42
+  %325 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %324, i32 4, i32 31), !dbg !42
+  %326 = bitcast i32 %325 to float, !dbg !42
+  %327 = bitcast float %310 to i32, !dbg !42
+  %328 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %327, i32 4, i32 31), !dbg !42
+  %329 = bitcast i32 %328 to float, !dbg !42
+  %330 = fsub float %323, %315, !dbg !41
+  %331 = fadd float %310, %329, !dbg !44
+  %332 = fcmp oeq float %331, 0.000000e+00, !dbg !45
+  %333 = tail call float @llvm.nvvm.div.full(float %329, float %331), !dbg !46
+  %334 = select i1 %332, float 0.000000e+00, float %333, !dbg !47
+  %335 = fmul float %330, %334, !dbg !48
+  %336 = fadd float %315, %335, !dbg !49
+  %337 = fadd float %320, %326, !dbg !50
+  %338 = fmul float %330, %330, !dbg !51
+  %339 = fmul float %310, %338, !dbg !52
+  %340 = fmul float %334, %339, !dbg !53
+  %341 = fadd float %337, %340, !dbg !54
+  %342 = bitcast float %336 to i32, !dbg !42
+  %343 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %342, i32 2, i32 31), !dbg !42
+  %344 = bitcast i32 %343 to float, !dbg !42
+  %345 = bitcast float %341 to i32, !dbg !42
+  %346 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %345, i32 2, i32 31), !dbg !42
+  %347 = bitcast i32 %346 to float, !dbg !42
+  %348 = bitcast float %331 to i32, !dbg !42
+  %349 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %348, i32 2, i32 31), !dbg !42
+  %350 = bitcast i32 %349 to float, !dbg !42
+  %351 = fsub float %344, %336, !dbg !41
+  %352 = fadd float %331, %350, !dbg !44
+  %353 = fcmp oeq float %352, 0.000000e+00, !dbg !45
+  %354 = tail call float @llvm.nvvm.div.full(float %350, float %352), !dbg !46
+  %355 = select i1 %353, float 0.000000e+00, float %354, !dbg !47
+  %356 = fmul float %351, %355, !dbg !48
+  %357 = fadd float %336, %356, !dbg !49
+  %358 = fadd float %341, %347, !dbg !50
+  %359 = fmul float %351, %351, !dbg !51
+  %360 = fmul float %331, %359, !dbg !52
+  %361 = fmul float %355, %360, !dbg !53
+  %362 = fadd float %358, %361, !dbg !54
+  %363 = bitcast float %357 to i32, !dbg !42
+  %364 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %363, i32 1, i32 31), !dbg !42
+  %365 = bitcast i32 %364 to float, !dbg !42
+  %366 = bitcast float %362 to i32, !dbg !42
+  %367 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %366, i32 1, i32 31), !dbg !42
+  %368 = bitcast i32 %367 to float, !dbg !42
+  %369 = bitcast float %352 to i32, !dbg !42
+  %370 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %369, i32 1, i32 31), !dbg !42
+  %371 = bitcast i32 %370 to float, !dbg !42
+  %372 = fsub float %365, %357, !dbg !41
+  %373 = fadd float %352, %371, !dbg !44
+  %374 = fcmp oeq float %373, 0.000000e+00, !dbg !45
+  %375 = tail call float @llvm.nvvm.div.full(float %371, float %373), !dbg !46
+  %376 = select i1 %374, float 0.000000e+00, float %375, !dbg !47
+  %377 = fmul float %372, %376, !dbg !48
+  %378 = fadd float %357, %377, !dbg !49
+  %379 = fadd float %362, %368, !dbg !50
+  %380 = fmul float %372, %372, !dbg !51
+  %381 = fmul float %352, %380, !dbg !52
+  %382 = fmul float %376, %381, !dbg !53
+  %383 = fadd float %379, %382, !dbg !54
+  %384 = and i32 %13, 15, !dbg !42
+  %385 = icmp eq i32 %384, 0, !dbg !42
+  %386 = and i1 %293, %385, !dbg !42
+  %387 = bitcast float %378 to <1 x i32>, !dbg !42
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %294, <1 x i32> %387, i1 %386) #6, !dbg !42
+  %388 = bitcast float %383 to <1 x i32>, !dbg !42
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %297, <1 x i32> %388, i1 %386) #6, !dbg !42
+  %389 = bitcast float %373 to <1 x i32>, !dbg !42
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %300, <1 x i32> %389, i1 %386) #6, !dbg !42
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !42
+  %390 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !42
+  %391 = load float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !42
+  %392 = tail call float @llvm.nvvm.div.full(float %391, float 4.096000e+03), !dbg !55
+  %393 = fadd float %392, 0x3EB0C6F7A0000000, !dbg !56
+  %394 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !57
+  %395 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !57
+  %396 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !57
+  %397 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !57
+  %.not.i15 = icmp eq i32 %397, 0, !dbg !57
+  br i1 %.not.i15, label %400, label %398, !dbg !57
+
+398:                                              ; preds = %__nv_rsqrtf.exit
+  %399 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %393), !dbg !57
+  br label %__nv_rsqrtf.exit17, !dbg !57
+
+400:                                              ; preds = %__nv_rsqrtf.exit
+  %401 = tail call float @llvm.nvvm.rsqrt.approx.f(float %393), !dbg !57
+  br label %__nv_rsqrtf.exit17, !dbg !57
+
+__nv_rsqrtf.exit17:                               ; preds = %398, %400
+  %.0.i16 = phi float [ %399, %398 ], [ %401, %400 ], !dbg !57
+  %402 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !58
+  %403 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %41, i64 %402, i1 %12) #6, !dbg !58
+  %404 = extractvalue { i32, i32 } %403, 0, !dbg !58
+  %405 = bitcast i32 %404 to <2 x bfloat>, !dbg !58
+  %406 = extractvalue { i32, i32 } %403, 1, !dbg !58
+  %407 = bitcast i32 %406 to <2 x bfloat>, !dbg !58
+  %408 = getelementptr bfloat, ptr addrspace(1) %3, i64 %17, !dbg !59
+  %409 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !60
+  %410 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %408, i64 %409, i1 true) #6, !dbg !60
+  %411 = extractvalue { i32, i32 } %410, 0, !dbg !60
+  %412 = bitcast i32 %411 to <2 x bfloat>, !dbg !60
+  %413 = extractvalue { i32, i32 } %410, 1, !dbg !60
+  %414 = bitcast i32 %413 to <2 x bfloat>, !dbg !60
+  %415 = getelementptr bfloat, ptr addrspace(1) %4, i64 %17, !dbg !61
+  %416 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !62
+  %417 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %415, i64 %416, i1 true) #6, !dbg !62
+  %418 = extractvalue { i32, i32 } %417, 0, !dbg !62
+  %419 = bitcast i32 %418 to <2 x bfloat>, !dbg !62
+  %420 = extractvalue { i32, i32 } %417, 1, !dbg !62
+  %421 = bitcast i32 %420 to <2 x bfloat>, !dbg !62
+  %422 = getelementptr bfloat, ptr addrspace(1) %6, i64 %19, !dbg !63
+  %423 = fpext <2 x bfloat> %405 to <2 x float>, !dbg !64
+  %424 = fpext <2 x bfloat> %412 to <2 x float>, !dbg !65
+  %425 = fpext <2 x bfloat> %419 to <2 x float>, !dbg !66
+  %426 = insertelement <2 x float> poison, float %390, i64 0, !dbg !67
+  %427 = shufflevector <2 x float> %426, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !67
+  %428 = fsub <2 x float> %423, %427, !dbg !67
+  %429 = insertelement <2 x float> poison, float %.0.i16, i64 0, !dbg !68
+  %430 = shufflevector <2 x float> %429, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !68
+  %431 = fmul <2 x float> %430, %428, !dbg !68
+  %432 = fadd <2 x float> %424, splat (float 1.000000e+00), !dbg !69
+  %433 = fmul <2 x float> %431, %432, !dbg !70
+  %434 = fadd <2 x float> %433, %425, !dbg !71
+  %435 = fptrunc <2 x float> %434 to <2 x bfloat>, !dbg !72
+  %436 = fpext <2 x bfloat> %407 to <2 x float>, !dbg !64
+  %437 = fpext <2 x bfloat> %414 to <2 x float>, !dbg !65
+  %438 = fpext <2 x bfloat> %421 to <2 x float>, !dbg !66
+  %439 = fsub <2 x float> %436, %427, !dbg !67
+  %440 = fmul <2 x float> %430, %439, !dbg !68
+  %441 = fadd <2 x float> %437, splat (float 1.000000e+00), !dbg !69
+  %442 = fmul <2 x float> %440, %441, !dbg !70
+  %443 = fadd <2 x float> %442, %438, !dbg !71
+  %444 = fptrunc <2 x float> %443 to <2 x bfloat>, !dbg !72
+  %445 = bitcast <2 x bfloat> %435 to i32, !dbg !72
+  %446 = bitcast <2 x bfloat> %444 to i32, !dbg !72
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %445, i32 %446, ptr addrspace(1) %422, i1 %12) #6, !dbg !72
+  %447 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !58
+  %448 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %92, i64 %447, i1 %12) #6, !dbg !58
+  %449 = extractvalue { i32, i32 } %448, 0, !dbg !58
+  %450 = bitcast i32 %449 to <2 x bfloat>, !dbg !58
+  %451 = extractvalue { i32, i32 } %448, 1, !dbg !58
+  %452 = bitcast i32 %451 to <2 x bfloat>, !dbg !58
+  %453 = getelementptr bfloat, ptr addrspace(1) %3, i64 %64, !dbg !59
+  %454 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !60
+  %455 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %453, i64 %454, i1 true) #6, !dbg !60
+  %456 = extractvalue { i32, i32 } %455, 0, !dbg !60
+  %457 = bitcast i32 %456 to <2 x bfloat>, !dbg !60
+  %458 = extractvalue { i32, i32 } %455, 1, !dbg !60
+  %459 = bitcast i32 %458 to <2 x bfloat>, !dbg !60
+  %460 = getelementptr bfloat, ptr addrspace(1) %4, i64 %64, !dbg !61
+  %461 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !62
+  %462 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %460, i64 %461, i1 true) #6, !dbg !62
+  %463 = extractvalue { i32, i32 } %462, 0, !dbg !62
+  %464 = bitcast i32 %463 to <2 x bfloat>, !dbg !62
+  %465 = extractvalue { i32, i32 } %462, 1, !dbg !62
+  %466 = bitcast i32 %465 to <2 x bfloat>, !dbg !62
+  %467 = getelementptr bfloat, ptr addrspace(1) %6, i64 %65, !dbg !63
+  %468 = fpext <2 x bfloat> %450 to <2 x float>, !dbg !64
+  %469 = fpext <2 x bfloat> %457 to <2 x float>, !dbg !65
+  %470 = fpext <2 x bfloat> %464 to <2 x float>, !dbg !66
+  %471 = fsub <2 x float> %468, %427, !dbg !67
+  %472 = fmul <2 x float> %430, %471, !dbg !68
+  %473 = fadd <2 x float> %469, splat (float 1.000000e+00), !dbg !69
+  %474 = fmul <2 x float> %472, %473, !dbg !70
+  %475 = fadd <2 x float> %474, %470, !dbg !71
+  %476 = fptrunc <2 x float> %475 to <2 x bfloat>, !dbg !72
+  %477 = fpext <2 x bfloat> %452 to <2 x float>, !dbg !64
+  %478 = fpext <2 x bfloat> %459 to <2 x float>, !dbg !65
+  %479 = fpext <2 x bfloat> %466 to <2 x float>, !dbg !66
+  %480 = fsub <2 x float> %477, %427, !dbg !67
+  %481 = fmul <2 x float> %430, %480, !dbg !68
+  %482 = fadd <2 x float> %478, splat (float 1.000000e+00), !dbg !69
+  %483 = fmul <2 x float> %481, %482, !dbg !70
+  %484 = fadd <2 x float> %483, %479, !dbg !71
+  %485 = fptrunc <2 x float> %484 to <2 x bfloat>, !dbg !72
+  %486 = bitcast <2 x bfloat> %476 to i32, !dbg !72
+  %487 = bitcast <2 x bfloat> %485 to i32, !dbg !72
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %486, i32 %487, ptr addrspace(1) %467, i1 %12) #6, !dbg !72
+  ret void, !dbg !73
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #2
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #2
+
+attributes #0 = { nounwind "nvvm.reqntid"="512" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #4 = { convergent nocallback nounwind }
+attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!5 = distinct !DISubprogram(name: "triton_red_fused_add_mul_native_layer_norm_0", linkageName: "triton_red_fused_add_mul_native_layer_norm_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 23, column: 28, scope: !5)
+!9 = !DILocation(line: 25, column: 21, scope: !5)
+!10 = !DILocation(line: 26, column: 37, scope: !5)
+!11 = !DILocation(line: 38, column: 46, scope: !5)
+!12 = !DILocation(line: 32, column: 43, scope: !5)
+!13 = !DILocation(line: 38, column: 41, scope: !5)
+!14 = !DILocation(line: 38, column: 34, scope: !5)
+!15 = !DILocation(line: 38, column: 51, scope: !5)
+!16 = !DILocation(line: 39, column: 34, scope: !5)
+!17 = !DILocation(line: 39, column: 41, scope: !5)
+!18 = !DILocation(line: 40, column: 34, scope: !5)
+!19 = !DILocation(line: 40, column: 51, scope: !5)
+!20 = !DILocation(line: 51, column: 29, scope: !5)
+!21 = !DILocation(line: 39, column: 94, scope: !5)
+!22 = !DILocation(line: 40, column: 113, scope: !5)
+!23 = !DILocation(line: 41, column: 22, scope: !5)
+!24 = !DILocation(line: 38, column: 113, scope: !5)
+!25 = !DILocation(line: 42, column: 22, scope: !5)
+!26 = !DILocation(line: 48, column: 62, scope: !5)
+!27 = !DILocation(line: 51, column: 52, scope: !5)
+!28 = !DILocation(line: 33, column: 31, scope: !5)
+!29 = !DILocation(line: 50, column: 66, scope: !5)
+!30 = !DILocation(line: 225, column: 39, scope: !31, inlinedAt: !33)
+!31 = distinct !DILexicalBlockFile(scope: !5, file: !32, discriminator: 0)
+!32 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime")
+!33 = !DILocation(line: 46, column: 51, scope: !34)
+!34 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0)
+!35 = !DILocation(line: 222, column: 24, scope: !31, inlinedAt: !33)
+!36 = !DILocation(line: 224, column: 34, scope: !31, inlinedAt: !33)
+!37 = !DILocation(line: 224, column: 26, scope: !31, inlinedAt: !33)
+!38 = !DILocation(line: 225, column: 31, scope: !31, inlinedAt: !33)
+!39 = !DILocation(line: 225, column: 22, scope: !31, inlinedAt: !33)
+!40 = !DILocation(line: 49, column: 58, scope: !5)
+!41 = !DILocation(line: 231, column: 21, scope: !31, inlinedAt: !42)
+!42 = !DILocation(line: 243, column: 46, scope: !31, inlinedAt: !43)
+!43 = !DILocation(line: 52, column: 80, scope: !34)
+!44 = !DILocation(line: 232, column: 28, scope: !31, inlinedAt: !42)
+!45 = !DILocation(line: 233, column: 39, scope: !31, inlinedAt: !42)
+!46 = !DILocation(line: 233, column: 60, scope: !31, inlinedAt: !42)
+!47 = !DILocation(line: 233, column: 49, scope: !31, inlinedAt: !42)
+!48 = !DILocation(line: 235, column: 25, scope: !31, inlinedAt: !42)
+!49 = !DILocation(line: 235, column: 17, scope: !31, inlinedAt: !42)
+!50 = !DILocation(line: 236, column: 15, scope: !31, inlinedAt: !42)
+!51 = !DILocation(line: 236, column: 30, scope: !31, inlinedAt: !42)
+!52 = !DILocation(line: 236, column: 38, scope: !31, inlinedAt: !42)
+!53 = !DILocation(line: 236, column: 49, scope: !31, inlinedAt: !42)
+!54 = !DILocation(line: 236, column: 22, scope: !31, inlinedAt: !42)
+!55 = !DILocation(line: 68, column: 25, scope: !5)
+!56 = !DILocation(line: 70, column: 24, scope: !5)
+!57 = !DILocation(line: 71, column: 32, scope: !5)
+!58 = !DILocation(line: 62, column: 53, scope: !5)
+!59 = !DILocation(line: 63, column: 35, scope: !5)
+!60 = !DILocation(line: 63, column: 42, scope: !5)
+!61 = !DILocation(line: 64, column: 35, scope: !5)
+!62 = !DILocation(line: 64, column: 42, scope: !5)
+!63 = !DILocation(line: 78, column: 29, scope: !5)
+!64 = !DILocation(line: 62, column: 115, scope: !5)
+!65 = !DILocation(line: 63, column: 95, scope: !5)
+!66 = !DILocation(line: 64, column: 95, scope: !5)
+!67 = !DILocation(line: 66, column: 24, scope: !5)
+!68 = !DILocation(line: 72, column: 24, scope: !5)
+!69 = !DILocation(line: 75, column: 24, scope: !5)
+!70 = !DILocation(line: 76, column: 24, scope: !5)
+!71 = !DILocation(line: 77, column: 24, scope: !5)
+!72 = !DILocation(line: 78, column: 53, scope: !5)
+!73 = !DILocation(line: 56, column: 4, scope: !5)
diff --git a/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.ptx b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..cde18d5d05d4a49655a043c6c8748dac11e2c17a
--- /dev/null
+++ b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.ptx
@@ -0,0 +1,1191 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused_add_mul_native_layer_norm_0 // -- Begin function triton_red_fused_add_mul_native_layer_norm_0
+.extern .shared .align 16 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
+                                        // @triton_red_fused_add_mul_native_layer_norm_0
+.visible .entry triton_red_fused_add_mul_native_layer_norm_0(
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_1,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_4,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_5,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_6,
+	.param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_7,
+	.param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_8,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_9,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_10
+)
+.reqntid 512
+{
+	.reg .pred 	%p<19>;
+	.reg .b16 	%rs<49>;
+	.reg .b32 	%r<317>;
+	.reg .b64 	%rd<39>;
+	.loc	1 18 0                          // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:18:0
+
+// %bb.0:                               // %__nv_rsqrtf.exit
+	ld.param.b64 	%rd27, [triton_red_fused_add_mul_native_layer_norm_0_param_0];
+	ld.param.b64 	%rd28, [triton_red_fused_add_mul_native_layer_norm_0_param_1];
+$L__tmp0:
+	.loc	1 23 28                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:23:28
+	mov.u32 	%r49, %ctaid.x;
+	.loc	1 25 21                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:25:21
+	setp.lt.u32 	%p1, %r49, 2048;
+	ld.param.b64 	%rd29, [triton_red_fused_add_mul_native_layer_norm_0_param_2];
+	ld.param.b64 	%rd30, [triton_red_fused_add_mul_native_layer_norm_0_param_3];
+	.loc	1 26 37                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:26:37
+	mov.u32 	%r50, %tid.x;
+	shl.b32 	%r51, %r50, 2;
+	ld.param.b64 	%rd31, [triton_red_fused_add_mul_native_layer_norm_0_param_4];
+	and.b32 	%r52, %r51, 2044;
+	ld.param.b64 	%rd32, [triton_red_fused_add_mul_native_layer_norm_0_param_5];
+	.loc	1 38 46                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:46
+	shl.b32 	%r53, %r49, 12;
+	ld.param.b64 	%rd33, [triton_red_fused_add_mul_native_layer_norm_0_param_6];
+	.loc	1 32 43                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:32:43
+	cvt.u64.u32 	%rd34, %r52;
+	cvt.s64.s32 	%rd35, %r53;
+	.loc	1 38 41                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:41
+	or.b64 	%rd36, %rd34, %rd35;
+	.loc	1 38 34                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:34
+	shl.b64 	%rd37, %rd36, 1;
+	add.s64 	%rd1, %rd27, %rd37;
+	.loc	1 38 51                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:51
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r3, 0;
+	// begin inline asm
+	mov.u32 %r1, %r3;
+	mov.u32 %r2, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	.loc	1 39 34                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:34
+	mul.wide.u32 	%rd38, %r52, 2;
+	add.s64 	%rd3, %rd28, %rd38;
+	.loc	1 39 41                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:41
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0;
+	// end inline asm
+	mov.pred 	%p2, -1;
+	// begin inline asm
+	mov.u32 %r4, %r3;
+	mov.u32 %r5, %r3;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r4, %r5 }, [ %rd3 + 0 ], %rd4;
+	// end inline asm
+	.loc	1 40 34                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:34
+	add.s64 	%rd5, %rd29, %rd37;
+	.loc	1 40 51                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:51
+	// begin inline asm
+	mov.u64 %rd6, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd6, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r6, %r3;
+	mov.u32 %r7, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r6, %r7 }, [ %rd5 + 0 ], %rd6;
+	// end inline asm
+	.loc	1 51 29                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:51:29
+	add.s64 	%rd7, %rd32, %rd37;
+	.loc	1 39 94                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:94
+	mov.b32 	{%rs1, %rs2}, %r4;
+	cvt.f32.bf16 	%r54, %rs1;
+	cvt.f32.bf16 	%r55, %rs2;
+	.loc	1 40 113                        // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:113
+	mov.b32 	{%rs3, %rs4}, %r6;
+	cvt.f32.bf16 	%r56, %rs3;
+	cvt.f32.bf16 	%r57, %rs4;
+	.loc	1 38 113                        // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:113
+	mov.b32 	{%rs5, %rs6}, %r1;
+	cvt.f32.bf16 	%r58, %rs5;
+	cvt.f32.bf16 	%r59, %rs6;
+	.loc	1 42 22                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:42:22
+	fma.rn.f32 	%r60, %r55, %r57, %r59;
+	fma.rn.f32 	%r61, %r54, %r56, %r58;
+	.loc	1 48 62                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:48:62
+	selp.f32 	%r62, %r61, 0f00000000, %p1;
+	selp.f32 	%r63, %r60, 0f00000000, %p1;
+	.loc	1 51 52                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:51:52
+	cvt.rn.bf16x2.f32 	%r8, %r60, %r61;
+	.loc	1 39 94                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:94
+	mov.b32 	{%rs7, %rs8}, %r5;
+	cvt.f32.bf16 	%r64, %rs7;
+	cvt.f32.bf16 	%r65, %rs8;
+	.loc	1 40 113                        // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:113
+	mov.b32 	{%rs9, %rs10}, %r7;
+	cvt.f32.bf16 	%r66, %rs9;
+	cvt.f32.bf16 	%r67, %rs10;
+	.loc	1 38 113                        // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:113
+	mov.b32 	{%rs11, %rs12}, %r2;
+	cvt.f32.bf16 	%r68, %rs11;
+	cvt.f32.bf16 	%r69, %rs12;
+	.loc	1 42 22                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:42:22
+	fma.rn.f32 	%r70, %r65, %r67, %r69;
+	fma.rn.f32 	%r71, %r64, %r66, %r68;
+	.loc	1 48 62                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:48:62
+	selp.f32 	%r72, %r71, 0f00000000, %p1;
+	selp.f32 	%r73, %r70, 0f00000000, %p1;
+	.loc	1 51 52                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:51:52
+	cvt.rn.bf16x2.f32 	%r9, %r70, %r71;
+	// begin inline asm
+	@%p1 st.global.v2.b32 [ %rd7 + 0 ], { %r8, %r9 };
+	// end inline asm
+	.loc	1 38 34                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:34
+	add.s64 	%rd8, %rd1, 4096;
+	.loc	1 38 51                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:51
+	// begin inline asm
+	mov.u64 %rd9, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd9, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r10, %r3;
+	mov.u32 %r11, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r10, %r11 }, [ %rd8 + 0 ], %rd9;
+	// end inline asm
+	.loc	1 39 34                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:34
+	add.s64 	%rd10, %rd3, 4096;
+	.loc	1 39 41                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:41
+	// begin inline asm
+	mov.u64 %rd11, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd11, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r12, %r3;
+	mov.u32 %r13, %r3;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r12, %r13 }, [ %rd10 + 0 ], %rd11;
+	// end inline asm
+	.loc	1 40 34                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:34
+	add.s64 	%rd12, %rd5, 4096;
+	.loc	1 40 51                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:51
+	// begin inline asm
+	mov.u64 %rd13, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd13, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r14, %r3;
+	mov.u32 %r15, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r14, %r15 }, [ %rd12 + 0 ], %rd13;
+	// end inline asm
+	.loc	1 50 66                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:50:66
+	selp.f32 	%r74, 0f40000000, 0f3F800000, %p1;
+	selp.f32 	%r75, 0f40000000, 0f00000000, %p1;
+	.loc	1 51 29                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:51:29
+	add.s64 	%rd14, %rd7, 4096;
+	.loc	1 38 113                        // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:113
+	mov.b32 	{%rs13, %rs14}, %r10;
+	cvt.f32.bf16 	%r76, %rs13;
+	cvt.f32.bf16 	%r77, %rs14;
+	.loc	1 39 94                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:94
+	mov.b32 	{%rs15, %rs16}, %r12;
+	cvt.f32.bf16 	%r78, %rs15;
+	cvt.f32.bf16 	%r79, %rs16;
+	.loc	1 40 113                        // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:113
+	mov.b32 	{%rs17, %rs18}, %r14;
+	cvt.f32.bf16 	%r80, %rs17;
+	cvt.f32.bf16 	%r81, %rs18;
+	.loc	1 42 22                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:42:22
+	fma.rn.f32 	%r82, %r79, %r81, %r77;
+	fma.rn.f32 	%r83, %r78, %r80, %r76;
+$L__tmp1:
+	.loc	2 222 24                        // triton_helpers.py:222:24 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ]
+	sub.f32 	%r84, %r83, %r62;
+	.loc	2 224 34                        // triton_helpers.py:224:34 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ]
+	div.full.f32 	%r85, %r84, %r74;
+	.loc	2 224 26                        // triton_helpers.py:224:26 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ]
+	add.f32 	%r86, %r62, %r85;
+	.loc	2 225 39                        // triton_helpers.py:225:39 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ]
+	sub.f32 	%r87, %r83, %r86;
+	.loc	2 225 22                        // triton_helpers.py:225:22 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ]
+	fma.rn.f32 	%r88, %r84, %r87, 0f00000000;
+	.loc	2 222 24                        // triton_helpers.py:222:24 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ]
+	sub.f32 	%r89, %r82, %r63;
+	.loc	2 224 34                        // triton_helpers.py:224:34 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ]
+	div.full.f32 	%r90, %r89, %r74;
+	.loc	2 224 26                        // triton_helpers.py:224:26 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ]
+	add.f32 	%r91, %r63, %r90;
+	.loc	2 225 39                        // triton_helpers.py:225:39 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ]
+	sub.f32 	%r92, %r82, %r91;
+	.loc	2 225 22                        // triton_helpers.py:225:22 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ]
+	fma.rn.f32 	%r93, %r89, %r92, 0f00000000;
+$L__tmp2:
+	.loc	1 48 62                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:48:62
+	selp.f32 	%r94, %r86, 0f00000000, %p1;
+	selp.f32 	%r95, %r91, 0f00000000, %p1;
+	.loc	1 51 52                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:51:52
+	cvt.rn.bf16x2.f32 	%r16, %r82, %r83;
+	.loc	1 38 113                        // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:113
+	mov.b32 	{%rs19, %rs20}, %r11;
+	cvt.f32.bf16 	%r96, %rs19;
+	cvt.f32.bf16 	%r97, %rs20;
+	.loc	1 39 94                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:94
+	mov.b32 	{%rs21, %rs22}, %r13;
+	cvt.f32.bf16 	%r98, %rs21;
+	cvt.f32.bf16 	%r99, %rs22;
+	.loc	1 40 113                        // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:113
+	mov.b32 	{%rs23, %rs24}, %r15;
+	cvt.f32.bf16 	%r100, %rs23;
+	cvt.f32.bf16 	%r101, %rs24;
+	.loc	1 42 22                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:42:22
+	fma.rn.f32 	%r102, %r99, %r101, %r97;
+	fma.rn.f32 	%r103, %r98, %r100, %r96;
+$L__tmp3:
+	.loc	2 222 24                        // triton_helpers.py:222:24 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ]
+	sub.f32 	%r104, %r103, %r72;
+	.loc	2 224 34                        // triton_helpers.py:224:34 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ]
+	div.full.f32 	%r105, %r104, %r74;
+	.loc	2 224 26                        // triton_helpers.py:224:26 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ]
+	add.f32 	%r106, %r72, %r105;
+	.loc	2 225 39                        // triton_helpers.py:225:39 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ]
+	sub.f32 	%r107, %r103, %r106;
+	.loc	2 225 22                        // triton_helpers.py:225:22 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ]
+	fma.rn.f32 	%r108, %r104, %r107, 0f00000000;
+	.loc	2 222 24                        // triton_helpers.py:222:24 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ]
+	sub.f32 	%r109, %r102, %r73;
+	.loc	2 224 34                        // triton_helpers.py:224:34 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ]
+	div.full.f32 	%r110, %r109, %r74;
+	.loc	2 224 26                        // triton_helpers.py:224:26 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ]
+	add.f32 	%r111, %r73, %r110;
+	.loc	2 225 39                        // triton_helpers.py:225:39 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ]
+	sub.f32 	%r112, %r102, %r111;
+	.loc	2 225 22                        // triton_helpers.py:225:22 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:46:51 ]
+	fma.rn.f32 	%r113, %r109, %r112, 0f00000000;
+$L__tmp4:
+	.loc	1 48 62                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:48:62
+	selp.f32 	%r114, %r106, 0f00000000, %p1;
+	selp.f32 	%r115, %r111, 0f00000000, %p1;
+	.loc	1 49 58                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:49:58
+	selp.f32 	%r116, %r108, 0f00000000, %p1;
+	selp.f32 	%r117, %r113, 0f00000000, %p1;
+	.loc	1 51 52                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:51:52
+	cvt.rn.bf16x2.f32 	%r17, %r102, %r103;
+	// begin inline asm
+	@%p1 st.global.v2.b32 [ %rd14 + 0 ], { %r16, %r17 };
+	// end inline asm
+	.loc	1 26 37                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:26:37
+	and.b32 	%r118, %r50, 511;
+	and.b32 	%r119, %r50, 31;
+$L__tmp5:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r120, %r95, %r94;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r121, 0f40800000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p6, %r121, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r122, %r75, %r121;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r123, 0f00000000, %r122, %p6;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r124, %r120, %r123, %r94;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r125, %r88, %r93;
+	selp.f32 	%r126, %r125, 0f00000000, %p1;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r127, %r120, %r120;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r128, %r127, %r75;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r129, %r128, %r123, %r126;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r130, %r114, %r124;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r131, 0f40C00000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p7, %r131, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r132, %r75, %r131;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r133, 0f00000000, %r132, %p7;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r134, %r133, %r130, %r124;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r135, %r116, %r129;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r136, %r130, %r130;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r137, %r121, %r136;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r138, %r133, %r137, %r135;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r139, %r115, %r134;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r140, 0f41000000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p8, %r140, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r141, %r75, %r140;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r142, 0f00000000, %r141, %p8;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r143, %r142, %r139, %r134;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r144, %r117, %r138;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r145, %r139, %r139;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r146, %r131, %r145;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r147, %r142, %r146, %r144;
+$L__tmp6:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ]
+	shfl.sync.bfly.b32 	%r148, %r143, 16, 31, -1;
+	shfl.sync.bfly.b32 	%r149, %r147, 16, 31, -1;
+	shfl.sync.bfly.b32 	%r150, %r140, 16, 31, -1;
+$L__tmp7:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r151, %r148, %r143;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r152, %r140, %r150;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p9, %r152, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r153, %r150, %r152;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r154, 0f00000000, %r153, %p9;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r155, %r154, %r151, %r143;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r156, %r147, %r149;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r157, %r151, %r151;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r158, %r140, %r157;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r159, %r154, %r158, %r156;
+$L__tmp8:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ]
+	shfl.sync.bfly.b32 	%r160, %r155, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r161, %r159, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r162, %r152, 8, 31, -1;
+$L__tmp9:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r163, %r160, %r155;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r164, %r152, %r162;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p10, %r164, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r165, %r162, %r164;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r166, 0f00000000, %r165, %p10;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r167, %r163, %r166, %r155;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r168, %r159, %r161;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r169, %r163, %r163;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r170, %r152, %r169;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r171, %r166, %r170, %r168;
+$L__tmp10:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ]
+	shfl.sync.bfly.b32 	%r172, %r167, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r173, %r171, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r174, %r164, 4, 31, -1;
+$L__tmp11:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r175, %r172, %r167;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r176, %r164, %r174;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p11, %r176, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r177, %r174, %r176;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r178, 0f00000000, %r177, %p11;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r179, %r175, %r178, %r167;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r180, %r171, %r173;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r181, %r175, %r175;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r182, %r164, %r181;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r183, %r178, %r182, %r180;
+$L__tmp12:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ]
+	shfl.sync.bfly.b32 	%r184, %r179, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r185, %r183, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r186, %r176, 2, 31, -1;
+$L__tmp13:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r187, %r184, %r179;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r188, %r176, %r186;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p12, %r188, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r189, %r186, %r188;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r190, 0f00000000, %r189, %p12;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r191, %r187, %r190, %r179;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r192, %r183, %r185;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r193, %r187, %r187;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r194, %r176, %r193;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r195, %r190, %r194, %r192;
+$L__tmp14:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ]
+	shfl.sync.bfly.b32 	%r196, %r191, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r197, %r195, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r198, %r188, 1, 31, -1;
+$L__tmp15:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r199, %r196, %r191;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r23, %r188, %r198;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p13, %r23, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r200, %r198, %r23;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r201, 0f00000000, %r200, %p13;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r19, %r199, %r201, %r191;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r202, %r195, %r197;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r203, %r199, %r199;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r204, %r188, %r203;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r21, %r201, %r204, %r202;
+$L__tmp16:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ]
+	setp.eq.b32 	%p3, %r119, 0;
+	shr.u32 	%r205, %r50, 3;
+	and.b32 	%r206, %r205, 60;
+	mov.b32 	%r207, global_smem;
+	add.s32 	%r18, %r207, %r206;
+	// begin inline asm
+	@%p3 st.shared.b32 [ %r18 + 0 ], %r19;
+	// end inline asm
+	add.s32 	%r20, %r18, 64;
+	// begin inline asm
+	@%p3 st.shared.b32 [ %r20 + 0 ], %r21;
+	// end inline asm
+	add.s32 	%r22, %r18, 128;
+	// begin inline asm
+	@%p3 st.shared.b32 [ %r22 + 0 ], %r23;
+	// end inline asm
+	bar.sync 	0;
+	setp.lt.u32 	%p4, %r118, 16;
+	shl.b32 	%r208, %r118, 2;
+	add.s32 	%r25, %r207, %r208;
+	// begin inline asm
+	@%p4 ld.shared.b32 %r24, [ %r25 + 0 ];
+	// end inline asm
+	add.s32 	%r27, %r25, 64;
+	// begin inline asm
+	@%p4 ld.shared.b32 %r26, [ %r27 + 0 ];
+	// end inline asm
+	add.s32 	%r29, %r25, 128;
+	// begin inline asm
+	@%p4 ld.shared.b32 %r28, [ %r29 + 0 ];
+	// end inline asm
+	shfl.sync.bfly.b32 	%r209, %r24, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r210, %r26, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r211, %r28, 8, 31, -1;
+$L__tmp17:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r212, %r209, %r24;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r213, %r28, %r211;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p14, %r213, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r214, %r211, %r213;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r215, 0f00000000, %r214, %p14;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r216, %r212, %r215, %r24;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r217, %r26, %r210;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r218, %r212, %r212;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r219, %r218, %r28;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r220, %r219, %r215, %r217;
+$L__tmp18:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ]
+	shfl.sync.bfly.b32 	%r221, %r216, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r222, %r220, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r223, %r213, 4, 31, -1;
+$L__tmp19:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r224, %r221, %r216;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r225, %r213, %r223;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p15, %r225, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r226, %r223, %r225;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r227, 0f00000000, %r226, %p15;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r228, %r224, %r227, %r216;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r229, %r220, %r222;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r230, %r224, %r224;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r231, %r213, %r230;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r232, %r227, %r231, %r229;
+$L__tmp20:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ]
+	shfl.sync.bfly.b32 	%r233, %r228, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r234, %r232, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r235, %r225, 2, 31, -1;
+$L__tmp21:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r236, %r233, %r228;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r237, %r225, %r235;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p16, %r237, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r238, %r235, %r237;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r239, 0f00000000, %r238, %p16;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r240, %r236, %r239, %r228;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r241, %r232, %r234;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r242, %r236, %r236;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r243, %r225, %r242;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r244, %r239, %r243, %r241;
+$L__tmp22:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ]
+	shfl.sync.bfly.b32 	%r245, %r240, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r246, %r244, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r247, %r237, 1, 31, -1;
+$L__tmp23:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r248, %r245, %r240;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r32, %r237, %r247;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p17, %r32, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r249, %r247, %r32;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r250, 0f00000000, %r249, %p17;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r30, %r248, %r250, %r240;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r251, %r244, %r246;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r252, %r248, %r248;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r253, %r237, %r252;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r31, %r250, %r253, %r251;
+$L__tmp24:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ]
+	and.b32 	%r254, %r50, 15;
+	setp.eq.b32 	%p18, %r254, 0;
+	and.pred 	%p5, %p4, %p18;
+	// begin inline asm
+	@%p5 st.shared.b32 [ %r25 + 0 ], %r30;
+	// end inline asm
+	// begin inline asm
+	@%p5 st.shared.b32 [ %r27 + 0 ], %r31;
+	// end inline asm
+	// begin inline asm
+	@%p5 st.shared.b32 [ %r29 + 0 ], %r32;
+	// end inline asm
+	bar.sync 	0;
+	ld.shared.b32 	%r255, [global_smem];
+	ld.shared.b32 	%r256, [global_smem+64];
+	mov.b32 	%r257, 0f45800000;
+$L__tmp25:
+	.loc	1 68 25                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:68:25
+	div.full.f32 	%r258, %r256, %r257;
+	.loc	1 70 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:70:24
+	add.f32 	%r259, %r258, 0f358637BD;
+	.loc	1 71 32                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:71:32
+	rsqrt.approx.ftz.f32 	%r260, %r259;
+	.loc	1 62 53                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:62:53
+	// begin inline asm
+	mov.u64 %rd15, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd15, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r33, %r3;
+	mov.u32 %r34, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r33, %r34 }, [ %rd7 + 0 ], %rd15;
+	// end inline asm
+	.loc	1 63 35                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:35
+	add.s64 	%rd16, %rd30, %rd38;
+	.loc	1 63 42                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:42
+	// begin inline asm
+	mov.u64 %rd17, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r35, %r3;
+	mov.u32 %r36, %r3;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r35, %r36 }, [ %rd16 + 0 ], %rd17;
+	// end inline asm
+	.loc	1 64 35                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:35
+	add.s64 	%rd18, %rd31, %rd38;
+	.loc	1 64 42                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:42
+	// begin inline asm
+	mov.u64 %rd19, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd19, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r37, %r3;
+	mov.u32 %r38, %r3;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r37, %r38 }, [ %rd18 + 0 ], %rd19;
+	// end inline asm
+	.loc	1 78 29                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:78:29
+	add.s64 	%rd20, %rd33, %rd37;
+	.loc	1 62 115                        // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:62:115
+	mov.b32 	{%rs25, %rs26}, %r33;
+	cvt.f32.bf16 	%r261, %rs26;
+	cvt.f32.bf16 	%r262, %rs25;
+	.loc	1 63 95                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:95
+	mov.b32 	{%rs27, %rs28}, %r35;
+	cvt.f32.bf16 	%r263, %rs27;
+	cvt.f32.bf16 	%r264, %rs28;
+	.loc	1 64 95                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:95
+	mov.b32 	{%rs29, %rs30}, %r37;
+	cvt.f32.bf16 	%r265, %rs30;
+	cvt.f32.bf16 	%r266, %rs29;
+	.loc	1 66 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:66:24
+	sub.f32 	%r267, %r262, %r255;
+	sub.f32 	%r268, %r261, %r255;
+	.loc	1 72 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:72:24
+	mul.f32 	%r269, %r260, %r268;
+	mul.f32 	%r270, %r260, %r267;
+	.loc	1 75 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:75:24
+	add.f32 	%r271, %r264, 0f3F800000;
+	add.f32 	%r272, %r263, 0f3F800000;
+	.loc	1 77 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:77:24
+	fma.rn.f32 	%r273, %r270, %r272, %r266;
+	fma.rn.f32 	%r274, %r269, %r271, %r265;
+	.loc	1 78 53                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:78:53
+	cvt.rn.bf16x2.f32 	%r39, %r274, %r273;
+	.loc	1 62 115                        // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:62:115
+	mov.b32 	{%rs31, %rs32}, %r34;
+	cvt.f32.bf16 	%r275, %rs32;
+	cvt.f32.bf16 	%r276, %rs31;
+	.loc	1 63 95                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:95
+	mov.b32 	{%rs33, %rs34}, %r36;
+	cvt.f32.bf16 	%r277, %rs33;
+	cvt.f32.bf16 	%r278, %rs34;
+	.loc	1 64 95                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:95
+	mov.b32 	{%rs35, %rs36}, %r38;
+	cvt.f32.bf16 	%r279, %rs36;
+	cvt.f32.bf16 	%r280, %rs35;
+	.loc	1 66 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:66:24
+	sub.f32 	%r281, %r276, %r255;
+	sub.f32 	%r282, %r275, %r255;
+	.loc	1 72 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:72:24
+	mul.f32 	%r283, %r260, %r282;
+	mul.f32 	%r284, %r260, %r281;
+	.loc	1 75 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:75:24
+	add.f32 	%r285, %r278, 0f3F800000;
+	add.f32 	%r286, %r277, 0f3F800000;
+	.loc	1 77 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:77:24
+	fma.rn.f32 	%r287, %r284, %r286, %r280;
+	fma.rn.f32 	%r288, %r283, %r285, %r279;
+	.loc	1 78 53                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:78:53
+	cvt.rn.bf16x2.f32 	%r40, %r288, %r287;
+	// begin inline asm
+	@%p1 st.global.v2.b32 [ %rd20 + 0 ], { %r39, %r40 };
+	// end inline asm
+	.loc	1 62 53                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:62:53
+	// begin inline asm
+	mov.u64 %rd21, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd21, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r41, %r3;
+	mov.u32 %r42, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r41, %r42 }, [ %rd14 + 0 ], %rd21;
+	// end inline asm
+	.loc	1 63 35                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:35
+	add.s64 	%rd22, %rd16, 4096;
+	.loc	1 63 42                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:42
+	// begin inline asm
+	mov.u64 %rd23, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd23, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r43, %r3;
+	mov.u32 %r44, %r3;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r43, %r44 }, [ %rd22 + 0 ], %rd23;
+	// end inline asm
+	.loc	1 64 35                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:35
+	add.s64 	%rd24, %rd18, 4096;
+	.loc	1 64 42                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:42
+	// begin inline asm
+	mov.u64 %rd25, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd25, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r45, %r3;
+	mov.u32 %r46, %r3;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r45, %r46 }, [ %rd24 + 0 ], %rd25;
+	// end inline asm
+	.loc	1 78 29                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:78:29
+	add.s64 	%rd26, %rd20, 4096;
+	.loc	1 62 115                        // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:62:115
+	mov.b32 	{%rs37, %rs38}, %r41;
+	cvt.f32.bf16 	%r289, %rs38;
+	cvt.f32.bf16 	%r290, %rs37;
+	.loc	1 63 95                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:95
+	mov.b32 	{%rs39, %rs40}, %r43;
+	cvt.f32.bf16 	%r291, %rs39;
+	cvt.f32.bf16 	%r292, %rs40;
+	.loc	1 64 95                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:95
+	mov.b32 	{%rs41, %rs42}, %r45;
+	cvt.f32.bf16 	%r293, %rs42;
+	cvt.f32.bf16 	%r294, %rs41;
+	.loc	1 66 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:66:24
+	sub.f32 	%r295, %r290, %r255;
+	sub.f32 	%r296, %r289, %r255;
+	.loc	1 72 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:72:24
+	mul.f32 	%r297, %r260, %r296;
+	mul.f32 	%r298, %r260, %r295;
+	.loc	1 75 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:75:24
+	add.f32 	%r299, %r292, 0f3F800000;
+	add.f32 	%r300, %r291, 0f3F800000;
+	.loc	1 77 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:77:24
+	fma.rn.f32 	%r301, %r298, %r300, %r294;
+	fma.rn.f32 	%r302, %r297, %r299, %r293;
+	.loc	1 78 53                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:78:53
+	cvt.rn.bf16x2.f32 	%r47, %r302, %r301;
+	.loc	1 62 115                        // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:62:115
+	mov.b32 	{%rs43, %rs44}, %r42;
+	cvt.f32.bf16 	%r303, %rs44;
+	cvt.f32.bf16 	%r304, %rs43;
+	.loc	1 63 95                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:95
+	mov.b32 	{%rs45, %rs46}, %r44;
+	cvt.f32.bf16 	%r305, %rs45;
+	cvt.f32.bf16 	%r306, %rs46;
+	.loc	1 64 95                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:95
+	mov.b32 	{%rs47, %rs48}, %r46;
+	cvt.f32.bf16 	%r307, %rs48;
+	cvt.f32.bf16 	%r308, %rs47;
+	.loc	1 66 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:66:24
+	sub.f32 	%r309, %r304, %r255;
+	sub.f32 	%r310, %r303, %r255;
+	.loc	1 72 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:72:24
+	mul.f32 	%r311, %r260, %r310;
+	mul.f32 	%r312, %r260, %r309;
+	.loc	1 75 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:75:24
+	add.f32 	%r313, %r306, 0f3F800000;
+	add.f32 	%r314, %r305, 0f3F800000;
+	.loc	1 77 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:77:24
+	fma.rn.f32 	%r315, %r312, %r314, %r308;
+	fma.rn.f32 	%r316, %r311, %r313, %r307;
+	.loc	1 78 53                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:78:53
+	cvt.rn.bf16x2.f32 	%r48, %r316, %r315;
+	// begin inline asm
+	@%p1 st.global.v2.b32 [ %rd26 + 0 ], { %r47, %r48 };
+	// end inline asm
+	.loc	1 56 4                          // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:56:4
+	ret;
+$L__tmp26:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 367                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x168 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 119
+.b8 51
+.b8 106
+.b8 98
+.b8 105
+.b8 121
+.b8 53
+.b8 122
+.b8 114
+.b8 107
+.b8 121
+.b8 109
+.b8 55
+.b8 118
+.b8 107
+.b8 110
+.b8 110
+.b8 51
+.b8 122
+.b8 105
+.b8 117
+.b8 107
+.b8 51
+.b8 113
+.b8 105
+.b8 109
+.b8 108
+.b8 98
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 50
+.b8 98
+.b8 98
+.b8 122
+.b8 51
+.b8 115
+.b8 117
+.b8 102
+.b8 54
+.b8 113
+.b8 120
+.b8 105
+.b8 106
+.b8 110
+.b8 98
+.b8 102
+.b8 99
+.b8 51
+.b8 121
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 119
+.b8 51
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x2f DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 97
+.b8 100
+.b8 100
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 110
+.b8 97
+.b8 116
+.b8 105
+.b8 118
+.b8 101
+.b8 95
+.b8 108
+.b8 97
+.b8 121
+.b8 101
+.b8 114
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x113:0x5f DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x128:0x18 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp4                           // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 46                                  // DW_AT_call_line
+.b8 51                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x140:0x31 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp5                           // DW_AT_low_pc
+.b64 $L__tmp25                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 52                                  // DW_AT_call_line
+.b8 80                                  // DW_AT_call_column
+.b8 4                                   // Abbrev [4] 0x158:0x18 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp5                           // DW_AT_low_pc
+.b64 $L__tmp24                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 243                                 // DW_AT_call_line
+.b8 46                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.source b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..d7a9ab9a44a28eeacd17f6ff4828312abc8408ac
--- /dev/null
+++ b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.source
@@ -0,0 +1,486 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":18:0)
+#loc88 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":216:0)
+#loc101 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":133:0)
+#loc105 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":242:0)
+#loc107 = loc(unknown)
+#loc110 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":230:0)
+#loc125 = loc("in_ptr0"(#loc))
+#loc126 = loc("in_ptr1"(#loc))
+#loc127 = loc("in_ptr2"(#loc))
+#loc128 = loc("in_ptr3"(#loc))
+#loc129 = loc("in_ptr4"(#loc))
+#loc130 = loc("out_ptr0"(#loc))
+#loc131 = loc("out_ptr3"(#loc))
+#loc132 = loc("xnumel"(#loc))
+#loc133 = loc("r0_numel"(#loc))
+#loc201 = loc("value"(#loc88))
+#loc202 = loc("mean"(#loc88))
+#loc203 = loc("m2"(#loc88))
+#loc204 = loc("weight"(#loc88))
+#loc205 = loc("first_iteration"(#loc88))
+#loc215 = loc("input"(#loc101))
+#loc216 = loc("mean"(#loc105))
+#loc217 = loc("m2"(#loc105))
+#loc218 = loc("weight"(#loc105))
+#loc219 = loc("mean_1"(#loc110))
+#loc220 = loc("m2_1"(#loc110))
+#loc221 = loc("weight_1"(#loc110))
+#loc222 = loc("mean_2"(#loc110))
+#loc223 = loc("m2_2"(#loc110))
+#loc224 = loc("weight_2"(#loc110))
+#loc231 = loc("new_mean"(#loc201))
+module {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 2048 : i32 loc(#loc134)
+    %r0_numel_1 = arith.constant 4096 : i32 loc(#loc135)
+    %xoffset = tt.get_program_id x : i32 loc(#loc136)
+    %xoffset_2 = arith.constant 1 : i32 loc(#loc137)
+    %xoffset_3 = arith.constant 1 : i32 loc(#loc137)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc137)
+    %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc138)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc139)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc140)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc140)
+    %xmask = arith.constant dense<2048> : tensor<1x1xi32> loc(#loc141)
+    %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc141)
+    %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc142)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc143)
+    %tmp7_mean = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc144)
+    %tmp7_m2 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc145)
+    %tmp7_weight = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc146)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc14)
+    %c2048_i32 = arith.constant 2048 : i32 loc(#loc14)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14)
+    %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc14)
+    %3 = ub.poison : i32 loc(#loc14)
+    %tmp7_weight_10:3 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%tmp7_mean_13 = %tmp7_mean, %tmp7_m2_14 = %tmp7_m2, %tmp7_weight_15 = %tmp7_weight) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc148)
+      %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc148)
+      %r0_mask = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc149)
+      %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x2048xi32> loc(#loc149)
+      %tmp0 = arith.constant 4096 : i32 loc(#loc150)
+      %tmp0_18 = arith.constant 4096 : i32 loc(#loc150)
+      %tmp0_19 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc150)
+      %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc150)
+      %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc151)
+      %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x2048xi32> loc(#loc151)
+      %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc152)
+      %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc152)
+      %tmp0_25 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc153)
+      %tmp0_26 = arith.andi %r0_mask_17, %tmp0_25 : tensor<1x2048xi1> loc(#loc153)
+      %tmp0_27 = arith.constant 0.000000e+00 : f32 loc(#loc154)
+      %tmp0_28 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc154)
+      %tmp0_29 = arith.truncf %tmp0_28 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc154)
+      %tmp0_30 = tt.load %tmp0_24, %tmp0_26, %tmp0_29 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>> loc(#loc154)
+      %tmp0_31 = arith.extf %tmp0_30 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc155)
+      %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc156)
+      %tmp1_32 = tt.addptr %tmp1, %r0_index_16 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc156)
+      %tmp1_33 = arith.constant 0.000000e+00 : f32 loc(#loc157)
+      %tmp1_34 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc157)
+      %tmp1_35 = arith.truncf %tmp1_34 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc157)
+      %tmp1_36 = tt.load %tmp1_32, %r0_mask_17, %tmp1_35 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc157)
+      %tmp1_37 = arith.extf %tmp1_36 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc158)
+      %tmp2 = arith.constant 4096 : i32 loc(#loc159)
+      %tmp2_38 = arith.constant 4096 : i32 loc(#loc159)
+      %tmp2_39 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc159)
+      %tmp2_40 = arith.muli %tmp2_39, %xindex_7 : tensor<1x1xi32> loc(#loc159)
+      %tmp2_41 = tt.broadcast %tmp2_40 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc160)
+      %tmp2_42 = arith.addi %r0_index_16, %tmp2_41 : tensor<1x2048xi32> loc(#loc160)
+      %tmp2_43 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc161)
+      %tmp2_44 = tt.addptr %tmp2_43, %tmp2_42 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc161)
+      %tmp2_45 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc162)
+      %tmp2_46 = arith.andi %r0_mask_17, %tmp2_45 : tensor<1x2048xi1> loc(#loc162)
+      %tmp2_47 = arith.constant 0.000000e+00 : f32 loc(#loc163)
+      %tmp2_48 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc163)
+      %tmp2_49 = arith.truncf %tmp2_48 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc163)
+      %tmp2_50 = tt.load %tmp2_44, %tmp2_46, %tmp2_49 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>> loc(#loc163)
+      %tmp2_51 = arith.extf %tmp2_50 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc164)
+      %tmp3 = arith.mulf %tmp1_37, %tmp2_51 : tensor<1x2048xf32> loc(#loc165)
+      %tmp4 = arith.addf %tmp0_31, %tmp3 : tensor<1x2048xf32> loc(#loc166)
+      %c0_i32_52 = arith.constant 0 : i32 loc(#loc34)
+      %9 = arith.cmpi eq, %r0_offset, %c0_i32_52 : i32 loc(#loc34)
+      %10:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_u1__(%tmp4, %tmp7_mean_13, %tmp7_m2_14, %tmp7_weight_15, %9) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>, i1) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) loc(#loc35)
+      %tmp7_mean_53 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc167)
+      %tmp7_mean_54 = arith.andi %r0_mask_17, %tmp7_mean_53 : tensor<1x2048xi1> loc(#loc167)
+      %tmp7_mean_55 = arith.select %tmp7_mean_54, %10#0, %tmp7_mean_13 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc168)
+      %tmp7_m2_56 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc169)
+      %tmp7_m2_57 = arith.andi %r0_mask_17, %tmp7_m2_56 : tensor<1x2048xi1> loc(#loc169)
+      %tmp7_m2_58 = arith.select %tmp7_m2_57, %10#1, %tmp7_m2_14 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc170)
+      %tmp7_weight_59 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc171)
+      %tmp7_weight_60 = arith.andi %r0_mask_17, %tmp7_weight_59 : tensor<1x2048xi1> loc(#loc171)
+      %tmp7_weight_61 = arith.select %tmp7_weight_60, %10#2, %tmp7_weight_15 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc172)
+      %c4096_i32 = arith.constant 4096 : i32 loc(#loc42)
+      %c4096_i32_62 = arith.constant 4096 : i32 loc(#loc42)
+      %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc42)
+      %11 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc42)
+      %12 = tt.broadcast %11 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc43)
+      %13 = arith.addi %r0_index_16, %12 : tensor<1x2048xi32> loc(#loc43)
+      %14 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc44)
+      %15 = tt.addptr %14, %13 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc44)
+      %16 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc45)
+      %17 = arith.andi %r0_mask_17, %16 : tensor<1x2048xi1> loc(#loc45)
+      %18 = arith.truncf %tmp4 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc46)
+      tt.store %15, %18, %17 : tensor<1x2048x!tt.ptr<bf16>> loc(#loc46)
+      scf.yield %tmp7_mean_55, %tmp7_m2_58, %tmp7_weight_61 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc47)
+    } loc(#loc237)
+    %4:3 = tt.call @"torch._inductor.runtime.triton_helpers.welford__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_1_"(%tmp7_weight_10#0, %tmp7_weight_10#1, %tmp7_weight_10#2) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc48)
+    %tmp7 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc173)
+    %tmp11 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc174)
+    %tmp12 = tt.expand_dims %4#2 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc175)
+    %c0_i32_11 = arith.constant 0 : i32 loc(#loc52)
+    %c2048_i32_12 = arith.constant 2048 : i32 loc(#loc52)
+    %5 = arith.bitcast %c0_i32_11 : i32 to i32 loc(#loc52)
+    %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc52)
+    %7 = arith.bitcast %c2048_i32_12 : i32 to i32 loc(#loc52)
+    %8 = ub.poison : i32 loc(#loc52)
+    scf.for %r0_offset = %5 to %6 step %7  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc176)
+      %r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc176)
+      %r0_mask = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc177)
+      %r0_mask_14 = arith.cmpi slt, %r0_index_13, %r0_mask : tensor<1x2048xi32> loc(#loc177)
+      %tmp13 = arith.constant 4096 : i32 loc(#loc178)
+      %tmp13_15 = arith.constant 4096 : i32 loc(#loc178)
+      %tmp13_16 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc178)
+      %tmp13_17 = arith.muli %tmp13_16, %xindex_7 : tensor<1x1xi32> loc(#loc178)
+      %tmp13_18 = tt.broadcast %tmp13_17 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc179)
+      %tmp13_19 = arith.addi %r0_index_13, %tmp13_18 : tensor<1x2048xi32> loc(#loc179)
+      %tmp13_20 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc180)
+      %tmp13_21 = tt.addptr %tmp13_20, %tmp13_19 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc180)
+      %tmp13_22 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc181)
+      %tmp13_23 = arith.andi %r0_mask_14, %tmp13_22 : tensor<1x2048xi1> loc(#loc181)
+      %tmp13_24 = arith.constant 0.000000e+00 : f32 loc(#loc182)
+      %tmp13_25 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc182)
+      %tmp13_26 = arith.truncf %tmp13_25 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc182)
+      %tmp13_27 = tt.load %tmp13_21, %tmp13_23, %tmp13_26 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>> loc(#loc182)
+      %tmp13_28 = arith.extf %tmp13_27 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc183)
+      %tmp23 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc184)
+      %tmp23_29 = tt.addptr %tmp23, %r0_index_13 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc184)
+      %tmp23_30 = arith.constant 0.000000e+00 : f32 loc(#loc185)
+      %tmp23_31 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc185)
+      %tmp23_32 = arith.truncf %tmp23_31 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc185)
+      %tmp23_33 = tt.load %tmp23_29, %r0_mask_14, %tmp23_32 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc185)
+      %tmp23_34 = arith.extf %tmp23_33 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc186)
+      %tmp27 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc187)
+      %tmp27_35 = tt.addptr %tmp27, %r0_index_13 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc187)
+      %tmp27_36 = arith.constant 0.000000e+00 : f32 loc(#loc188)
+      %tmp27_37 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc188)
+      %tmp27_38 = arith.truncf %tmp27_37 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc188)
+      %tmp27_39 = tt.load %tmp27_35, %r0_mask_14, %tmp27_38 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc188)
+      %tmp27_40 = arith.extf %tmp27_39 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc189)
+      %tmp15 = tt.broadcast %tmp7 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc190)
+      %tmp15_41 = arith.subf %tmp13_28, %tmp15 : tensor<1x2048xf32> loc(#loc190)
+      %tmp16 = arith.constant 4.096000e+03 : f32 loc(#loc191)
+      %tmp17 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc192)
+      %tmp17_42 = arith.divf %tmp11, %tmp17 : tensor<1x1xf32> loc(#loc192)
+      %tmp18 = arith.constant 9.99999997E-7 : f32 loc(#loc193)
+      %tmp19 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc194)
+      %tmp19_43 = arith.addf %tmp17_42, %tmp19 : tensor<1x1xf32> loc(#loc194)
+      %tmp20 = tt.extern_elementwise %tmp19_43 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc195)
+      %tmp21 = tt.broadcast %tmp20 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc196)
+      %tmp21_44 = arith.mulf %tmp15_41, %tmp21 : tensor<1x2048xf32> loc(#loc196)
+      %tmp24 = arith.constant 1.000000e+00 : f32 loc(#loc197)
+      %tmp25 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc198)
+      %tmp25_45 = arith.addf %tmp23_34, %tmp25 : tensor<1x2048xf32> loc(#loc198)
+      %tmp26 = arith.mulf %tmp21_44, %tmp25_45 : tensor<1x2048xf32> loc(#loc199)
+      %tmp28 = arith.addf %tmp26, %tmp27_40 : tensor<1x2048xf32> loc(#loc200)
+      %c4096_i32 = arith.constant 4096 : i32 loc(#loc78)
+      %c4096_i32_46 = arith.constant 4096 : i32 loc(#loc78)
+      %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc78)
+      %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc78)
+      %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc79)
+      %11 = arith.addi %r0_index_13, %10 : tensor<1x2048xi32> loc(#loc79)
+      %12 = tt.splat %out_ptr3 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc80)
+      %13 = tt.addptr %12, %11 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc80)
+      %14 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc81)
+      %15 = arith.andi %r0_mask_14, %14 : tensor<1x2048xi1> loc(#loc81)
+      %16 = arith.truncf %tmp28 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc82)
+      tt.store %13, %16, %15 : tensor<1x2048x!tt.ptr<bf16>> loc(#loc82)
+    } loc(#loc52)
+    tt.return loc(#loc83)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() -> tensor<1x2048xf32> attributes {noinline = false} {
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc85)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc85)
+    tt.return %cst_0 : tensor<1x2048xf32> loc(#loc86)
+  ^bb1:  // no predecessors
+    %0 = ub.poison : tensor<1x2048xf32> loc(#loc87)
+    tt.return %0 : tensor<1x2048xf32> loc(#loc87)
+  } loc(#loc84)
+  tt.func private @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_u1__(%new_mean: tensor<1x2048xf32> loc("new_mean"(#loc201)), %mean: tensor<1x2048xf32> loc("mean"(#loc88)), %m2: tensor<1x2048xf32> loc("m2"(#loc88)), %weight: tensor<1x2048xf32> loc("weight"(#loc88)), %first_iteration: i1 loc("first_iteration"(#loc88))) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) attributes {noinline = false} {
+    %0:3 = scf.if %first_iteration -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) {
+      %new_weight = arith.constant 1.000000e+00 : f32 loc(#loc206)
+      %new_weight_0 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc232)
+      %new_m2 = tt.call @triton.language.standard.zeros_like__fp32S1_2048S__(%m2) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc233)
+      scf.yield %new_m2, %new_mean, %new_weight_0 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc233)
+    } else {
+      %delta = arith.subf %new_mean, %mean : tensor<1x2048xf32> loc(#loc208)
+      %new_weight = arith.constant 1 : i32 loc(#loc209)
+      %new_weight_0 = arith.constant 1.000000e+00 : f32 loc(#loc209)
+      %new_weight_1 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc209)
+      %new_weight_2 = arith.addf %weight, %new_weight_1 : tensor<1x2048xf32> loc(#loc234)
+      %new_mean_3 = arith.divf %delta, %new_weight_2 : tensor<1x2048xf32> loc(#loc210)
+      %new_mean_4 = arith.addf %mean, %new_mean_3 : tensor<1x2048xf32> loc(#loc235)
+      %new_m2 = arith.subf %new_mean, %new_mean_4 : tensor<1x2048xf32> loc(#loc212)
+      %new_m2_5 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32> loc(#loc213)
+      %new_m2_6 = arith.addf %m2, %new_m2_5 : tensor<1x2048xf32> loc(#loc236)
+      scf.yield %new_m2_6, %new_mean_4, %new_weight_2 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc214)
+    } loc(#loc89)
+    tt.return %0#1, %0#0, %0#2 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc99)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1x2048xf32> loc(#loc100)
+    %2 = ub.poison : tensor<1x2048xf32> loc(#loc100)
+    %3 = ub.poison : tensor<1x2048xf32> loc(#loc100)
+    tt.return %1, %2, %3 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc100)
+  } loc(#loc88)
+  tt.func private @triton.language.standard.zeros_like__fp32S1_2048S__(%input: tensor<1x2048xf32> loc("input"(#loc101))) -> tensor<1x2048xf32> attributes {noinline = false} {
+    %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc102)
+    tt.return %0 : tensor<1x2048xf32> loc(#loc103)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1x2048xf32> loc(#loc104)
+    tt.return %1 : tensor<1x2048xf32> loc(#loc104)
+  } loc(#loc101)
+  tt.func private @"torch._inductor.runtime.triton_helpers.welford__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_1_"(%mean: tensor<1x2048xf32> loc("mean"(#loc105)), %m2: tensor<1x2048xf32> loc("m2"(#loc105)), %weight: tensor<1x2048xf32> loc("weight"(#loc105))) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} {
+    %0:3 = "tt.reduce"(%mean, %m2, %weight) <{axis = 1 : i32}> ({
+    ^bb0(%arg3: f32 loc(unknown), %arg4: f32 loc(unknown), %arg5: f32 loc(unknown), %arg6: f32 loc(unknown), %arg7: f32 loc(unknown), %arg8: f32 loc(unknown)):
+      %4:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (f32, f32, f32, f32, f32, f32) -> (f32, f32, f32) loc(#loc106)
+      tt.reduce.return %4#0, %4#1, %4#2 : f32, f32, f32 loc(#loc106)
+    }) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc106)
+    tt.return %0#0, %0#1, %0#2 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc108)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1xf32> loc(#loc109)
+    %2 = ub.poison : tensor<1xf32> loc(#loc109)
+    %3 = ub.poison : tensor<1xf32> loc(#loc109)
+    tt.return %1, %2, %3 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc109)
+  } loc(#loc105)
+  tt.func private @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%mean_1: f32 loc("mean_1"(#loc110)), %m2_1: f32 loc("m2_1"(#loc110)), %weight_1: f32 loc("weight_1"(#loc110)), %mean_2: f32 loc("mean_2"(#loc110)), %m2_2: f32 loc("m2_2"(#loc110)), %weight_2: f32 loc("weight_2"(#loc110))) -> (f32, f32, f32) attributes {noinline = false} {
+    %delta = arith.subf %mean_2, %mean_1 : f32 loc(#loc225)
+    %new_weight = arith.addf %weight_1, %weight_2 : f32 loc(#loc226)
+    %w2_over_w = arith.constant 0.000000e+00 : f32 loc(#loc227)
+    %w2_over_w_0 = arith.cmpf oeq, %new_weight, %w2_over_w : f32 loc(#loc227)
+    %w2_over_w_1 = arith.divf %weight_2, %new_weight : f32 loc(#loc228)
+    %w2_over_w_2 = arith.constant 0.000000e+00 : f32 loc(#loc229)
+    %w2_over_w_3 = arith.constant 0.000000e+00 : f32 loc(#loc229)
+    %w2_over_w_4 = arith.select %w2_over_w_0, %w2_over_w_3, %w2_over_w_1 : f32 loc(#loc229)
+    %0 = arith.mulf %delta, %w2_over_w_4 : f32 loc(#loc116)
+    %1 = arith.addf %mean_1, %0 : f32 loc(#loc117)
+    %2 = arith.addf %m2_1, %m2_2 : f32 loc(#loc118)
+    %3 = arith.mulf %delta, %delta : f32 loc(#loc119)
+    %4 = arith.mulf %3, %weight_1 : f32 loc(#loc120)
+    %5 = arith.mulf %4, %w2_over_w_4 : f32 loc(#loc121)
+    %6 = arith.addf %2, %5 : f32 loc(#loc122)
+    tt.return %1, %6, %new_weight : f32, f32, f32 loc(#loc123)
+  ^bb1:  // no predecessors
+    %7 = ub.poison : f32 loc(#loc124)
+    %8 = ub.poison : f32 loc(#loc124)
+    %9 = ub.poison : f32 loc(#loc124)
+    tt.return %7, %8, %9 : f32, f32, f32 loc(#loc124)
+  } loc(#loc110)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":25:21)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":29:45)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":30:43)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":31:47)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:46)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:34)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:61)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:51)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:113)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:34)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:41)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:94)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:46)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:41)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:34)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:61)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:51)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:113)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":41:22)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":42:22)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":46:62)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":46:51)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":48:39)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":48:62)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":49:37)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":49:58)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":50:41)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":50:66)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:41)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:36)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:29)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:62)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:52)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:8)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":52:80)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":53:16)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":54:17)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":55:18)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":56:43)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":57:31)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":58:29)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:48)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:43)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:36)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:63)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:53)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:115)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:35)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:42)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:95)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:35)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:42)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:95)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":66:24)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":67:16)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":68:25)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":69:16)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":70:24)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":71:32)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":72:24)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":74:16)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":75:24)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":76:24)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":77:24)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:41)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:36)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:29)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:63)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:53)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":56:4)
+#loc84 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":120:0)
+#loc85 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:31)
+#loc86 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:11)
+#loc87 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:4)
+#loc89 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7)
+#loc90 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":218:46)
+#loc91 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31)
+#loc92 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24)
+#loc93 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30)
+#loc94 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34)
+#loc95 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26)
+#loc96 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39)
+#loc97 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31)
+#loc98 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22)
+#loc99 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:11)
+#loc100 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:4)
+#loc102 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:30)
+#loc103 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:11)
+#loc104 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:4)
+#loc106 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc108 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:11)
+#loc109 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:4)
+#loc111 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc112 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc113 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc114 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc115 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc116 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc117 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc118 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc119 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc120 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc121 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc122 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc123 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:11)
+#loc124 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:4)
+#loc134 = loc("xnumel"(#loc1))
+#loc135 = loc("r0_numel"(#loc2))
+#loc136 = loc("xoffset"(#loc3))
+#loc137 = loc("xoffset"(#loc4))
+#loc138 = loc("xindex"(#loc5))
+#loc139 = loc("xindex"(#loc6))
+#loc140 = loc("xindex"(#loc7))
+#loc141 = loc("xmask"(#loc8))
+#loc142 = loc("r0_base"(#loc9))
+#loc143 = loc("r0_base"(#loc10))
+#loc144 = loc("tmp7_mean"(#loc11))
+#loc145 = loc("tmp7_m2"(#loc12))
+#loc146 = loc("tmp7_weight"(#loc13))
+#loc147 = loc("tmp7_mean"(#loc14))
+#loc148 = loc("r0_index"(#loc15))
+#loc149 = loc("r0_mask"(#loc16))
+#loc150 = loc("tmp0"(#loc17))
+#loc151 = loc("tmp0"(#loc18))
+#loc152 = loc("tmp0"(#loc19))
+#loc153 = loc("tmp0"(#loc20))
+#loc154 = loc("tmp0"(#loc21))
+#loc155 = loc("tmp0"(#loc22))
+#loc156 = loc("tmp1"(#loc23))
+#loc157 = loc("tmp1"(#loc24))
+#loc158 = loc("tmp1"(#loc25))
+#loc159 = loc("tmp2"(#loc26))
+#loc160 = loc("tmp2"(#loc27))
+#loc161 = loc("tmp2"(#loc28))
+#loc162 = loc("tmp2"(#loc29))
+#loc163 = loc("tmp2"(#loc30))
+#loc164 = loc("tmp2"(#loc31))
+#loc165 = loc("tmp3"(#loc32))
+#loc166 = loc("tmp4"(#loc33))
+#loc167 = loc("tmp7_mean"(#loc36))
+#loc168 = loc("tmp7_mean"(#loc37))
+#loc169 = loc("tmp7_m2"(#loc38))
+#loc170 = loc("tmp7_m2"(#loc39))
+#loc171 = loc("tmp7_weight"(#loc40))
+#loc172 = loc("tmp7_weight"(#loc41))
+#loc173 = loc("tmp7"(#loc49))
+#loc174 = loc("tmp11"(#loc50))
+#loc175 = loc("tmp12"(#loc51))
+#loc176 = loc("r0_index"(#loc53))
+#loc177 = loc("r0_mask"(#loc54))
+#loc178 = loc("tmp13"(#loc55))
+#loc179 = loc("tmp13"(#loc56))
+#loc180 = loc("tmp13"(#loc57))
+#loc181 = loc("tmp13"(#loc58))
+#loc182 = loc("tmp13"(#loc59))
+#loc183 = loc("tmp13"(#loc60))
+#loc184 = loc("tmp23"(#loc61))
+#loc185 = loc("tmp23"(#loc62))
+#loc186 = loc("tmp23"(#loc63))
+#loc187 = loc("tmp27"(#loc64))
+#loc188 = loc("tmp27"(#loc65))
+#loc189 = loc("tmp27"(#loc66))
+#loc190 = loc("tmp15"(#loc67))
+#loc191 = loc("tmp16"(#loc68))
+#loc192 = loc("tmp17"(#loc69))
+#loc193 = loc("tmp18"(#loc70))
+#loc194 = loc("tmp19"(#loc71))
+#loc195 = loc("tmp20"(#loc72))
+#loc196 = loc("tmp21"(#loc73))
+#loc197 = loc("tmp24"(#loc74))
+#loc198 = loc("tmp25"(#loc75))
+#loc199 = loc("tmp26"(#loc76))
+#loc200 = loc("tmp28"(#loc77))
+#loc206 = loc("new_weight"(#loc90))
+#loc207 = loc("new_m2"(#loc91))
+#loc208 = loc("delta"(#loc92))
+#loc209 = loc("new_weight"(#loc93))
+#loc210 = loc("new_mean"(#loc94))
+#loc211 = loc("new_mean"(#loc95))
+#loc212 = loc("new_m2"(#loc96))
+#loc213 = loc("new_m2"(#loc97))
+#loc214 = loc("new_m2"(#loc98))
+#loc225 = loc("delta"(#loc111))
+#loc226 = loc("new_weight"(#loc112))
+#loc227 = loc("w2_over_w"(#loc113))
+#loc228 = loc("w2_over_w"(#loc114))
+#loc229 = loc("w2_over_w"(#loc115))
+#loc230 = loc("tmp7_m2"(#loc147))
+#loc232 = loc("new_weight"(#loc206))
+#loc233 = loc("new_m2"(#loc207))
+#loc234 = loc("new_weight"(#loc209))
+#loc235 = loc("new_mean"(#loc211))
+#loc236 = loc("new_m2"(#loc214))
+#loc237 = loc("tmp7_weight"(#loc230))
diff --git a/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.ttgir b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..86151ad8e109b79e9f243cc9b35ae3fc961afe98
--- /dev/null
+++ b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.ttgir
@@ -0,0 +1,295 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":18:0)
+#loc1 = loc(unknown)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":52:80)
+#loc80 = loc("in_ptr0"(#loc))
+#loc81 = loc("in_ptr1"(#loc))
+#loc82 = loc("in_ptr2"(#loc))
+#loc83 = loc("in_ptr3"(#loc))
+#loc84 = loc("in_ptr4"(#loc))
+#loc85 = loc("out_ptr0"(#loc))
+#loc86 = loc("out_ptr3"(#loc))
+#loc87 = loc("xnumel"(#loc))
+#loc88 = loc("r0_numel"(#loc))
+#loc122 = loc(callsite(#loc1 at #loc40))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4096> : tensor<1x2048xi32, #blocked> loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked> loc(#loc1)
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
+    %c2048_i32 = arith.constant 2048 : i32 loc(#loc1)
+    %cst_1 = arith.constant 0.000000e+00 : f32 loc(#loc1)
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32, #blocked> loc(#loc1)
+    %cst_5 = arith.constant dense<4.096000e+03> : tensor<1x1xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc89)
+    %xmask = arith.cmpi slt, %xoffset, %c2048_i32 : i32 loc(#loc90)
+    %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc91)
+    %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc91)
+    %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc92)
+    %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc151)
+    %tmp0_8 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc94)
+    %tmp0_9 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc152)
+    %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc96)
+    %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc97)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc11)
+    %tmp7_weight:3 = scf.for %tmp7_weight_10 = %c0_i32 to %c4096_i32 step %c2048_i32 iter_args(%arg10 = %cst_2, %arg11 = %cst_2, %arg12 = %cst_2) -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>)  : i32 {
+      %r0_index = tt.splat %tmp7_weight_10 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc99)
+      %r0_index_11 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32, #blocked> loc(#loc99)
+      %r0_mask = arith.cmpi slt, %r0_index_11, %cst : tensor<1x2048xi32, #blocked> loc(#loc100)
+      %tmp0_12 = arith.addi %r0_index_11, %tmp0_7 : tensor<1x2048xi32, #blocked> loc(#loc93)
+      %tmp0_13 = tt.addptr %tmp0_8, %tmp0_12 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc94)
+      %tmp0_14 = arith.andi %r0_mask, %tmp0_9 : tensor<1x2048xi1, #blocked> loc(#loc95)
+      %tmp0_15 = tt.load %tmp0_13, %tmp0_14, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc101)
+      %tmp0_16 = arith.extf %tmp0_15 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc102)
+      %tmp1_17 = tt.addptr %tmp1, %r0_index_11 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc96)
+      %tmp1_18 = tt.load %tmp1_17, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc103)
+      %tmp1_19 = arith.extf %tmp1_18 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc104)
+      %tmp2_20 = tt.addptr %tmp2, %tmp0_12 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc97)
+      %tmp2_21 = tt.load %tmp2_20, %tmp0_14, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc105)
+      %tmp2_22 = arith.extf %tmp2_21 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc106)
+      %tmp3 = arith.mulf %tmp1_19, %tmp2_22 : tensor<1x2048xf32, #blocked> loc(#loc107)
+      %tmp4 = arith.addf %tmp0_16, %tmp3 : tensor<1x2048xf32, #blocked> loc(#loc108)
+      %3 = arith.cmpi eq, %tmp7_weight_10, %c0_i32 : i32 loc(#loc23)
+      %4:3 = scf.if %3 -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) {
+        scf.yield %cst_2, %tmp4, %cst_3 : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc176)
+      } else {
+        %delta = arith.subf %tmp4, %arg10 : tensor<1x2048xf32, #blocked> loc(#loc155)
+        %new_weight = arith.addf %arg12, %cst_3 : tensor<1x2048xf32, #blocked> loc(#loc177)
+        %new_mean = arith.divf %delta, %new_weight : tensor<1x2048xf32, #blocked> loc(#loc157)
+        %new_mean_24 = arith.addf %arg10, %new_mean : tensor<1x2048xf32, #blocked> loc(#loc178)
+        %new_m2 = arith.subf %tmp4, %new_mean_24 : tensor<1x2048xf32, #blocked> loc(#loc159)
+        %new_m2_25 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32, #blocked> loc(#loc160)
+        %new_m2_26 = arith.addf %arg11, %new_m2_25 : tensor<1x2048xf32, #blocked> loc(#loc179)
+        scf.yield %new_m2_26, %new_mean_24, %new_weight : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc162)
+      } loc(#loc109)
+      %tmp7_mean = arith.select %tmp0_14, %4#1, %arg10 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc118)
+      %tmp7_m2 = arith.select %tmp0_14, %4#0, %arg11 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc119)
+      %tmp7_weight_23 = arith.select %tmp0_14, %4#2, %arg12 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc120)
+      %5 = tt.addptr %0, %tmp0_12 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc11)
+      %6 = arith.truncf %tmp4 : tensor<1x2048xf32, #blocked> to tensor<1x2048xbf16, #blocked> loc(#loc37)
+      tt.store %5, %6, %tmp0_14 : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc37)
+      scf.yield %tmp7_mean, %tmp7_m2, %tmp7_weight_23 : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc38)
+    } loc(#loc175)
+    %1:3 = "tt.reduce"(%tmp7_weight#0, %tmp7_weight#1, %tmp7_weight#2) <{axis = 1 : i32}> ({
+    ^bb0(%arg9: f32 loc(callsite(#loc1 at #loc40)), %arg10: f32 loc(callsite(#loc1 at #loc40)), %arg11: f32 loc(callsite(#loc1 at #loc40)), %arg12: f32 loc(callsite(#loc1 at #loc40)), %arg13: f32 loc(callsite(#loc1 at #loc40)), %arg14: f32 loc(callsite(#loc1 at #loc40))):
+      %delta = arith.subf %arg12, %arg9 : f32 loc(#loc163)
+      %new_weight = arith.addf %arg11, %arg14 : f32 loc(#loc164)
+      %w2_over_w = arith.cmpf oeq, %new_weight, %cst_1 : f32 loc(#loc165)
+      %w2_over_w_10 = arith.divf %arg14, %new_weight : f32 loc(#loc166)
+      %w2_over_w_11 = arith.select %w2_over_w, %cst_1, %w2_over_w_10 : f32 loc(#loc167)
+      %3 = arith.mulf %delta, %w2_over_w_11 : f32 loc(#loc168)
+      %4 = arith.addf %arg9, %3 : f32 loc(#loc169)
+      %5 = arith.addf %arg10, %arg13 : f32 loc(#loc170)
+      %6 = arith.mulf %delta, %delta : f32 loc(#loc171)
+      %7 = arith.mulf %6, %arg11 : f32 loc(#loc172)
+      %8 = arith.mulf %7, %w2_over_w_11 : f32 loc(#loc173)
+      %9 = arith.addf %5, %8 : f32 loc(#loc174)
+      tt.reduce.return %4, %9, %new_weight : f32, f32, f32 loc(#loc121)
+    }) : (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc121)
+    %tmp7 = tt.expand_dims %1#0 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc128)
+    %tmp11 = tt.expand_dims %1#1 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc129)
+    %tmp23 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc130)
+    %tmp27 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc131)
+    %tmp15 = tt.broadcast %tmp7 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc132)
+    %tmp17 = arith.divf %tmp11, %cst_5 : tensor<1x1xf32, #blocked> loc(#loc133)
+    %tmp19 = arith.addf %tmp17, %cst_4 : tensor<1x1xf32, #blocked> loc(#loc134)
+    %tmp20 = tt.extern_elementwise %tmp19 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked>) -> tensor<1x1xf32, #blocked> loc(#loc135)
+    %tmp21 = tt.broadcast %tmp20 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc136)
+    %2 = tt.splat %out_ptr3 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc62)
+    scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32, #blocked> loc(#loc137)
+      %r0_index_10 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32, #blocked> loc(#loc137)
+      %r0_mask = arith.cmpi slt, %r0_index_10, %cst : tensor<1x2048xi32, #blocked> loc(#loc138)
+      %tmp13 = arith.addi %r0_index_10, %tmp0_7 : tensor<1x2048xi32, #blocked> loc(#loc139)
+      %tmp13_11 = tt.addptr %0, %tmp13 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc140)
+      %tmp13_12 = arith.andi %r0_mask, %tmp0_9 : tensor<1x2048xi1, #blocked> loc(#loc141)
+      %tmp13_13 = tt.load %tmp13_11, %tmp13_12, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc142)
+      %tmp13_14 = arith.extf %tmp13_13 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc143)
+      %tmp23_15 = tt.addptr %tmp23, %r0_index_10 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc130)
+      %tmp23_16 = tt.load %tmp23_15, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc144)
+      %tmp23_17 = arith.extf %tmp23_16 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc145)
+      %tmp27_18 = tt.addptr %tmp27, %r0_index_10 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc131)
+      %tmp27_19 = tt.load %tmp27_18, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc146)
+      %tmp27_20 = arith.extf %tmp27_19 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc147)
+      %tmp15_21 = arith.subf %tmp13_14, %tmp15 : tensor<1x2048xf32, #blocked> loc(#loc132)
+      %tmp21_22 = arith.mulf %tmp15_21, %tmp21 : tensor<1x2048xf32, #blocked> loc(#loc136)
+      %tmp25 = arith.addf %tmp23_17, %cst_3 : tensor<1x2048xf32, #blocked> loc(#loc148)
+      %tmp26 = arith.mulf %tmp21_22, %tmp25 : tensor<1x2048xf32, #blocked> loc(#loc149)
+      %tmp28 = arith.addf %tmp26, %tmp27_20 : tensor<1x2048xf32, #blocked> loc(#loc150)
+      %3 = tt.addptr %2, %tmp13 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc62)
+      %4 = arith.truncf %tmp28 : tensor<1x2048xf32, #blocked> to tensor<1x2048xbf16, #blocked> loc(#loc78)
+      tt.store %3, %4, %tmp13_12 : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc78)
+    } loc(#loc63)
+    tt.return loc(#loc79)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":25:21)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":26:37)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:46)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:41)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:34)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:61)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:34)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:34)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:29)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":32:43)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":33:31)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":34:29)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:51)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:113)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:41)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:94)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:51)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:113)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":41:22)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":42:22)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":46:62)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":46:51)
+#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31)
+#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24)
+#loc28 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30)
+#loc29 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34)
+#loc30 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26)
+#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39)
+#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":48:62)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":49:58)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":50:66)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:52)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:8)
+#loc39 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc41 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc42 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc43 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc44 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc45 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc46 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc47 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc48 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc49 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc50 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc51 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc52 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":53:16)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":54:17)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:35)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:35)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":66:24)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":68:25)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":70:24)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":71:32)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":72:24)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:29)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":56:43)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":57:31)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":58:29)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:43)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:36)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:63)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:53)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:115)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:42)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:95)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:42)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:95)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":75:24)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":76:24)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":77:24)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:53)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":56:4)
+#loc89 = loc("xoffset"(#loc2))
+#loc90 = loc("xmask"(#loc3))
+#loc91 = loc("r0_base"(#loc4))
+#loc92 = loc("tmp0"(#loc5))
+#loc93 = loc("tmp0"(#loc6))
+#loc94 = loc("tmp0"(#loc7))
+#loc95 = loc("tmp0"(#loc8))
+#loc96 = loc("tmp1"(#loc9))
+#loc97 = loc("tmp2"(#loc10))
+#loc98 = loc("tmp7_mean"(#loc12))
+#loc99 = loc("r0_index"(#loc13))
+#loc100 = loc("r0_mask"(#loc14))
+#loc101 = loc("tmp0"(#loc15))
+#loc102 = loc("tmp0"(#loc16))
+#loc103 = loc("tmp1"(#loc17))
+#loc104 = loc("tmp1"(#loc18))
+#loc105 = loc("tmp2"(#loc19))
+#loc106 = loc("tmp2"(#loc20))
+#loc107 = loc("tmp3"(#loc21))
+#loc108 = loc("tmp4"(#loc22))
+#loc109 = loc(callsite(#loc24 at #loc25))
+#loc110 = loc("new_m2"(#loc26))
+#loc111 = loc("delta"(#loc27))
+#loc112 = loc("new_weight"(#loc28))
+#loc113 = loc("new_mean"(#loc29))
+#loc114 = loc("new_mean"(#loc30))
+#loc115 = loc("new_m2"(#loc31))
+#loc116 = loc("new_m2"(#loc32))
+#loc117 = loc("new_m2"(#loc33))
+#loc118 = loc("tmp7_mean"(#loc34))
+#loc119 = loc("tmp7_m2"(#loc35))
+#loc120 = loc("tmp7_weight"(#loc36))
+#loc121 = loc(callsite(#loc39 at #loc40))
+#loc123 = loc("delta"(#loc41))
+#loc124 = loc("new_weight"(#loc42))
+#loc125 = loc("w2_over_w"(#loc43))
+#loc126 = loc("w2_over_w"(#loc44))
+#loc127 = loc("w2_over_w"(#loc45))
+#loc128 = loc("tmp7"(#loc53))
+#loc129 = loc("tmp11"(#loc54))
+#loc130 = loc("tmp23"(#loc55))
+#loc131 = loc("tmp27"(#loc56))
+#loc132 = loc("tmp15"(#loc57))
+#loc133 = loc("tmp17"(#loc58))
+#loc134 = loc("tmp19"(#loc59))
+#loc135 = loc("tmp20"(#loc60))
+#loc136 = loc("tmp21"(#loc61))
+#loc137 = loc("r0_index"(#loc64))
+#loc138 = loc("r0_mask"(#loc65))
+#loc139 = loc("tmp13"(#loc66))
+#loc140 = loc("tmp13"(#loc67))
+#loc141 = loc("tmp13"(#loc68))
+#loc142 = loc("tmp13"(#loc69))
+#loc143 = loc("tmp13"(#loc70))
+#loc144 = loc("tmp23"(#loc71))
+#loc145 = loc("tmp23"(#loc72))
+#loc146 = loc("tmp27"(#loc73))
+#loc147 = loc("tmp27"(#loc74))
+#loc148 = loc("tmp25"(#loc75))
+#loc149 = loc("tmp26"(#loc76))
+#loc150 = loc("tmp28"(#loc77))
+#loc151 = loc(fused[#loc93, #loc92])
+#loc152 = loc(fused[#loc95, #loc90])
+#loc153 = loc("tmp7_m2"(#loc98))
+#loc154 = loc("new_m2"(#loc110))
+#loc155 = loc(callsite(#loc111 at #loc25))
+#loc156 = loc("new_weight"(#loc112))
+#loc157 = loc(callsite(#loc113 at #loc25))
+#loc158 = loc("new_mean"(#loc114))
+#loc159 = loc(callsite(#loc115 at #loc25))
+#loc160 = loc(callsite(#loc116 at #loc25))
+#loc161 = loc("new_m2"(#loc117))
+#loc162 = loc(callsite(#loc117 at #loc25))
+#loc163 = loc(callsite(#loc123 at #loc121))
+#loc164 = loc(callsite(#loc124 at #loc121))
+#loc165 = loc(callsite(#loc125 at #loc121))
+#loc166 = loc(callsite(#loc126 at #loc121))
+#loc167 = loc(callsite(#loc127 at #loc121))
+#loc168 = loc(callsite(#loc46 at #loc121))
+#loc169 = loc(callsite(#loc47 at #loc121))
+#loc170 = loc(callsite(#loc48 at #loc121))
+#loc171 = loc(callsite(#loc49 at #loc121))
+#loc172 = loc(callsite(#loc50 at #loc121))
+#loc173 = loc(callsite(#loc51 at #loc121))
+#loc174 = loc(callsite(#loc52 at #loc121))
+#loc175 = loc("tmp7_weight"(#loc153))
+#loc176 = loc(callsite(#loc154 at #loc25))
+#loc177 = loc(callsite(#loc156 at #loc25))
+#loc178 = loc(callsite(#loc158 at #loc25))
+#loc179 = loc(callsite(#loc161 at #loc25))
diff --git a/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.ttir b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..75f9aeef0da4f8198219a446a62130a5cb3727ec
--- /dev/null
+++ b/triton/DTSINFKV23R7UUCB5Y3DX56UD6DUQS3DXJCZPAKYYXLDFCJFQIOA/triton_red_fused_add_mul_native_layer_norm_0.ttir
@@ -0,0 +1,304 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":18:0)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":52:80)
+#loc82 = loc("in_ptr0"(#loc))
+#loc83 = loc("in_ptr1"(#loc))
+#loc84 = loc("in_ptr2"(#loc))
+#loc85 = loc("in_ptr3"(#loc))
+#loc86 = loc("in_ptr4"(#loc))
+#loc87 = loc("out_ptr0"(#loc))
+#loc88 = loc("out_ptr3"(#loc))
+#loc89 = loc("xnumel"(#loc))
+#loc90 = loc("r0_numel"(#loc))
+#loc91 = loc(callsite(#loc1 at #loc2))
+module {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc91)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc1)
+    %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16> loc(#loc1)
+    %c2048_i32 = arith.constant 2048 : i32 loc(#loc1)
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
+    %cst_2 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc1)
+    %cst_3 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc1)
+    %cst_5 = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc92)
+    %xmask = arith.cmpi slt, %xoffset, %c2048_i32 : i32 loc(#loc93)
+    %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc94)
+    %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc95)
+    %tmp7_weight:3 = scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32 iter_args(%tmp7_mean = %cst_0, %tmp7_m2 = %cst_0, %tmp7_weight_7 = %cst_0) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc97)
+      %r0_index_8 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32> loc(#loc97)
+      %r0_mask = arith.cmpi slt, %r0_index_8, %cst_5 : tensor<1x2048xi32> loc(#loc98)
+      %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc99)
+      %tmp0_9 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32> loc(#loc156)
+      %tmp0_10 = arith.addi %r0_index_8, %tmp0_9 : tensor<1x2048xi32> loc(#loc100)
+      %tmp0_11 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc101)
+      %tmp0_12 = tt.addptr %tmp0_11, %tmp0_10 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc101)
+      %tmp0_13 = tt.splat %xmask : i1 -> tensor<1x2048xi1> loc(#loc157)
+      %tmp0_14 = arith.andi %r0_mask, %tmp0_13 : tensor<1x2048xi1> loc(#loc102)
+      %tmp0_15 = tt.load %tmp0_12, %tmp0_14, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>> loc(#loc103)
+      %tmp0_16 = arith.extf %tmp0_15 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc104)
+      %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc105)
+      %tmp1_17 = tt.addptr %tmp1, %r0_index_8 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc105)
+      %tmp1_18 = tt.load %tmp1_17, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc106)
+      %tmp1_19 = arith.extf %tmp1_18 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc107)
+      %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc108)
+      %tmp2_20 = tt.addptr %tmp2, %tmp0_10 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc108)
+      %tmp2_21 = tt.load %tmp2_20, %tmp0_14, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>> loc(#loc109)
+      %tmp2_22 = arith.extf %tmp2_21 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc110)
+      %tmp3 = arith.mulf %tmp1_19, %tmp2_22 : tensor<1x2048xf32> loc(#loc111)
+      %tmp4 = arith.addf %tmp0_16, %tmp3 : tensor<1x2048xf32> loc(#loc112)
+      %1 = arith.cmpi eq, %r0_offset, %c0_i32 : i32 loc(#loc24)
+      %2:3 = scf.if %1 -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) {
+        scf.yield %cst_0, %tmp4, %cst_2 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc182)
+      } else {
+        %delta = arith.subf %tmp4, %tmp7_mean : tensor<1x2048xf32> loc(#loc159)
+        %new_weight = arith.addf %tmp7_weight_7, %cst_2 : tensor<1x2048xf32> loc(#loc183)
+        %new_mean = arith.divf %delta, %new_weight : tensor<1x2048xf32> loc(#loc161)
+        %new_mean_26 = arith.addf %tmp7_mean, %new_mean : tensor<1x2048xf32> loc(#loc184)
+        %new_m2 = arith.subf %tmp4, %new_mean_26 : tensor<1x2048xf32> loc(#loc163)
+        %new_m2_27 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32> loc(#loc164)
+        %new_m2_28 = arith.addf %tmp7_m2, %new_m2_27 : tensor<1x2048xf32> loc(#loc185)
+        scf.yield %new_m2_28, %new_mean_26, %new_weight : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc166)
+      } loc(#loc113)
+      %tmp7_mean_23 = arith.select %tmp0_14, %2#1, %tmp7_mean : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc122)
+      %tmp7_m2_24 = arith.select %tmp0_14, %2#0, %tmp7_m2 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc123)
+      %tmp7_weight_25 = arith.select %tmp0_14, %2#2, %tmp7_weight_7 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc124)
+      %3 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc38)
+      %4 = tt.addptr %3, %tmp0_10 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc38)
+      %5 = arith.truncf %tmp4 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc39)
+      tt.store %4, %5, %tmp0_14 : tensor<1x2048x!tt.ptr<bf16>> loc(#loc39)
+      scf.yield %tmp7_mean_23, %tmp7_m2_24, %tmp7_weight_25 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc40)
+    } loc(#loc181)
+    %0:3 = "tt.reduce"(%tmp7_weight#0, %tmp7_weight#1, %tmp7_weight#2) <{axis = 1 : i32}> ({
+    ^bb0(%arg9: f32 loc(callsite(#loc1 at #loc2)), %arg10: f32 loc(callsite(#loc1 at #loc2)), %arg11: f32 loc(callsite(#loc1 at #loc2)), %arg12: f32 loc(callsite(#loc1 at #loc2)), %arg13: f32 loc(callsite(#loc1 at #loc2)), %arg14: f32 loc(callsite(#loc1 at #loc2))):
+      %delta = arith.subf %arg12, %arg9 : f32 loc(#loc167)
+      %new_weight = arith.addf %arg11, %arg14 : f32 loc(#loc168)
+      %w2_over_w = arith.cmpf oeq, %new_weight, %cst : f32 loc(#loc169)
+      %w2_over_w_7 = arith.divf %arg14, %new_weight : f32 loc(#loc170)
+      %w2_over_w_8 = arith.select %w2_over_w, %cst, %w2_over_w_7 : f32 loc(#loc171)
+      %1 = arith.mulf %delta, %w2_over_w_8 : f32 loc(#loc172)
+      %2 = arith.addf %arg9, %1 : f32 loc(#loc173)
+      %3 = arith.addf %arg10, %arg13 : f32 loc(#loc174)
+      %4 = arith.mulf %delta, %delta : f32 loc(#loc175)
+      %5 = arith.mulf %4, %arg11 : f32 loc(#loc176)
+      %6 = arith.mulf %5, %w2_over_w_8 : f32 loc(#loc177)
+      %7 = arith.addf %3, %6 : f32 loc(#loc178)
+      tt.reduce.return %2, %7, %new_weight : f32, f32, f32 loc(#loc125)
+    }) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc125)
+    %tmp7 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc131)
+    %tmp11 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc132)
+    scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc133)
+      %r0_index_7 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32> loc(#loc133)
+      %r0_mask = arith.cmpi slt, %r0_index_7, %cst_5 : tensor<1x2048xi32> loc(#loc134)
+      %tmp13 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc135)
+      %tmp13_8 = tt.splat %tmp13 : i32 -> tensor<1x2048xi32> loc(#loc179)
+      %tmp13_9 = arith.addi %r0_index_7, %tmp13_8 : tensor<1x2048xi32> loc(#loc136)
+      %tmp13_10 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc137)
+      %tmp13_11 = tt.addptr %tmp13_10, %tmp13_9 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc137)
+      %tmp13_12 = tt.splat %xmask : i1 -> tensor<1x2048xi1> loc(#loc180)
+      %tmp13_13 = arith.andi %r0_mask, %tmp13_12 : tensor<1x2048xi1> loc(#loc138)
+      %tmp13_14 = tt.load %tmp13_11, %tmp13_13, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>> loc(#loc139)
+      %tmp13_15 = arith.extf %tmp13_14 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc140)
+      %tmp23 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc141)
+      %tmp23_16 = tt.addptr %tmp23, %r0_index_7 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc141)
+      %tmp23_17 = tt.load %tmp23_16, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc142)
+      %tmp23_18 = arith.extf %tmp23_17 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc143)
+      %tmp27 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc144)
+      %tmp27_19 = tt.addptr %tmp27, %r0_index_7 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc144)
+      %tmp27_20 = tt.load %tmp27_19, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc145)
+      %tmp27_21 = arith.extf %tmp27_20 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc146)
+      %tmp15 = tt.broadcast %tmp7 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc147)
+      %tmp15_22 = arith.subf %tmp13_15, %tmp15 : tensor<1x2048xf32> loc(#loc147)
+      %tmp17 = arith.divf %tmp11, %cst_4 : tensor<1x1xf32> loc(#loc148)
+      %tmp19 = arith.addf %tmp17, %cst_3 : tensor<1x1xf32> loc(#loc149)
+      %tmp20 = tt.extern_elementwise %tmp19 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc150)
+      %tmp21 = tt.broadcast %tmp20 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc151)
+      %tmp21_23 = arith.mulf %tmp15_22, %tmp21 : tensor<1x2048xf32> loc(#loc151)
+      %tmp25 = arith.addf %tmp23_18, %cst_2 : tensor<1x2048xf32> loc(#loc152)
+      %tmp26 = arith.mulf %tmp21_23, %tmp25 : tensor<1x2048xf32> loc(#loc153)
+      %tmp28 = arith.addf %tmp26, %tmp27_21 : tensor<1x2048xf32> loc(#loc154)
+      %1 = tt.splat %out_ptr3 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc79)
+      %2 = tt.addptr %1, %tmp13_9 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc79)
+      %3 = arith.truncf %tmp28 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc80)
+      tt.store %2, %3, %tmp13_13 : tensor<1x2048x!tt.ptr<bf16>> loc(#loc80)
+    } loc(#loc56)
+    tt.return loc(#loc81)
+  } loc(#loc)
+} loc(#loc)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":25:21)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":26:27)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":32:43)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":33:31)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":34:29)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:46)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:41)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:34)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:61)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:51)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:113)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:34)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:41)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:94)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:34)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:51)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:113)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":41:22)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":42:22)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":46:62)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":46:51)
+#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31)
+#loc28 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24)
+#loc29 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30)
+#loc30 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34)
+#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26)
+#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":48:62)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":49:58)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":50:66)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:29)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:52)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:8)
+#loc41 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc42 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc43 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc44 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc45 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc46 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc47 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc48 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc49 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc50 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc51 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc52 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc53 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":53:16)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":54:17)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":56:43)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":57:31)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":58:29)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:48)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:43)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:36)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:63)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:53)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:115)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:35)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:42)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:95)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:35)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:42)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:95)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":66:24)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":68:25)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":70:24)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":71:32)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":72:24)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":75:24)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":76:24)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":77:24)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:29)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:53)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":56:4)
+#loc92 = loc("xoffset"(#loc3))
+#loc93 = loc("xmask"(#loc4))
+#loc94 = loc("r0_base"(#loc5))
+#loc95 = loc("r0_base"(#loc6))
+#loc96 = loc("tmp7_mean"(#loc7))
+#loc97 = loc("r0_index"(#loc8))
+#loc98 = loc("r0_mask"(#loc9))
+#loc99 = loc("tmp0"(#loc10))
+#loc100 = loc("tmp0"(#loc11))
+#loc101 = loc("tmp0"(#loc12))
+#loc102 = loc("tmp0"(#loc13))
+#loc103 = loc("tmp0"(#loc14))
+#loc104 = loc("tmp0"(#loc15))
+#loc105 = loc("tmp1"(#loc16))
+#loc106 = loc("tmp1"(#loc17))
+#loc107 = loc("tmp1"(#loc18))
+#loc108 = loc("tmp2"(#loc19))
+#loc109 = loc("tmp2"(#loc20))
+#loc110 = loc("tmp2"(#loc21))
+#loc111 = loc("tmp3"(#loc22))
+#loc112 = loc("tmp4"(#loc23))
+#loc113 = loc(callsite(#loc25 at #loc26))
+#loc114 = loc("new_m2"(#loc27))
+#loc115 = loc("delta"(#loc28))
+#loc116 = loc("new_weight"(#loc29))
+#loc117 = loc("new_mean"(#loc30))
+#loc118 = loc("new_mean"(#loc31))
+#loc119 = loc("new_m2"(#loc32))
+#loc120 = loc("new_m2"(#loc33))
+#loc121 = loc("new_m2"(#loc34))
+#loc122 = loc("tmp7_mean"(#loc35))
+#loc123 = loc("tmp7_m2"(#loc36))
+#loc124 = loc("tmp7_weight"(#loc37))
+#loc125 = loc(callsite(#loc41 at #loc2))
+#loc126 = loc("delta"(#loc42))
+#loc127 = loc("new_weight"(#loc43))
+#loc128 = loc("w2_over_w"(#loc44))
+#loc129 = loc("w2_over_w"(#loc45))
+#loc130 = loc("w2_over_w"(#loc46))
+#loc131 = loc("tmp7"(#loc54))
+#loc132 = loc("tmp11"(#loc55))
+#loc133 = loc("r0_index"(#loc57))
+#loc134 = loc("r0_mask"(#loc58))
+#loc135 = loc("tmp13"(#loc59))
+#loc136 = loc("tmp13"(#loc60))
+#loc137 = loc("tmp13"(#loc61))
+#loc138 = loc("tmp13"(#loc62))
+#loc139 = loc("tmp13"(#loc63))
+#loc140 = loc("tmp13"(#loc64))
+#loc141 = loc("tmp23"(#loc65))
+#loc142 = loc("tmp23"(#loc66))
+#loc143 = loc("tmp23"(#loc67))
+#loc144 = loc("tmp27"(#loc68))
+#loc145 = loc("tmp27"(#loc69))
+#loc146 = loc("tmp27"(#loc70))
+#loc147 = loc("tmp15"(#loc71))
+#loc148 = loc("tmp17"(#loc72))
+#loc149 = loc("tmp19"(#loc73))
+#loc150 = loc("tmp20"(#loc74))
+#loc151 = loc("tmp21"(#loc75))
+#loc152 = loc("tmp25"(#loc76))
+#loc153 = loc("tmp26"(#loc77))
+#loc154 = loc("tmp28"(#loc78))
+#loc155 = loc("tmp7_m2"(#loc96))
+#loc156 = loc(fused[#loc100, #loc99])
+#loc157 = loc(fused[#loc102, #loc93])
+#loc158 = loc("new_m2"(#loc114))
+#loc159 = loc(callsite(#loc115 at #loc26))
+#loc160 = loc("new_weight"(#loc116))
+#loc161 = loc(callsite(#loc117 at #loc26))
+#loc162 = loc("new_mean"(#loc118))
+#loc163 = loc(callsite(#loc119 at #loc26))
+#loc164 = loc(callsite(#loc120 at #loc26))
+#loc165 = loc("new_m2"(#loc121))
+#loc166 = loc(callsite(#loc121 at #loc26))
+#loc167 = loc(callsite(#loc126 at #loc125))
+#loc168 = loc(callsite(#loc127 at #loc125))
+#loc169 = loc(callsite(#loc128 at #loc125))
+#loc170 = loc(callsite(#loc129 at #loc125))
+#loc171 = loc(callsite(#loc130 at #loc125))
+#loc172 = loc(callsite(#loc47 at #loc125))
+#loc173 = loc(callsite(#loc48 at #loc125))
+#loc174 = loc(callsite(#loc49 at #loc125))
+#loc175 = loc(callsite(#loc50 at #loc125))
+#loc176 = loc(callsite(#loc51 at #loc125))
+#loc177 = loc(callsite(#loc52 at #loc125))
+#loc178 = loc(callsite(#loc53 at #loc125))
+#loc179 = loc(fused[#loc136, #loc135])
+#loc180 = loc(fused[#loc138, #loc93])
+#loc181 = loc("tmp7_weight"(#loc155))
+#loc182 = loc(callsite(#loc158 at #loc26))
+#loc183 = loc(callsite(#loc160 at #loc26))
+#loc184 = loc(callsite(#loc162 at #loc26))
+#loc185 = loc(callsite(#loc165 at #loc26))
diff --git a/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/__grp__triton_red_fused__fused_rms_norm_view_0.json b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/__grp__triton_red_fused__fused_rms_norm_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..91001eafcf9376649341a7595586a1e1ba90dd58
--- /dev/null
+++ b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/__grp__triton_red_fused__fused_rms_norm_view_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused__fused_rms_norm_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.source", "triton_red_fused__fused_rms_norm_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.ttir", "triton_red_fused__fused_rms_norm_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.ttgir", "triton_red_fused__fused_rms_norm_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.llir", "triton_red_fused__fused_rms_norm_view_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.ptx", "triton_red_fused__fused_rms_norm_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.cubin", "triton_red_fused__fused_rms_norm_view_0.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.json"}}
\ No newline at end of file
diff --git a/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.cubin b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..74d4e7c86cbf21f7af7b1dedb57d9062a49f139f
Binary files /dev/null and b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.cubin differ
diff --git a/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.json b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..3a1b71fcd088655b6bbe99a00b830e53c565fd0e
--- /dev/null
+++ b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.json
@@ -0,0 +1 @@
+{"hash": "212d5b26ed6e7952d6909b530fb9c25e23f448d78cecb7f927aef2cef01d405a", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 32, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm_view_0"}
\ No newline at end of file
diff --git a/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.llir b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..085e37c7d4367276f4f4c95ae5d98ddd15eaa068
--- /dev/null
+++ b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.llir
@@ -0,0 +1,136 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused__fused_rms_norm_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 {
+  %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %8 = shl i32 %7, 3, !dbg !8
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %10 = and i32 %9, 224, !dbg !9
+  %11 = lshr exact i32 %10, 5, !dbg !9
+  %12 = and i32 %9, 7, !dbg !9
+  %13 = or disjoint i32 %11, %8, !dbg !10
+  %14 = or disjoint i32 %8, %12, !dbg !10
+  %15 = shl nuw nsw i32 %9, 2, !dbg !11
+  %16 = and i32 %15, 124, !dbg !11
+  %17 = sdiv i32 %13, 32, !dbg !12
+  %18 = mul i32 %17, 32, !dbg !13
+  %.decomposed = sub i32 %13, %18, !dbg !13
+  %19 = shl nsw i32 %.decomposed, 7, !dbg !14
+  %20 = or disjoint i32 %19, %16, !dbg !15
+  %21 = mul i32 %17, 12288, !dbg !16
+  %22 = add i32 %20, %21, !dbg !17
+  %23 = sext i32 %22 to i64, !dbg !18
+  %24 = getelementptr bfloat, ptr addrspace(1) %0, i64 %23, !dbg !18
+  %25 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !19
+  %26 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %24, i64 %25, i1 true) #4, !dbg !19
+  %27 = extractvalue { i32, i32 } %26, 0, !dbg !19
+  %28 = bitcast i32 %27 to <2 x bfloat>, !dbg !19
+  %29 = extractvalue { i32, i32 } %26, 1, !dbg !19
+  %30 = bitcast i32 %29 to <2 x bfloat>, !dbg !19
+  %31 = extractelement <2 x bfloat> %28, i64 0, !dbg !19
+  %32 = extractelement <2 x bfloat> %28, i64 1, !dbg !19
+  %33 = extractelement <2 x bfloat> %30, i64 0, !dbg !19
+  %34 = extractelement <2 x bfloat> %30, i64 1, !dbg !19
+  %35 = fpext bfloat %31 to float, !dbg !20
+  %36 = fpext bfloat %32 to float, !dbg !20
+  %37 = fpext bfloat %33 to float, !dbg !20
+  %38 = fpext bfloat %34 to float, !dbg !20
+  %39 = fmul float %35, %35, !dbg !21
+  %40 = fmul float %36, %36, !dbg !21
+  %41 = fmul float %37, %37, !dbg !21
+  %42 = fmul float %38, %38, !dbg !21
+  %43 = fadd float %39, %40, !dbg !22
+  %44 = fadd float %41, %43, !dbg !22
+  %45 = fadd float %42, %44, !dbg !22
+  %46 = bitcast float %45 to i32, !dbg !25
+  %47 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %46, i32 16, i32 31), !dbg !25
+  %48 = bitcast i32 %47 to float, !dbg !25
+  %49 = fadd float %45, %48, !dbg !22
+  %50 = bitcast float %49 to i32, !dbg !25
+  %51 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %50, i32 8, i32 31), !dbg !25
+  %52 = bitcast i32 %51 to float, !dbg !25
+  %53 = fadd float %49, %52, !dbg !22
+  %54 = bitcast float %53 to i32, !dbg !25
+  %55 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %54, i32 4, i32 31), !dbg !25
+  %56 = bitcast i32 %55 to float, !dbg !25
+  %57 = fadd float %53, %56, !dbg !22
+  %58 = bitcast float %57 to i32, !dbg !25
+  %59 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %58, i32 2, i32 31), !dbg !25
+  %60 = bitcast i32 %59 to float, !dbg !25
+  %61 = fadd float %57, %60, !dbg !22
+  %62 = bitcast float %61 to i32, !dbg !25
+  %63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %62, i32 1, i32 31), !dbg !25
+  %64 = bitcast i32 %63 to float, !dbg !25
+  %65 = fadd float %61, %64, !dbg !22
+  %66 = lshr exact i32 %10, 3, !dbg !28
+  %67 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %66, !dbg !28
+  store float %65, ptr addrspace(3) %67, align 4, !dbg !28
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28
+  %68 = shl nuw nsw i32 %12, 2, !dbg !28
+  %69 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %68, !dbg !28
+  %70 = load i32, ptr addrspace(3) %69, align 4, !dbg !28
+  %71 = sext i32 %14 to i64, !dbg !29
+  %72 = getelementptr float, ptr addrspace(1) %1, i64 %71, !dbg !29
+  %73 = and i32 %9, 248, !dbg !30
+  %74 = icmp eq i32 %73, 0, !dbg !30
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %70, ptr addrspace(1) %72, i1 %74) #4, !dbg !30
+  ret void, !dbg !31
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
+
+attributes #0 = { nounwind "nvvm.reqntid"="256" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { convergent nocallback nounwind }
+attributes #4 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm_view_0", linkageName: "triton_red_fused__fused_rms_norm_view_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 23, column: 28, scope: !4)
+!8 = !DILocation(line: 23, column: 33, scope: !4)
+!9 = !DILocation(line: 24, column: 44, scope: !4)
+!10 = !DILocation(line: 24, column: 23, scope: !4)
+!11 = !DILocation(line: 26, column: 37, scope: !4)
+!12 = !DILocation(line: 29, column: 19, scope: !4)
+!13 = !DILocation(line: 28, column: 19, scope: !4)
+!14 = !DILocation(line: 38, column: 45, scope: !4)
+!15 = !DILocation(line: 38, column: 41, scope: !4)
+!16 = !DILocation(line: 38, column: 56, scope: !4)
+!17 = !DILocation(line: 38, column: 50, scope: !4)
+!18 = !DILocation(line: 38, column: 34, scope: !4)
+!19 = !DILocation(line: 38, column: 61, scope: !4)
+!20 = !DILocation(line: 38, column: 115, scope: !4)
+!21 = !DILocation(line: 40, column: 22, scope: !4)
+!22 = !DILocation(line: 263, column: 15, scope: !23, inlinedAt: !25)
+!23 = distinct !DILexicalBlockFile(scope: !4, file: !24, discriminator: 0)
+!24 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!25 = !DILocation(line: 293, column: 36, scope: !23, inlinedAt: !26)
+!26 = !DILocation(line: 44, column: 25, scope: !27)
+!27 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0)
+!28 = !DILocation(line: 44, column: 28, scope: !4)
+!29 = !DILocation(line: 45, column: 25, scope: !4)
+!30 = !DILocation(line: 45, column: 36, scope: !4)
+!31 = !DILocation(line: 45, column: 4, scope: !4)
diff --git a/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.ptx b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..0f09fb949704c6303aee5f1f135349deac362169
--- /dev/null
+++ b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.ptx
@@ -0,0 +1,506 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused__fused_rms_norm_view_0 // -- Begin function triton_red_fused__fused_rms_norm_view_0
+.extern .shared .align 16 .b8 global_smem[];
+                                        // @triton_red_fused__fused_rms_norm_view_0
+.visible .entry triton_red_fused__fused_rms_norm_view_0(
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_1,
+	.param .u32 triton_red_fused__fused_rms_norm_view_0_param_2,
+	.param .u32 triton_red_fused__fused_rms_norm_view_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_4,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_5
+)
+.reqntid 256
+{
+	.reg .pred 	%p<3>;
+	.reg .b16 	%rs<5>;
+	.reg .b32 	%r<48>;
+	.reg .b64 	%rd<6>;
+	.loc	1 18 0                          // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd4, [triton_red_fused__fused_rms_norm_view_0_param_0];
+	ld.param.b64 	%rd5, [triton_red_fused__fused_rms_norm_view_0_param_1];
+$L__tmp0:
+	.loc	1 23 28                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:23:28
+	mov.u32 	%r5, %ctaid.x;
+	.loc	1 23 33                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:23:33
+	shl.b32 	%r6, %r5, 3;
+	.loc	1 24 44                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:44
+	mov.u32 	%r7, %tid.x;
+	and.b32 	%r8, %r7, 224;
+	bfe.u32 	%r9, %r7, 5, 3;
+	and.b32 	%r10, %r7, 7;
+	.loc	1 24 23                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:23
+	or.b32 	%r11, %r9, %r6;
+	or.b32 	%r12, %r6, %r10;
+	.loc	1 26 37                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:26:37
+	shl.b32 	%r13, %r7, 2;
+	and.b32 	%r14, %r13, 124;
+	.loc	1 29 19                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:29:19
+	bfe.s32 	%r15, %r5, 28, 1;
+	shr.u32 	%r16, %r15, 27;
+	add.s32 	%r17, %r11, %r16;
+	shr.u32 	%r18, %r17, 5;
+	.loc	1 28 19                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:28:19
+	and.b32 	%r19, %r17, 33554400;
+	sub.s32 	%r20, %r11, %r19;
+	.loc	1 38 45                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:45
+	shl.b32 	%r21, %r20, 7;
+	.loc	1 38 41                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:41
+	or.b32 	%r22, %r21, %r14;
+	.loc	1 38 50                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:50
+	mad.lo.s32 	%r23, %r18, 12288, %r22;
+	.loc	1 38 34                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:34
+	mad.wide.s32 	%rd1, %r23, 2, %rd4;
+	.loc	1 38 61                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:61
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r3, 0;
+	mov.pred 	%p1, -1;
+	// begin inline asm
+	mov.u32 %r1, %r3;
+	mov.u32 %r2, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	mov.b32 	{%rs1, %rs2}, %r1;
+	mov.b32 	{%rs3, %rs4}, %r2;
+	.loc	1 38 115                        // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115
+	cvt.f32.bf16 	%r24, %rs1;
+	cvt.f32.bf16 	%r25, %rs2;
+	cvt.f32.bf16 	%r26, %rs3;
+	cvt.f32.bf16 	%r27, %rs4;
+	.loc	1 40 22                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:40:22
+	mul.f32 	%r28, %r25, %r25;
+$L__tmp1:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	fma.rn.f32 	%r29, %r24, %r24, %r28;
+	fma.rn.f32 	%r30, %r26, %r26, %r29;
+	fma.rn.f32 	%r31, %r27, %r27, %r30;
+$L__tmp2:
+	.loc	2 293 36                        // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ]
+	shfl.sync.bfly.b32 	%r32, %r31, 16, 31, -1;
+$L__tmp3:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	add.f32 	%r33, %r31, %r32;
+$L__tmp4:
+	.loc	2 293 36                        // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ]
+	shfl.sync.bfly.b32 	%r34, %r33, 8, 31, -1;
+$L__tmp5:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	add.f32 	%r35, %r33, %r34;
+$L__tmp6:
+	.loc	2 293 36                        // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ]
+	shfl.sync.bfly.b32 	%r36, %r35, 4, 31, -1;
+$L__tmp7:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	add.f32 	%r37, %r35, %r36;
+$L__tmp8:
+	.loc	2 293 36                        // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ]
+	shfl.sync.bfly.b32 	%r38, %r37, 2, 31, -1;
+$L__tmp9:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	add.f32 	%r39, %r37, %r38;
+$L__tmp10:
+	.loc	2 293 36                        // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ]
+	shfl.sync.bfly.b32 	%r40, %r39, 1, 31, -1;
+$L__tmp11:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	add.f32 	%r41, %r39, %r40;
+$L__tmp12:
+	.loc	1 44 28                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:28
+	shr.u32 	%r42, %r8, 3;
+	mov.b32 	%r43, global_smem;
+	add.s32 	%r44, %r43, %r42;
+	st.shared.b32 	[%r44], %r41;
+	bar.sync 	0;
+	shl.b32 	%r45, %r10, 2;
+	add.s32 	%r46, %r43, %r45;
+	ld.shared.b32 	%r4, [%r46];
+	.loc	1 45 25                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:25
+	mad.wide.s32 	%rd3, %r12, 4, %rd5;
+	.loc	1 45 36                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:36
+	and.b32 	%r47, %r7, 248;
+	setp.eq.b32 	%p2, %r47, 0;
+	// begin inline asm
+	@%p2 st.global.b32 [ %rd3 + 0 ], { %r4 };
+	// end inline asm
+	.loc	1 45 4                          // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:4
+	ret;
+$L__tmp13:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 5                                   // DW_FORM_data2
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 339                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x14c DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 119
+.b8 118
+.b8 121
+.b8 116
+.b8 52
+.b8 50
+.b8 55
+.b8 51
+.b8 105
+.b8 117
+.b8 51
+.b8 51
+.b8 109
+.b8 112
+.b8 101
+.b8 101
+.b8 55
+.b8 104
+.b8 98
+.b8 101
+.b8 116
+.b8 53
+.b8 106
+.b8 53
+.b8 101
+.b8 113
+.b8 52
+.b8 52
+.b8 100
+.b8 54
+.b8 102
+.b8 115
+.b8 104
+.b8 103
+.b8 119
+.b8 107
+.b8 121
+.b8 120
+.b8 107
+.b8 110
+.b8 53
+.b8 50
+.b8 103
+.b8 103
+.b8 103
+.b8 107
+.b8 105
+.b8 113
+.b8 104
+.b8 106
+.b8 53
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 119
+.b8 118
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x2a DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 114
+.b8 109
+.b8 115
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 118
+.b8 105
+.b8 101
+.b8 119
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x10e:0x48 DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x123:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp12                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 44                                  // DW_AT_call_line
+.b8 25                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x13b:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp12                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.source b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..763d5ae6c4f5ff5e6eb3a9630f6cd5ec5350cbb9
--- /dev/null
+++ b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.source
@@ -0,0 +1,167 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0)
+#loc35 = loc(unknown)
+#loc38 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0)
+#loc42 = loc("in_ptr0"(#loc))
+#loc43 = loc("out_ptr0"(#loc))
+#loc44 = loc("xnumel"(#loc))
+#loc45 = loc("r0_numel"(#loc))
+#loc74 = loc("input"(#loc33))
+#loc75 = loc("a"(#loc38))
+#loc76 = loc("b"(#loc38))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 8192 : i32 loc(#loc46)
+    %r0_numel_1 = arith.constant 128 : i32 loc(#loc47)
+    %xoffset = tt.get_program_id x : i32 loc(#loc48)
+    %xoffset_2 = arith.constant 8 : i32 loc(#loc49)
+    %xoffset_3 = arith.constant 8 : i32 loc(#loc49)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc49)
+    %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc50)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc51)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<8x1xi32> loc(#loc52)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<8x1xi32> loc(#loc52)
+    %xmask = arith.constant true loc(#loc53)
+    %xmask_8 = arith.constant dense<true> : tensor<8x128xi1> loc(#loc53)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc54)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc55)
+    %x0 = arith.constant 32 : i32 loc(#loc56)
+    %x0_10 = arith.constant 32 : i32 loc(#loc56)
+    %x0_11 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc56)
+    %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<8x1xi32> loc(#loc56)
+    %x1 = arith.constant 32 : i32 loc(#loc57)
+    %x1_13 = arith.constant 32 : i32 loc(#loc57)
+    %x1_14 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc57)
+    %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<8x1xi32> loc(#loc57)
+    %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc58)
+    %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc58)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc14)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc14)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14)
+    %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc14)
+    %3 = ub.poison : i32 loc(#loc14)
+    %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_19 = %_tmp4_16) -> (tensor<8x128xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc60)
+      %r0_index_20 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc60)
+      %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc61)
+      %r0_mask_21 = arith.cmpi slt, %r0_index_20, %r0_mask : tensor<1x128xi32> loc(#loc61)
+      %tmp0 = arith.constant 128 : i32 loc(#loc62)
+      %tmp0_22 = arith.constant 128 : i32 loc(#loc62)
+      %tmp0_23 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc62)
+      %tmp0_24 = arith.muli %tmp0_23, %x0_12 : tensor<8x1xi32> loc(#loc62)
+      %tmp0_25 = tt.broadcast %r0_index_20 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc63)
+      %tmp0_26 = tt.broadcast %tmp0_24 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc63)
+      %tmp0_27 = arith.addi %tmp0_25, %tmp0_26 : tensor<8x128xi32> loc(#loc63)
+      %tmp0_28 = arith.constant 12288 : i32 loc(#loc64)
+      %tmp0_29 = arith.constant 12288 : i32 loc(#loc64)
+      %tmp0_30 = arith.constant dense<12288> : tensor<8x1xi32> loc(#loc64)
+      %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<8x1xi32> loc(#loc64)
+      %tmp0_32 = tt.broadcast %tmp0_31 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc65)
+      %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<8x128xi32> loc(#loc65)
+      %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc66)
+      %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc66)
+      %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc67)
+      %tmp0_37 = tt.broadcast %r0_mask_21 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc67)
+      %tmp0_38 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc67)
+      %tmp0_39 = arith.truncf %tmp0_38 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc67)
+      %tmp0_40 = tt.load %tmp0_35, %tmp0_37, %tmp0_39 evictionPolicy = evict_first : tensor<8x128x!tt.ptr<bf16>> loc(#loc67)
+      %tmp0_41 = arith.extf %tmp0_40 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc68)
+      %tmp2 = arith.mulf %tmp0_41, %tmp0_41 : tensor<8x128xf32> loc(#loc69)
+      %tmp5 = arith.addf %_tmp4_19, %tmp2 : tensor<8x128xf32> loc(#loc70)
+      %_tmp4_42 = tt.broadcast %r0_mask_21 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc71)
+      %_tmp4_43 = arith.select %_tmp4_42, %tmp5, %_tmp4_19 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc71)
+      scf.yield %_tmp4_43 : tensor<8x128xf32> loc(#loc27)
+    } loc(#loc59)
+    %tmp4 = tt.call @"triton.language.standard.sum__fp32S8_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc72)
+    %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc73)
+    %4 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<8x1x!tt.ptr<f32>> loc(#loc30)
+    %5 = tt.addptr %4, %xindex_7 : tensor<8x1x!tt.ptr<f32>>, tensor<8x1xi32> loc(#loc30)
+    tt.store %5, %tmp4_18 : tensor<8x1x!tt.ptr<f32>> loc(#loc31)
+    tt.return loc(#loc32)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.sum__fp32S8_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x128xf32> loc("input"(#loc33))) -> tensor<8xf32> attributes {noinline = false} {
+    %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc34)
+      tt.reduce.return %2 : f32 loc(#loc34)
+    }) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc34)
+    tt.return %0 : tensor<8xf32> loc(#loc36)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<8xf32> loc(#loc37)
+    tt.return %1 : tensor<8xf32> loc(#loc37)
+  } loc(#loc33)
+  tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc38)), %b: f32 loc("b"(#loc38))) -> f32 attributes {noinline = false} {
+    %0 = arith.addf %a, %b : f32 loc(#loc39)
+    tt.return %0 : f32 loc(#loc40)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : f32 loc(#loc41)
+    tt.return %1 : f32 loc(#loc41)
+  } loc(#loc38)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":25:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":30:43)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:8)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11)
+#loc37 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4)
+#loc39 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc40 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11)
+#loc41 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4)
+#loc46 = loc("xnumel"(#loc1))
+#loc47 = loc("r0_numel"(#loc2))
+#loc48 = loc("xoffset"(#loc3))
+#loc49 = loc("xoffset"(#loc4))
+#loc50 = loc("xindex"(#loc5))
+#loc51 = loc("xindex"(#loc6))
+#loc52 = loc("xindex"(#loc7))
+#loc53 = loc("xmask"(#loc8))
+#loc54 = loc("r0_base"(#loc9))
+#loc55 = loc("r0_base"(#loc10))
+#loc56 = loc("x0"(#loc11))
+#loc57 = loc("x1"(#loc12))
+#loc58 = loc("_tmp4"(#loc13))
+#loc59 = loc("_tmp4"(#loc14))
+#loc60 = loc("r0_index"(#loc15))
+#loc61 = loc("r0_mask"(#loc16))
+#loc62 = loc("tmp0"(#loc17))
+#loc63 = loc("tmp0"(#loc18))
+#loc64 = loc("tmp0"(#loc19))
+#loc65 = loc("tmp0"(#loc20))
+#loc66 = loc("tmp0"(#loc21))
+#loc67 = loc("tmp0"(#loc22))
+#loc68 = loc("tmp0"(#loc23))
+#loc69 = loc("tmp2"(#loc24))
+#loc70 = loc("tmp5"(#loc25))
+#loc71 = loc("_tmp4"(#loc26))
+#loc72 = loc("tmp4"(#loc28))
+#loc73 = loc("tmp4"(#loc29))
diff --git a/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.ttgir b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..64fd00f3aff1339d633f51873800f3b13fdacf1f
--- /dev/null
+++ b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.ttgir
@@ -0,0 +1,108 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 8], order = [0, 1]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0)
+#loc1 = loc(unknown)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25)
+#loc27 = loc("in_ptr0"(#loc))
+#loc28 = loc("out_ptr0"(#loc))
+#loc29 = loc("xnumel"(#loc))
+#loc30 = loc("r0_numel"(#loc))
+#loc49 = loc("tmp4"(#loc21))
+#loc52 = loc(callsite(#loc1 at #loc49))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<128> : tensor<8x1xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<12288> : tensor<8x1xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<32> : tensor<8x1xi32, #blocked> loc(#loc1)
+    %c8_i32 = arith.constant 8 : i32 loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<8x128xbf16, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<8x128xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc31)
+    %xoffset_5 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc32)
+    %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc33)
+    %xindex_6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc33)
+    %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc33)
+    %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xi32, #blocked1> loc(#loc33)
+    %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<8x1xi32, #blocked> loc(#loc34)
+    %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<8x1xi32, #blocked1> loc(#loc34)
+    %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<8x1xi32, #blocked> loc(#loc34)
+    %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<8x1xi32, #blocked1> loc(#loc34)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc35)
+    %r0_base_13 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc35)
+    %x0 = arith.remsi %xindex_11, %cst_2 : tensor<8x1xi32, #blocked> loc(#loc36)
+    %x1 = arith.divsi %xindex_11, %cst_2 : tensor<8x1xi32, #blocked> loc(#loc37)
+    %r0_mask = arith.cmpi slt, %r0_base_13, %cst : tensor<1x128xi32, #blocked> loc(#loc38)
+    %tmp0 = arith.muli %x0, %cst_0 : tensor<8x1xi32, #blocked> loc(#loc39)
+    %tmp0_14 = tt.broadcast %r0_base_13 : tensor<1x128xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc40)
+    %tmp0_15 = tt.broadcast %tmp0 : tensor<8x1xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc40)
+    %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<8x128xi32, #blocked> loc(#loc40)
+    %tmp0_17 = arith.muli %x1, %cst_1 : tensor<8x1xi32, #blocked> loc(#loc41)
+    %tmp0_18 = tt.broadcast %tmp0_17 : tensor<8x1xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc42)
+    %tmp0_19 = arith.addi %tmp0_16, %tmp0_18 : tensor<8x128xi32, #blocked> loc(#loc42)
+    %tmp0_20 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>, #blocked> loc(#loc43)
+    %tmp0_21 = tt.addptr %tmp0_20, %tmp0_19 : tensor<8x128x!tt.ptr<bf16>, #blocked>, tensor<8x128xi32, #blocked> loc(#loc43)
+    %tmp0_22 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked> -> tensor<8x128xi1, #blocked> loc(#loc44)
+    %tmp0_23 = tt.load %tmp0_21, %tmp0_22, %cst_3 evictionPolicy = evict_first : tensor<8x128x!tt.ptr<bf16>, #blocked> loc(#loc44)
+    %tmp0_24 = arith.extf %tmp0_23 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc45)
+    %tmp2 = arith.mulf %tmp0_24, %tmp0_24 : tensor<8x128xf32, #blocked> loc(#loc46)
+    %tmp5 = arith.addf %tmp2, %cst_4 : tensor<8x128xf32, #blocked> loc(#loc47)
+    %_tmp4 = arith.select %tmp0_22, %tmp5, %cst_4 : tensor<8x128xi1, #blocked>, tensor<8x128xf32, #blocked> loc(#loc48)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_27: f32 loc(callsite(#loc1 at #loc49)), %tmp4_28: f32 loc(callsite(#loc1 at #loc49))):
+      %tmp4_29 = arith.addf %tmp4_27, %tmp4_28 : f32 loc(#loc53)
+      tt.reduce.return %tmp4_29 : f32 loc(#loc51)
+    }) : (tensor<8x128xf32, #blocked>) -> tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc51)
+    %tmp4_25 = ttg.convert_layout %tmp4 : tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc50)
+    %tmp4_26 = tt.expand_dims %tmp4_25 {axis = 1 : i32} : tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xf32, #blocked1> loc(#loc50)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<8x1x!tt.ptr<f32>, #blocked1> loc(#loc24)
+    %1 = tt.addptr %0, %xindex_12 : tensor<8x1x!tt.ptr<f32>, #blocked1>, tensor<8x1xi32, #blocked1> loc(#loc24)
+    tt.store %1, %tmp4_26 : tensor<8x1x!tt.ptr<f32>, #blocked1> loc(#loc25)
+    tt.return loc(#loc26)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4)
+#loc31 = loc("xoffset"(#loc2))
+#loc32 = loc("xoffset"(#loc3))
+#loc33 = loc("xindex"(#loc4))
+#loc34 = loc("xindex"(#loc5))
+#loc35 = loc("r0_base"(#loc6))
+#loc36 = loc("x0"(#loc7))
+#loc37 = loc("x1"(#loc8))
+#loc38 = loc("r0_mask"(#loc9))
+#loc39 = loc("tmp0"(#loc10))
+#loc40 = loc("tmp0"(#loc11))
+#loc41 = loc("tmp0"(#loc12))
+#loc42 = loc("tmp0"(#loc13))
+#loc43 = loc("tmp0"(#loc14))
+#loc44 = loc("tmp0"(#loc15))
+#loc45 = loc("tmp0"(#loc16))
+#loc46 = loc("tmp2"(#loc17))
+#loc47 = loc("tmp5"(#loc18))
+#loc48 = loc("_tmp4"(#loc19))
+#loc50 = loc("tmp4"(#loc23))
+#loc51 = loc(callsite(#loc20 at #loc49))
+#loc53 = loc(callsite(#loc22 at #loc51))
diff --git a/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.ttir b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..c61dade683e1a51730becab8945fdb2e97b38e3d
--- /dev/null
+++ b/triton/EEWVWJXNNZ4VFVUQTNJQ7OOCLYR7ISGXRTWLP6JHV3ZM54A5IBNA/triton_red_fused__fused_rms_norm_view_0.ttir
@@ -0,0 +1,105 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0)
+#loc2 = loc(unknown)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25)
+#loc29 = loc("in_ptr0"(#loc))
+#loc30 = loc("out_ptr0"(#loc))
+#loc31 = loc("xnumel"(#loc))
+#loc32 = loc("r0_numel"(#loc))
+#loc53 = loc("tmp4"(#loc23))
+#loc56 = loc(callsite(#loc2 at #loc53))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %tmp0 = arith.constant dense<0.000000e+00> : tensor<8x128xbf16> loc(#loc33)
+    %cst = arith.constant dense<12288> : tensor<8x1xi32> loc(#loc2)
+    %cst_0 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc2)
+    %cst_1 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc2)
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc2)
+    %cst_3 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc2)
+    %c8_i32 = arith.constant 8 : i32 loc(#loc2)
+    %xoffset = tt.get_program_id x : i32 loc(#loc34)
+    %xoffset_4 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc35)
+    %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc36)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc37)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<8x1xi32> loc(#loc38)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<8x1xi32> loc(#loc38)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc39)
+    %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc40)
+    %x0 = arith.remsi %xindex_7, %cst_3 : tensor<8x1xi32> loc(#loc41)
+    %x1 = arith.divsi %xindex_7, %cst_3 : tensor<8x1xi32> loc(#loc42)
+    %r0_mask = arith.cmpi slt, %r0_base_8, %cst_1 : tensor<1x128xi32> loc(#loc43)
+    %tmp0_9 = arith.muli %x0, %cst_0 : tensor<8x1xi32> loc(#loc44)
+    %tmp0_10 = tt.broadcast %r0_base_8 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc45)
+    %tmp0_11 = tt.broadcast %tmp0_9 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc45)
+    %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<8x128xi32> loc(#loc45)
+    %tmp0_13 = arith.muli %x1, %cst : tensor<8x1xi32> loc(#loc46)
+    %tmp0_14 = tt.broadcast %tmp0_13 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc47)
+    %tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<8x128xi32> loc(#loc47)
+    %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x128x!tt.ptr<bf16>> loc(#loc48)
+    %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<8x128x!tt.ptr<bf16>>, tensor<8x128xi32> loc(#loc48)
+    %tmp0_18 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc33)
+    %tmp0_19 = tt.load %tmp0_17, %tmp0_18, %tmp0 evictionPolicy = evict_first : tensor<8x128x!tt.ptr<bf16>> loc(#loc33)
+    %tmp0_20 = arith.extf %tmp0_19 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc49)
+    %tmp2 = arith.mulf %tmp0_20, %tmp0_20 : tensor<8x128xf32> loc(#loc50)
+    %tmp5 = arith.addf %tmp2, %cst_2 : tensor<8x128xf32> loc(#loc51)
+    %_tmp4 = arith.select %tmp0_18, %tmp5, %cst_2 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc52)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_22: f32 loc(callsite(#loc2 at #loc53)), %tmp4_23: f32 loc(callsite(#loc2 at #loc53))):
+      %tmp4_24 = arith.addf %tmp4_22, %tmp4_23 : f32 loc(#loc57)
+      tt.reduce.return %tmp4_24 : f32 loc(#loc55)
+    }) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc55)
+    %tmp4_21 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc54)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<8x1x!tt.ptr<f32>> loc(#loc26)
+    %1 = tt.addptr %0, %xindex_7 : tensor<8x1x!tt.ptr<f32>>, tensor<8x1xi32> loc(#loc26)
+    tt.store %1, %tmp4_21 : tensor<8x1x!tt.ptr<f32>> loc(#loc27)
+    tt.return loc(#loc28)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:27)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4)
+#loc33 = loc("tmp0"(#loc1))
+#loc34 = loc("xoffset"(#loc3))
+#loc35 = loc("xoffset"(#loc4))
+#loc36 = loc("xindex"(#loc5))
+#loc37 = loc("xindex"(#loc6))
+#loc38 = loc("xindex"(#loc7))
+#loc39 = loc("r0_base"(#loc8))
+#loc40 = loc("r0_base"(#loc9))
+#loc41 = loc("x0"(#loc10))
+#loc42 = loc("x1"(#loc11))
+#loc43 = loc("r0_mask"(#loc12))
+#loc44 = loc("tmp0"(#loc13))
+#loc45 = loc("tmp0"(#loc14))
+#loc46 = loc("tmp0"(#loc15))
+#loc47 = loc("tmp0"(#loc16))
+#loc48 = loc("tmp0"(#loc17))
+#loc49 = loc("tmp0"(#loc18))
+#loc50 = loc("tmp2"(#loc19))
+#loc51 = loc("tmp5"(#loc20))
+#loc52 = loc("_tmp4"(#loc21))
+#loc54 = loc("tmp4"(#loc25))
+#loc55 = loc(callsite(#loc22 at #loc53))
+#loc57 = loc(callsite(#loc24 at #loc55))
diff --git a/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/__grp__triton_red_fused__fused_rms_norm_view_0.json b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/__grp__triton_red_fused__fused_rms_norm_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..967fbaa1dc2598bf06b75b858b5f1530289d91fb
--- /dev/null
+++ b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/__grp__triton_red_fused__fused_rms_norm_view_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused__fused_rms_norm_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.source", "triton_red_fused__fused_rms_norm_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.ttir", "triton_red_fused__fused_rms_norm_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.ttgir", "triton_red_fused__fused_rms_norm_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.llir", "triton_red_fused__fused_rms_norm_view_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.ptx", "triton_red_fused__fused_rms_norm_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.cubin", "triton_red_fused__fused_rms_norm_view_0.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.json"}}
\ No newline at end of file
diff --git a/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.cubin b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..52f4901b21e9858eb716de05eed9fd4a59e501bf
Binary files /dev/null and b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.cubin differ
diff --git a/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.json b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..a5c6a53ab457bcb3054f9470bfa06e4d66fdccec
--- /dev/null
+++ b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.json
@@ -0,0 +1 @@
+{"hash": "236e50030a823b004916470bd0e913f39ff5fabe5609d223e91f95f6f6c36bfb", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm_view_0"}
\ No newline at end of file
diff --git a/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.llir b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..cb65d38c3937385c4545720e6794374b243fbb2f
--- /dev/null
+++ b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.llir
@@ -0,0 +1,120 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused__fused_rms_norm_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 {
+  %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %8 = shl i32 %7, 6, !dbg !8
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %10 = and i32 %9, 252, !dbg !9
+  %11 = lshr exact i32 %10, 2, !dbg !9
+  %12 = or disjoint i32 %11, %8, !dbg !10
+  %13 = and i32 %9, 3, !dbg !11
+  %14 = sdiv i32 %12, 32, !dbg !12
+  %15 = mul i32 %14, 32, !dbg !13
+  %.decomposed = sub i32 %12, %15, !dbg !13
+  %16 = shl nsw i32 %.decomposed, 7, !dbg !14
+  %17 = mul i32 %14, 12288, !dbg !15
+  %18 = or disjoint i32 %16, %13
+  %19 = add i32 %18, %17
+  br label %20, !dbg !16
+
+20:                                               ; preds = %6, %20
+  %indvars.iv = phi i64 [ 0, %6 ], [ %indvars.iv.next, %20 ]
+  %21 = phi float [ 0.000000e+00, %6 ], [ %31, %20 ]
+  %22 = trunc nuw nsw i64 %indvars.iv to i32, !dbg !17
+  %23 = add i32 %19, %22, !dbg !17
+  %24 = sext i32 %23 to i64, !dbg !18
+  %25 = getelementptr bfloat, ptr addrspace(1) %0, i64 %24, !dbg !18
+  %26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !19
+  %27 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %25, i64 %26, i1 true) #4, !dbg !19
+  %28 = bitcast i16 %27 to bfloat, !dbg !19
+  %29 = fpext bfloat %28 to float, !dbg !20
+  %30 = fmul float %29, %29, !dbg !21
+  %31 = fadd float %21, %30, !dbg !22
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4, !dbg !16
+  %32 = icmp samesign ult i64 %indvars.iv, 124, !dbg !16
+  br i1 %32, label %20, label %33, !dbg !16
+
+33:                                               ; preds = %20
+  %34 = and i32 %9, 63, !dbg !9
+  %35 = or disjoint i32 %8, %34, !dbg !10
+  %36 = bitcast float %31 to i32, !dbg !23
+  %37 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %36, i32 2, i32 31), !dbg !23
+  %38 = bitcast i32 %37 to float, !dbg !23
+  %39 = fadd float %31, %38, !dbg !28
+  %40 = bitcast float %39 to i32, !dbg !23
+  %41 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %40, i32 1, i32 31), !dbg !23
+  %42 = bitcast i32 %41 to float, !dbg !23
+  %43 = fadd float %39, %42, !dbg !28
+  %44 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %10, !dbg !29
+  store float %43, ptr addrspace(3) %44, align 4, !dbg !29
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !29
+  %45 = shl nuw nsw i32 %34, 2, !dbg !29
+  %46 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %45, !dbg !29
+  %47 = load i32, ptr addrspace(3) %46, align 4, !dbg !29
+  %48 = sext i32 %35 to i64, !dbg !30
+  %49 = getelementptr float, ptr addrspace(1) %1, i64 %48, !dbg !30
+  %50 = and i32 %9, 192, !dbg !31
+  %51 = icmp eq i32 %50, 0, !dbg !31
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %47, ptr addrspace(1) %49, i1 %51) #4, !dbg !31
+  ret void, !dbg !32
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
+
+attributes #0 = { nounwind "nvvm.reqntid"="256" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { convergent nocallback nounwind }
+attributes #4 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm_view_0", linkageName: "triton_red_fused__fused_rms_norm_view_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 23, column: 28, scope: !4)
+!8 = !DILocation(line: 23, column: 33, scope: !4)
+!9 = !DILocation(line: 24, column: 44, scope: !4)
+!10 = !DILocation(line: 24, column: 23, scope: !4)
+!11 = !DILocation(line: 26, column: 37, scope: !4)
+!12 = !DILocation(line: 29, column: 19, scope: !4)
+!13 = !DILocation(line: 28, column: 19, scope: !4)
+!14 = !DILocation(line: 38, column: 45, scope: !4)
+!15 = !DILocation(line: 38, column: 56, scope: !4)
+!16 = !DILocation(line: 32, column: 43, scope: !4)
+!17 = !DILocation(line: 38, column: 50, scope: !4)
+!18 = !DILocation(line: 38, column: 34, scope: !4)
+!19 = !DILocation(line: 38, column: 61, scope: !4)
+!20 = !DILocation(line: 38, column: 115, scope: !4)
+!21 = !DILocation(line: 40, column: 22, scope: !4)
+!22 = !DILocation(line: 42, column: 23, scope: !4)
+!23 = !DILocation(line: 293, column: 36, scope: !24, inlinedAt: !26)
+!24 = distinct !DILexicalBlockFile(scope: !4, file: !25, discriminator: 0)
+!25 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!26 = !DILocation(line: 44, column: 25, scope: !27)
+!27 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0)
+!28 = !DILocation(line: 263, column: 15, scope: !24, inlinedAt: !23)
+!29 = !DILocation(line: 44, column: 28, scope: !4)
+!30 = !DILocation(line: 45, column: 25, scope: !4)
+!31 = !DILocation(line: 45, column: 36, scope: !4)
+!32 = !DILocation(line: 45, column: 4, scope: !4)
diff --git a/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.ptx b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..482078abac6ee361f4f260d598f864b94dac4c30
--- /dev/null
+++ b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.ptx
@@ -0,0 +1,486 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused__fused_rms_norm_view_0 // -- Begin function triton_red_fused__fused_rms_norm_view_0
+.extern .shared .align 16 .b8 global_smem[];
+                                        // @triton_red_fused__fused_rms_norm_view_0
+.visible .entry triton_red_fused__fused_rms_norm_view_0(
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_1,
+	.param .u32 triton_red_fused__fused_rms_norm_view_0_param_2,
+	.param .u32 triton_red_fused__fused_rms_norm_view_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_4,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_5
+)
+.reqntid 256
+{
+	.reg .pred 	%p<4>;
+	.reg .b16 	%rs<3>;
+	.reg .b32 	%r<33>;
+	.reg .b64 	%rd<9>;
+	.loc	1 18 0                          // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd3, [triton_red_fused__fused_rms_norm_view_0_param_1];
+	ld.param.b64 	%rd2, [triton_red_fused__fused_rms_norm_view_0_param_0];
+$L__tmp0:
+	.loc	1 23 28                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:23:28
+	mov.u32 	%r4, %ctaid.x;
+	.loc	1 23 33                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:23:33
+	shl.b32 	%r1, %r4, 6;
+	.loc	1 24 44                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:44
+	mov.u32 	%r2, %tid.x;
+	and.b32 	%r3, %r2, 252;
+	bfe.u32 	%r5, %r2, 2, 6;
+	.loc	1 24 23                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:23
+	or.b32 	%r6, %r5, %r1;
+	.loc	1 26 37                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:26:37
+	and.b32 	%r7, %r2, 3;
+	.loc	1 29 19                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:29:19
+	bfe.s32 	%r8, %r4, 25, 1;
+	shr.u32 	%r9, %r8, 27;
+	add.s32 	%r10, %r6, %r9;
+	shr.u32 	%r11, %r10, 5;
+	.loc	1 32 43                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:32:43
+	add.s32 	%r12, %r4, %r11;
+	shl.b32 	%r13, %r12, 13;
+	shl.b32 	%r14, %r5, 7;
+	or.b32 	%r15, %r13, %r14;
+	or.b32 	%r16, %r15, %r7;
+	cvt.u64.u32 	%rd1, %r16;
+	mov.b32 	%r32, 0f00000000;
+	mov.b64 	%rd8, -4;
+$L__BB0_1:                              // =>This Inner Loop Header: Depth=1
+	.loc	1 38 34                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:34
+	add.s64 	%rd6, %rd1, %rd8;
+	cvt.u32.u64 	%r17, %rd6;
+	add.s32 	%r18, %r17, 4;
+	mad.wide.s32 	%rd5, %r18, 2, %rd2;
+	.loc	1 38 61                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:61
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd4, 1.0;
+	// end inline asm
+	mov.b16 	%rs2, 0;
+	mov.pred 	%p1, -1;
+	// begin inline asm
+	mov.u16 %rs1, %rs2;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs1 }, [ %rd5 + 0 ], %rd4;
+	// end inline asm
+	.loc	1 38 115                        // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115
+	cvt.f32.bf16 	%r19, %rs1;
+	.loc	1 42 23                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:42:23
+	fma.rn.f32 	%r32, %r19, %r19, %r32;
+	.loc	1 32 43                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:32:43
+	add.s64 	%rd8, %rd8, 4;
+	setp.lt.u64 	%p2, %rd8, 124;
+	@%p2 bra 	$L__BB0_1;
+// %bb.2:
+	.loc	1 24 44                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:44
+	and.b32 	%r21, %r2, 63;
+	.loc	1 24 23                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:23
+	or.b32 	%r22, %r1, %r21;
+$L__tmp1:
+	.loc	2 293 36                        // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ]
+	shfl.sync.bfly.b32 	%r23, %r32, 2, 31, -1;
+$L__tmp2:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	add.f32 	%r24, %r32, %r23;
+$L__tmp3:
+	.loc	2 293 36                        // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ]
+	shfl.sync.bfly.b32 	%r25, %r24, 1, 31, -1;
+$L__tmp4:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	add.f32 	%r26, %r24, %r25;
+$L__tmp5:
+	.loc	1 44 28                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:28
+	mov.b32 	%r27, global_smem;
+	add.s32 	%r28, %r27, %r3;
+	st.shared.b32 	[%r28], %r26;
+	bar.sync 	0;
+	shl.b32 	%r29, %r21, 2;
+	add.s32 	%r30, %r27, %r29;
+	ld.shared.b32 	%r20, [%r30];
+	.loc	1 45 25                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:25
+	mad.wide.s32 	%rd7, %r22, 4, %rd3;
+	.loc	1 45 36                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:36
+	and.b32 	%r31, %r2, 192;
+	setp.eq.b32 	%p3, %r31, 0;
+	// begin inline asm
+	@%p3 st.global.b32 [ %rd7 + 0 ], { %r20 };
+	// end inline asm
+	.loc	1 45 4                          // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:4
+	ret;
+$L__tmp6:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 5                                   // DW_FORM_data2
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 339                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x14c DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 119
+.b8 118
+.b8 121
+.b8 116
+.b8 52
+.b8 50
+.b8 55
+.b8 51
+.b8 105
+.b8 117
+.b8 51
+.b8 51
+.b8 109
+.b8 112
+.b8 101
+.b8 101
+.b8 55
+.b8 104
+.b8 98
+.b8 101
+.b8 116
+.b8 53
+.b8 106
+.b8 53
+.b8 101
+.b8 113
+.b8 52
+.b8 52
+.b8 100
+.b8 54
+.b8 102
+.b8 115
+.b8 104
+.b8 103
+.b8 119
+.b8 107
+.b8 121
+.b8 120
+.b8 107
+.b8 110
+.b8 53
+.b8 50
+.b8 103
+.b8 103
+.b8 103
+.b8 107
+.b8 105
+.b8 113
+.b8 104
+.b8 106
+.b8 53
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 119
+.b8 118
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x2a DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 114
+.b8 109
+.b8 115
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 118
+.b8 105
+.b8 101
+.b8 119
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x10e:0x48 DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x123:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp5                           // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 44                                  // DW_AT_call_line
+.b8 25                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x13b:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp2                           // DW_AT_low_pc
+.b64 $L__tmp5                           // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.source b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..de27e59bc4bfffa90557e9fdd59ec98ceafc4977
--- /dev/null
+++ b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.source
@@ -0,0 +1,167 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0)
+#loc35 = loc(unknown)
+#loc38 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0)
+#loc42 = loc("in_ptr0"(#loc))
+#loc43 = loc("out_ptr0"(#loc))
+#loc44 = loc("xnumel"(#loc))
+#loc45 = loc("r0_numel"(#loc))
+#loc74 = loc("input"(#loc33))
+#loc75 = loc("a"(#loc38))
+#loc76 = loc("b"(#loc38))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 8192 : i32 loc(#loc46)
+    %r0_numel_1 = arith.constant 128 : i32 loc(#loc47)
+    %xoffset = tt.get_program_id x : i32 loc(#loc48)
+    %xoffset_2 = arith.constant 64 : i32 loc(#loc49)
+    %xoffset_3 = arith.constant 64 : i32 loc(#loc49)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc49)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc50)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc51)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc52)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc52)
+    %xmask = arith.constant true loc(#loc53)
+    %xmask_8 = arith.constant dense<true> : tensor<64x4xi1> loc(#loc53)
+    %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc54)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc55)
+    %x0 = arith.constant 32 : i32 loc(#loc56)
+    %x0_10 = arith.constant 32 : i32 loc(#loc56)
+    %x0_11 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc56)
+    %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc56)
+    %x1 = arith.constant 32 : i32 loc(#loc57)
+    %x1_13 = arith.constant 32 : i32 loc(#loc57)
+    %x1_14 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc57)
+    %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc57)
+    %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc58)
+    %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc58)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc14)
+    %c4_i32 = arith.constant 4 : i32 loc(#loc14)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14)
+    %2 = arith.bitcast %c4_i32 : i32 to i32 loc(#loc14)
+    %3 = ub.poison : i32 loc(#loc14)
+    %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_19 = %_tmp4_16) -> (tensor<64x4xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc60)
+      %r0_index_20 = arith.addi %r0_index, %r0_base_9 : tensor<1x4xi32> loc(#loc60)
+      %r0_mask = arith.constant dense<128> : tensor<1x4xi32> loc(#loc61)
+      %r0_mask_21 = arith.cmpi slt, %r0_index_20, %r0_mask : tensor<1x4xi32> loc(#loc61)
+      %tmp0 = arith.constant 128 : i32 loc(#loc62)
+      %tmp0_22 = arith.constant 128 : i32 loc(#loc62)
+      %tmp0_23 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc62)
+      %tmp0_24 = arith.muli %tmp0_23, %x0_12 : tensor<64x1xi32> loc(#loc62)
+      %tmp0_25 = tt.broadcast %r0_index_20 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc63)
+      %tmp0_26 = tt.broadcast %tmp0_24 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc63)
+      %tmp0_27 = arith.addi %tmp0_25, %tmp0_26 : tensor<64x4xi32> loc(#loc63)
+      %tmp0_28 = arith.constant 12288 : i32 loc(#loc64)
+      %tmp0_29 = arith.constant 12288 : i32 loc(#loc64)
+      %tmp0_30 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc64)
+      %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<64x1xi32> loc(#loc64)
+      %tmp0_32 = tt.broadcast %tmp0_31 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc65)
+      %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<64x4xi32> loc(#loc65)
+      %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc66)
+      %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc66)
+      %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc67)
+      %tmp0_37 = tt.broadcast %r0_mask_21 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc67)
+      %tmp0_38 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc67)
+      %tmp0_39 = arith.truncf %tmp0_38 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc67)
+      %tmp0_40 = tt.load %tmp0_35, %tmp0_37, %tmp0_39 evictionPolicy = evict_first : tensor<64x4x!tt.ptr<bf16>> loc(#loc67)
+      %tmp0_41 = arith.extf %tmp0_40 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc68)
+      %tmp2 = arith.mulf %tmp0_41, %tmp0_41 : tensor<64x4xf32> loc(#loc69)
+      %tmp5 = arith.addf %_tmp4_19, %tmp2 : tensor<64x4xf32> loc(#loc70)
+      %_tmp4_42 = tt.broadcast %r0_mask_21 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc71)
+      %_tmp4_43 = arith.select %_tmp4_42, %tmp5, %_tmp4_19 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc71)
+      scf.yield %_tmp4_43 : tensor<64x4xf32> loc(#loc27)
+    } loc(#loc59)
+    %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc72)
+    %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc73)
+    %4 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc30)
+    %5 = tt.addptr %4, %xindex_7 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc30)
+    tt.store %5, %tmp4_18 : tensor<64x1x!tt.ptr<f32>> loc(#loc31)
+    tt.return loc(#loc32)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.sum__fp32S64_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x4xf32> loc("input"(#loc33))) -> tensor<64xf32> attributes {noinline = false} {
+    %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc34)
+      tt.reduce.return %2 : f32 loc(#loc34)
+    }) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc34)
+    tt.return %0 : tensor<64xf32> loc(#loc36)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<64xf32> loc(#loc37)
+    tt.return %1 : tensor<64xf32> loc(#loc37)
+  } loc(#loc33)
+  tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc38)), %b: f32 loc("b"(#loc38))) -> f32 attributes {noinline = false} {
+    %0 = arith.addf %a, %b : f32 loc(#loc39)
+    tt.return %0 : f32 loc(#loc40)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : f32 loc(#loc41)
+    tt.return %1 : f32 loc(#loc41)
+  } loc(#loc38)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":25:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":30:43)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:8)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11)
+#loc37 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4)
+#loc39 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc40 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11)
+#loc41 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4)
+#loc46 = loc("xnumel"(#loc1))
+#loc47 = loc("r0_numel"(#loc2))
+#loc48 = loc("xoffset"(#loc3))
+#loc49 = loc("xoffset"(#loc4))
+#loc50 = loc("xindex"(#loc5))
+#loc51 = loc("xindex"(#loc6))
+#loc52 = loc("xindex"(#loc7))
+#loc53 = loc("xmask"(#loc8))
+#loc54 = loc("r0_base"(#loc9))
+#loc55 = loc("r0_base"(#loc10))
+#loc56 = loc("x0"(#loc11))
+#loc57 = loc("x1"(#loc12))
+#loc58 = loc("_tmp4"(#loc13))
+#loc59 = loc("_tmp4"(#loc14))
+#loc60 = loc("r0_index"(#loc15))
+#loc61 = loc("r0_mask"(#loc16))
+#loc62 = loc("tmp0"(#loc17))
+#loc63 = loc("tmp0"(#loc18))
+#loc64 = loc("tmp0"(#loc19))
+#loc65 = loc("tmp0"(#loc20))
+#loc66 = loc("tmp0"(#loc21))
+#loc67 = loc("tmp0"(#loc22))
+#loc68 = loc("tmp0"(#loc23))
+#loc69 = loc("tmp2"(#loc24))
+#loc70 = loc("tmp5"(#loc25))
+#loc71 = loc("_tmp4"(#loc26))
+#loc72 = loc("tmp4"(#loc28))
+#loc73 = loc("tmp4"(#loc29))
diff --git a/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.ttgir b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..7ba4e37239c9f283234461e18806a84a05f532c6
--- /dev/null
+++ b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.ttgir
@@ -0,0 +1,121 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 4], order = [0, 1]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0)
+#loc1 = loc(unknown)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25)
+#loc30 = loc("in_ptr0"(#loc))
+#loc31 = loc("out_ptr0"(#loc))
+#loc32 = loc("xnumel"(#loc))
+#loc33 = loc("r0_numel"(#loc))
+#loc54 = loc("tmp4"(#loc24))
+#loc57 = loc(callsite(#loc1 at #loc54))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<12288> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %c4_i32 = arith.constant 4 : i32 loc(#loc1)
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x4xbf16, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<128> : tensor<1x4xi32, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc34)
+    %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc35)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc36)
+    %xindex_6 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc36)
+    %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc36)
+    %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc36)
+    %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked> loc(#loc37)
+    %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc37)
+    %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<64x1xi32, #blocked> loc(#loc37)
+    %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<64x1xi32, #blocked1> loc(#loc37)
+    %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc38)
+    %r0_base_13 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4xi32, #blocked> loc(#loc38)
+    %x0 = arith.remsi %xindex_11, %cst : tensor<64x1xi32, #blocked> loc(#loc39)
+    %x1 = arith.divsi %xindex_11, %cst : tensor<64x1xi32, #blocked> loc(#loc40)
+    %tmp0 = arith.muli %x0, %cst_0 : tensor<64x1xi32, #blocked> loc(#loc41)
+    %tmp0_14 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc42)
+    %tmp0_15 = arith.muli %x1, %cst_1 : tensor<64x1xi32, #blocked> loc(#loc43)
+    %tmp0_16 = tt.broadcast %tmp0_15 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc44)
+    %tmp0_17 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>, #blocked> loc(#loc45)
+    %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32 iter_args(%_tmp4_20 = %cst_4) -> (tensor<64x4xf32, #blocked>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32, #blocked> loc(#loc47)
+      %r0_index_21 = arith.addi %r0_index, %r0_base_13 : tensor<1x4xi32, #blocked> loc(#loc47)
+      %r0_mask = arith.cmpi slt, %r0_index_21, %cst_3 : tensor<1x4xi32, #blocked> loc(#loc48)
+      %tmp0_22 = tt.broadcast %r0_index_21 : tensor<1x4xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc42)
+      %tmp0_23 = arith.addi %tmp0_22, %tmp0_14 : tensor<64x4xi32, #blocked> loc(#loc42)
+      %tmp0_24 = arith.addi %tmp0_23, %tmp0_16 : tensor<64x4xi32, #blocked> loc(#loc44)
+      %tmp0_25 = tt.addptr %tmp0_17, %tmp0_24 : tensor<64x4x!tt.ptr<bf16>, #blocked>, tensor<64x4xi32, #blocked> loc(#loc45)
+      %tmp0_26 = tt.broadcast %r0_mask : tensor<1x4xi1, #blocked> -> tensor<64x4xi1, #blocked> loc(#loc49)
+      %tmp0_27 = tt.load %tmp0_25, %tmp0_26, %cst_2 evictionPolicy = evict_first : tensor<64x4x!tt.ptr<bf16>, #blocked> loc(#loc49)
+      %tmp0_28 = arith.extf %tmp0_27 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc50)
+      %tmp2 = arith.mulf %tmp0_28, %tmp0_28 : tensor<64x4xf32, #blocked> loc(#loc51)
+      %tmp5 = arith.addf %_tmp4_20, %tmp2 : tensor<64x4xf32, #blocked> loc(#loc52)
+      %_tmp4_29 = arith.select %tmp0_26, %tmp5, %_tmp4_20 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> loc(#loc53)
+      scf.yield %_tmp4_29 : tensor<64x4xf32, #blocked> loc(#loc22)
+    } loc(#loc46)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_20: f32 loc(callsite(#loc1 at #loc54)), %tmp4_21: f32 loc(callsite(#loc1 at #loc54))):
+      %tmp4_22 = arith.addf %tmp4_20, %tmp4_21 : f32 loc(#loc58)
+      tt.reduce.return %tmp4_22 : f32 loc(#loc56)
+    }) : (tensor<64x4xf32, #blocked>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc56)
+    %tmp4_18 = ttg.convert_layout %tmp4 : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc55)
+    %tmp4_19 = tt.expand_dims %tmp4_18 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc55)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>, #blocked1> loc(#loc27)
+    %1 = tt.addptr %0, %xindex_12 : tensor<64x1x!tt.ptr<f32>, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc27)
+    tt.store %1, %tmp4_19 : tensor<64x1x!tt.ptr<f32>, #blocked1> loc(#loc28)
+    tt.return loc(#loc29)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:8)
+#loc23 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4)
+#loc34 = loc("xoffset"(#loc2))
+#loc35 = loc("xoffset"(#loc3))
+#loc36 = loc("xindex"(#loc4))
+#loc37 = loc("xindex"(#loc5))
+#loc38 = loc("r0_base"(#loc6))
+#loc39 = loc("x0"(#loc7))
+#loc40 = loc("x1"(#loc8))
+#loc41 = loc("tmp0"(#loc9))
+#loc42 = loc("tmp0"(#loc10))
+#loc43 = loc("tmp0"(#loc11))
+#loc44 = loc("tmp0"(#loc12))
+#loc45 = loc("tmp0"(#loc13))
+#loc46 = loc("_tmp4"(#loc14))
+#loc47 = loc("r0_index"(#loc15))
+#loc48 = loc("r0_mask"(#loc16))
+#loc49 = loc("tmp0"(#loc17))
+#loc50 = loc("tmp0"(#loc18))
+#loc51 = loc("tmp2"(#loc19))
+#loc52 = loc("tmp5"(#loc20))
+#loc53 = loc("_tmp4"(#loc21))
+#loc55 = loc("tmp4"(#loc26))
+#loc56 = loc(callsite(#loc23 at #loc54))
+#loc58 = loc(callsite(#loc25 at #loc56))
diff --git a/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.ttir b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..a343a2f6287aeba89f8550cdfcb3f36d7afc9a69
--- /dev/null
+++ b/triton/ENXFAAYKQI5QASIWI4F5B2IT6OP7L6V6KYE5EI7JD6K7N5WDNP5Q/triton_red_fused__fused_rms_norm_view_0.ttir
@@ -0,0 +1,118 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0)
+#loc1 = loc(unknown)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25)
+#loc32 = loc("in_ptr0"(#loc))
+#loc33 = loc("out_ptr0"(#loc))
+#loc34 = loc("xnumel"(#loc))
+#loc35 = loc("r0_numel"(#loc))
+#loc58 = loc("tmp4"(#loc26))
+#loc61 = loc(callsite(#loc1 at #loc58))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<64x4xbf16> loc(#loc1)
+    %c4_i32 = arith.constant 4 : i32 loc(#loc2)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc2)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc2)
+    %cst_0 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc1)
+    %cst_1 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1)
+    %cst_2 = arith.constant dense<128> : tensor<1x4xi32> loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc36)
+    %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc37)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc38)
+    %xindex_6 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc39)
+    %xindex_7 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32> loc(#loc40)
+    %xindex_8 = arith.addi %xindex_7, %xindex_6 : tensor<64x1xi32> loc(#loc40)
+    %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc41)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc42)
+    %x0 = arith.remsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc43)
+    %x1 = arith.divsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc44)
+    %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32 iter_args(%_tmp4_11 = %cst_3) -> (tensor<64x4xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc46)
+      %r0_index_12 = arith.addi %r0_index, %r0_base_9 : tensor<1x4xi32> loc(#loc46)
+      %r0_mask = arith.cmpi slt, %r0_index_12, %cst_2 : tensor<1x4xi32> loc(#loc47)
+      %tmp0 = arith.muli %x0, %cst_1 : tensor<64x1xi32> loc(#loc48)
+      %tmp0_13 = tt.broadcast %r0_index_12 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc49)
+      %tmp0_14 = tt.broadcast %tmp0 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc49)
+      %tmp0_15 = arith.addi %tmp0_13, %tmp0_14 : tensor<64x4xi32> loc(#loc49)
+      %tmp0_16 = arith.muli %x1, %cst_0 : tensor<64x1xi32> loc(#loc50)
+      %tmp0_17 = tt.broadcast %tmp0_16 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc51)
+      %tmp0_18 = arith.addi %tmp0_15, %tmp0_17 : tensor<64x4xi32> loc(#loc51)
+      %tmp0_19 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc52)
+      %tmp0_20 = tt.addptr %tmp0_19, %tmp0_18 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc52)
+      %tmp0_21 = tt.broadcast %r0_mask : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc53)
+      %tmp0_22 = tt.load %tmp0_20, %tmp0_21, %cst evictionPolicy = evict_first : tensor<64x4x!tt.ptr<bf16>> loc(#loc53)
+      %tmp0_23 = arith.extf %tmp0_22 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc54)
+      %tmp2 = arith.mulf %tmp0_23, %tmp0_23 : tensor<64x4xf32> loc(#loc55)
+      %tmp5 = arith.addf %_tmp4_11, %tmp2 : tensor<64x4xf32> loc(#loc56)
+      %_tmp4_24 = arith.select %tmp0_21, %tmp5, %_tmp4_11 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc57)
+      scf.yield %_tmp4_24 : tensor<64x4xf32> loc(#loc24)
+    } loc(#loc45)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_11: f32 loc(callsite(#loc1 at #loc58)), %tmp4_12: f32 loc(callsite(#loc1 at #loc58))):
+      %tmp4_13 = arith.addf %tmp4_11, %tmp4_12 : f32 loc(#loc62)
+      tt.reduce.return %tmp4_13 : f32 loc(#loc60)
+    }) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc60)
+    %tmp4_10 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc59)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc29)
+    %1 = tt.addptr %0, %xindex_8 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc29)
+    tt.store %1, %tmp4_10 : tensor<64x1x!tt.ptr<f32>> loc(#loc30)
+    tt.return loc(#loc31)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":32:43)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:27)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":33:31)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:8)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc27 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4)
+#loc36 = loc("xoffset"(#loc3))
+#loc37 = loc("xoffset"(#loc4))
+#loc38 = loc("xindex"(#loc5))
+#loc39 = loc("xindex"(#loc6))
+#loc40 = loc("xindex"(#loc7))
+#loc41 = loc("r0_base"(#loc8))
+#loc42 = loc("r0_base"(#loc9))
+#loc43 = loc("x0"(#loc10))
+#loc44 = loc("x1"(#loc11))
+#loc45 = loc("_tmp4"(#loc2))
+#loc46 = loc("r0_index"(#loc12))
+#loc47 = loc("r0_mask"(#loc13))
+#loc48 = loc("tmp0"(#loc14))
+#loc49 = loc("tmp0"(#loc15))
+#loc50 = loc("tmp0"(#loc16))
+#loc51 = loc("tmp0"(#loc17))
+#loc52 = loc("tmp0"(#loc18))
+#loc53 = loc("tmp0"(#loc19))
+#loc54 = loc("tmp0"(#loc20))
+#loc55 = loc("tmp2"(#loc21))
+#loc56 = loc("tmp5"(#loc22))
+#loc57 = loc("_tmp4"(#loc23))
+#loc59 = loc("tmp4"(#loc28))
+#loc60 = loc(callsite(#loc25 at #loc58))
+#loc62 = loc(callsite(#loc27 at #loc60))
diff --git a/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/__grp__triton_poi_fused_add_mul_1.json b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/__grp__triton_poi_fused_add_mul_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..5a6973da0a4ac172ff5cac31106b99eb6547e17f
--- /dev/null
+++ b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/__grp__triton_poi_fused_add_mul_1.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused_add_mul_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.source", "triton_poi_fused_add_mul_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.ttir", "triton_poi_fused_add_mul_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.ttgir", "triton_poi_fused_add_mul_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.llir", "triton_poi_fused_add_mul_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.ptx", "triton_poi_fused_add_mul_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.cubin", "triton_poi_fused_add_mul_1.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.json"}}
\ No newline at end of file
diff --git a/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.cubin b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..b89626598a92bf1ac9449550d720ba4b5e158bef
Binary files /dev/null and b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.cubin differ
diff --git a/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.json b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..b4c6a19a5557b2edaccdedec3a69e4675c040cd3
--- /dev/null
+++ b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.json
@@ -0,0 +1 @@
+{"hash": "241c40e46f60eb2bf8892ac2d563a3a29372ae52e48bc84330a37c0aed2b3a93", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_add_mul_1"}
\ No newline at end of file
diff --git a/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.llir b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.llir
new file mode 100644
index 0000000000000000000000000000000000000000..1ba45455eecc075febee7d7d53941a965027b76b
--- /dev/null
+++ b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.llir
@@ -0,0 +1,118 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused_add_mul_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 {
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %9 = shl i32 %8, 10, !dbg !8
+  %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %11 = shl nuw nsw i32 %10, 3, !dbg !9
+  %12 = and i32 %11, 1016, !dbg !9
+  %13 = or disjoint i32 %12, %9, !dbg !10
+  %14 = srem i32 %13, 4096, !dbg !11
+  %15 = sext i32 %13 to i64, !dbg !12
+  %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !12
+  %17 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %16) #2, !dbg !13
+  %18 = extractvalue { i32, i32, i32, i32 } %17, 0, !dbg !13
+  %19 = bitcast i32 %18 to <2 x bfloat>, !dbg !13
+  %20 = extractvalue { i32, i32, i32, i32 } %17, 1, !dbg !13
+  %21 = bitcast i32 %20 to <2 x bfloat>, !dbg !13
+  %22 = extractvalue { i32, i32, i32, i32 } %17, 2, !dbg !13
+  %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !13
+  %24 = extractvalue { i32, i32, i32, i32 } %17, 3, !dbg !13
+  %25 = bitcast i32 %24 to <2 x bfloat>, !dbg !13
+  %26 = sext i32 %14 to i64, !dbg !14
+  %27 = getelementptr bfloat, ptr addrspace(1) %1, i64 %26, !dbg !14
+  %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !15
+  %29 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ], $5;", "=r,=r,=r,=r,l,l"(ptr addrspace(1) %27, i64 %28) #2, !dbg !15
+  %30 = extractvalue { i32, i32, i32, i32 } %29, 0, !dbg !15
+  %31 = bitcast i32 %30 to <2 x bfloat>, !dbg !15
+  %32 = extractvalue { i32, i32, i32, i32 } %29, 1, !dbg !15
+  %33 = bitcast i32 %32 to <2 x bfloat>, !dbg !15
+  %34 = extractvalue { i32, i32, i32, i32 } %29, 2, !dbg !15
+  %35 = bitcast i32 %34 to <2 x bfloat>, !dbg !15
+  %36 = extractvalue { i32, i32, i32, i32 } %29, 3, !dbg !15
+  %37 = bitcast i32 %36 to <2 x bfloat>, !dbg !15
+  %38 = getelementptr bfloat, ptr addrspace(1) %2, i64 %15, !dbg !16
+  %39 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %38) #2, !dbg !17
+  %40 = extractvalue { i32, i32, i32, i32 } %39, 0, !dbg !17
+  %41 = bitcast i32 %40 to <2 x bfloat>, !dbg !17
+  %42 = extractvalue { i32, i32, i32, i32 } %39, 1, !dbg !17
+  %43 = bitcast i32 %42 to <2 x bfloat>, !dbg !17
+  %44 = extractvalue { i32, i32, i32, i32 } %39, 2, !dbg !17
+  %45 = bitcast i32 %44 to <2 x bfloat>, !dbg !17
+  %46 = extractvalue { i32, i32, i32, i32 } %39, 3, !dbg !17
+  %47 = bitcast i32 %46 to <2 x bfloat>, !dbg !17
+  %48 = getelementptr bfloat, ptr addrspace(1) %3, i64 %15, !dbg !18
+  %49 = fpext <2 x bfloat> %19 to <2 x float>, !dbg !19
+  %50 = fpext <2 x bfloat> %31 to <2 x float>, !dbg !20
+  %51 = fpext <2 x bfloat> %41 to <2 x float>, !dbg !21
+  %52 = fmul <2 x float> %50, %51, !dbg !22
+  %53 = fadd <2 x float> %52, %49, !dbg !23
+  %54 = fptrunc <2 x float> %53 to <2 x bfloat>, !dbg !24
+  %55 = fpext <2 x bfloat> %21 to <2 x float>, !dbg !19
+  %56 = fpext <2 x bfloat> %33 to <2 x float>, !dbg !20
+  %57 = fpext <2 x bfloat> %43 to <2 x float>, !dbg !21
+  %58 = fmul <2 x float> %56, %57, !dbg !22
+  %59 = fadd <2 x float> %58, %55, !dbg !23
+  %60 = fptrunc <2 x float> %59 to <2 x bfloat>, !dbg !24
+  %61 = fpext <2 x bfloat> %23 to <2 x float>, !dbg !19
+  %62 = fpext <2 x bfloat> %35 to <2 x float>, !dbg !20
+  %63 = fpext <2 x bfloat> %45 to <2 x float>, !dbg !21
+  %64 = fmul <2 x float> %62, %63, !dbg !22
+  %65 = fadd <2 x float> %64, %61, !dbg !23
+  %66 = fptrunc <2 x float> %65 to <2 x bfloat>, !dbg !24
+  %67 = fpext <2 x bfloat> %25 to <2 x float>, !dbg !19
+  %68 = fpext <2 x bfloat> %37 to <2 x float>, !dbg !20
+  %69 = fpext <2 x bfloat> %47 to <2 x float>, !dbg !21
+  %70 = fmul <2 x float> %68, %69, !dbg !22
+  %71 = fadd <2 x float> %70, %67, !dbg !23
+  %72 = fptrunc <2 x float> %71 to <2 x bfloat>, !dbg !24
+  %73 = bitcast <2 x bfloat> %54 to i32, !dbg !24
+  %74 = bitcast <2 x bfloat> %60 to i32, !dbg !24
+  %75 = bitcast <2 x bfloat> %66 to i32, !dbg !24
+  %76 = bitcast <2 x bfloat> %72 to i32, !dbg !24
+  tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %73, i32 %74, i32 %75, i32 %76, ptr addrspace(1) %48) #2, !dbg !24
+  ret void, !dbg !25
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused_add_mul_1", linkageName: "triton_poi_fused_add_mul_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 20, column: 28, scope: !4)
+!8 = !DILocation(line: 20, column: 33, scope: !4)
+!9 = !DILocation(line: 21, column: 36, scope: !4)
+!10 = !DILocation(line: 21, column: 23, scope: !4)
+!11 = !DILocation(line: 24, column: 19, scope: !4)
+!12 = !DILocation(line: 25, column: 30, scope: !4)
+!13 = !DILocation(line: 25, column: 35, scope: !4)
+!14 = !DILocation(line: 26, column: 30, scope: !4)
+!15 = !DILocation(line: 26, column: 35, scope: !4)
+!16 = !DILocation(line: 27, column: 30, scope: !4)
+!17 = !DILocation(line: 27, column: 35, scope: !4)
+!18 = !DILocation(line: 30, column: 25, scope: !4)
+!19 = !DILocation(line: 25, column: 44, scope: !4)
+!20 = !DILocation(line: 26, column: 74, scope: !4)
+!21 = !DILocation(line: 27, column: 44, scope: !4)
+!22 = !DILocation(line: 28, column: 18, scope: !4)
+!23 = !DILocation(line: 29, column: 18, scope: !4)
+!24 = !DILocation(line: 30, column: 36, scope: !4)
+!25 = !DILocation(line: 30, column: 4, scope: !4)
diff --git a/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.ptx b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..0223689fa51d9eb5bd49ef8290f88509b10135ce
--- /dev/null
+++ b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.ptx
@@ -0,0 +1,407 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused_add_mul_1 // -- Begin function triton_poi_fused_add_mul_1
+                                        // @triton_poi_fused_add_mul_1
+.visible .entry triton_poi_fused_add_mul_1(
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_1_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_1_param_1,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_1_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_1_param_3,
+	.param .u32 triton_poi_fused_add_mul_1_param_4,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_1_param_5,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_1_param_6
+)
+.reqntid 128
+{
+	.reg .b16 	%rs<25>;
+	.reg .b32 	%r<60>;
+	.reg .b64 	%rd<11>;
+	.loc	1 18 0                          // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd6, [triton_poi_fused_add_mul_1_param_0];
+	ld.param.b64 	%rd7, [triton_poi_fused_add_mul_1_param_1];
+$L__tmp0:
+	.loc	1 20 28                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:20:28
+	mov.u32 	%r17, %ctaid.x;
+	.loc	1 20 33                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:20:33
+	shl.b32 	%r18, %r17, 10;
+	ld.param.b64 	%rd8, [triton_poi_fused_add_mul_1_param_2];
+	ld.param.b64 	%rd9, [triton_poi_fused_add_mul_1_param_3];
+	.loc	1 21 36                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:21:36
+	mov.u32 	%r19, %tid.x;
+	shl.b32 	%r20, %r19, 3;
+	and.b32 	%r21, %r20, 1016;
+	.loc	1 21 23                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:21:23
+	or.b32 	%r22, %r21, %r18;
+	.loc	1 24 19                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:24:19
+	bfe.s32 	%r23, %r17, 21, 1;
+	shr.u32 	%r24, %r23, 20;
+	add.s32 	%r25, %r22, %r24;
+	and.b32 	%r26, %r25, -4096;
+	sub.s32 	%r27, %r22, %r26;
+	.loc	1 25 30                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:25:30
+	mul.wide.s32 	%rd10, %r22, 2;
+	add.s64 	%rd1, %rd6, %rd10;
+	.loc	1 25 35                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:25:35
+	// begin inline asm
+	mov.u32 %r1, 0x0;
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	mov.u32 %r4, 0x0;
+	ld.global.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ];
+	// end inline asm
+	.loc	1 26 30                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:26:30
+	mad.wide.s32 	%rd2, %r27, 2, %rd7;
+	.loc	1 26 35                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:26:35
+	// begin inline asm
+	mov.u64 %rd3, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd3, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r5, 0x0;
+	mov.u32 %r6, 0x0;
+	mov.u32 %r7, 0x0;
+	mov.u32 %r8, 0x0;
+	ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r5, %r6, %r7, %r8 }, [ %rd2 + 0 ], %rd3;
+	// end inline asm
+	.loc	1 27 30                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:27:30
+	add.s64 	%rd4, %rd8, %rd10;
+	.loc	1 27 35                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:27:35
+	// begin inline asm
+	mov.u32 %r9, 0x0;
+	mov.u32 %r10, 0x0;
+	mov.u32 %r11, 0x0;
+	mov.u32 %r12, 0x0;
+	ld.global.v4.b32 { %r9, %r10, %r11, %r12 }, [ %rd4 + 0 ];
+	// end inline asm
+	.loc	1 30 25                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:30:25
+	add.s64 	%rd5, %rd9, %rd10;
+	.loc	1 25 44                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:25:44
+	mov.b32 	{%rs1, %rs2}, %r1;
+	cvt.f32.bf16 	%r28, %rs2;
+	cvt.f32.bf16 	%r29, %rs1;
+	.loc	1 26 74                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:26:74
+	mov.b32 	{%rs3, %rs4}, %r5;
+	cvt.f32.bf16 	%r30, %rs4;
+	cvt.f32.bf16 	%r31, %rs3;
+	.loc	1 27 44                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:27:44
+	mov.b32 	{%rs5, %rs6}, %r9;
+	cvt.f32.bf16 	%r32, %rs6;
+	cvt.f32.bf16 	%r33, %rs5;
+	.loc	1 29 18                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:29:18
+	fma.rn.f32 	%r34, %r31, %r33, %r29;
+	fma.rn.f32 	%r35, %r30, %r32, %r28;
+	.loc	1 30 36                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:30:36
+	cvt.rn.bf16x2.f32 	%r13, %r35, %r34;
+	.loc	1 25 44                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:25:44
+	mov.b32 	{%rs7, %rs8}, %r2;
+	cvt.f32.bf16 	%r36, %rs8;
+	cvt.f32.bf16 	%r37, %rs7;
+	.loc	1 26 74                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:26:74
+	mov.b32 	{%rs9, %rs10}, %r6;
+	cvt.f32.bf16 	%r38, %rs10;
+	cvt.f32.bf16 	%r39, %rs9;
+	.loc	1 27 44                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:27:44
+	mov.b32 	{%rs11, %rs12}, %r10;
+	cvt.f32.bf16 	%r40, %rs12;
+	cvt.f32.bf16 	%r41, %rs11;
+	.loc	1 29 18                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:29:18
+	fma.rn.f32 	%r42, %r39, %r41, %r37;
+	fma.rn.f32 	%r43, %r38, %r40, %r36;
+	.loc	1 30 36                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:30:36
+	cvt.rn.bf16x2.f32 	%r14, %r43, %r42;
+	.loc	1 25 44                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:25:44
+	mov.b32 	{%rs13, %rs14}, %r3;
+	cvt.f32.bf16 	%r44, %rs14;
+	cvt.f32.bf16 	%r45, %rs13;
+	.loc	1 26 74                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:26:74
+	mov.b32 	{%rs15, %rs16}, %r7;
+	cvt.f32.bf16 	%r46, %rs16;
+	cvt.f32.bf16 	%r47, %rs15;
+	.loc	1 27 44                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:27:44
+	mov.b32 	{%rs17, %rs18}, %r11;
+	cvt.f32.bf16 	%r48, %rs18;
+	cvt.f32.bf16 	%r49, %rs17;
+	.loc	1 29 18                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:29:18
+	fma.rn.f32 	%r50, %r47, %r49, %r45;
+	fma.rn.f32 	%r51, %r46, %r48, %r44;
+	.loc	1 30 36                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:30:36
+	cvt.rn.bf16x2.f32 	%r15, %r51, %r50;
+	.loc	1 25 44                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:25:44
+	mov.b32 	{%rs19, %rs20}, %r4;
+	cvt.f32.bf16 	%r52, %rs20;
+	cvt.f32.bf16 	%r53, %rs19;
+	.loc	1 26 74                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:26:74
+	mov.b32 	{%rs21, %rs22}, %r8;
+	cvt.f32.bf16 	%r54, %rs22;
+	cvt.f32.bf16 	%r55, %rs21;
+	.loc	1 27 44                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:27:44
+	mov.b32 	{%rs23, %rs24}, %r12;
+	cvt.f32.bf16 	%r56, %rs24;
+	cvt.f32.bf16 	%r57, %rs23;
+	.loc	1 29 18                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:29:18
+	fma.rn.f32 	%r58, %r55, %r57, %r53;
+	fma.rn.f32 	%r59, %r54, %r56, %r52;
+	.loc	1 30 36                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:30:36
+	cvt.rn.bf16x2.f32 	%r16, %r59, %r58;
+	// begin inline asm
+	st.global.v4.b32 [ %rd5 + 0 ], { %r13, %r14, %r15, %r16 };
+	// end inline asm
+	.loc	1 30 4                          // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:30:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 55
+.b8 102
+.b8 102
+.b8 52
+.b8 105
+.b8 98
+.b8 54
+.b8 54
+.b8 53
+.b8 50
+.b8 111
+.b8 106
+.b8 108
+.b8 108
+.b8 117
+.b8 116
+.b8 109
+.b8 52
+.b8 99
+.b8 55
+.b8 109
+.b8 107
+.b8 122
+.b8 122
+.b8 112
+.b8 121
+.b8 98
+.b8 111
+.b8 110
+.b8 100
+.b8 51
+.b8 112
+.b8 97
+.b8 103
+.b8 117
+.b8 51
+.b8 103
+.b8 108
+.b8 115
+.b8 112
+.b8 119
+.b8 51
+.b8 115
+.b8 122
+.b8 116
+.b8 107
+.b8 102
+.b8 101
+.b8 50
+.b8 122
+.b8 97
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 55
+.b8 102
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.source b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.source
new file mode 100644
index 0000000000000000000000000000000000000000..9875c91d6e947ff61a2cfb4412b5ecff5ba79f09
--- /dev/null
+++ b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.source
@@ -0,0 +1,82 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":18:0)
+#loc22 = loc("in_ptr0"(#loc))
+#loc23 = loc("in_ptr1"(#loc))
+#loc24 = loc("in_ptr2"(#loc))
+#loc25 = loc("out_ptr0"(#loc))
+#loc26 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_add_mul_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 8388608 : i32 loc(#loc27)
+    %xoffset = tt.get_program_id x : i32 loc(#loc28)
+    %xoffset_1 = arith.constant 1024 : i32 loc(#loc29)
+    %xoffset_2 = arith.constant 1024 : i32 loc(#loc29)
+    %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc29)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc30)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc31)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc31)
+    %xmask = arith.constant true loc(#loc32)
+    %xmask_6 = arith.constant dense<true> : tensor<1024xi1> loc(#loc32)
+    %x0 = arith.constant 4096 : i32 loc(#loc33)
+    %x0_7 = arith.constant 4096 : i32 loc(#loc33)
+    %x0_8 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc33)
+    %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<1024xi32> loc(#loc33)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc34)
+    %tmp0_10 = tt.addptr %tmp0, %xindex_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc34)
+    %tmp0_11 = tt.load %tmp0_10 : tensor<1024x!tt.ptr<bf16>> loc(#loc35)
+    %tmp0_12 = arith.extf %tmp0_11 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc36)
+    %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc37)
+    %tmp1_13 = tt.addptr %tmp1, %x0_9 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc37)
+    %tmp1_14 = tt.load %tmp1_13 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc38)
+    %tmp1_15 = arith.extf %tmp1_14 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc39)
+    %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc40)
+    %tmp2_16 = tt.addptr %tmp2, %xindex_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc40)
+    %tmp2_17 = tt.load %tmp2_16 : tensor<1024x!tt.ptr<bf16>> loc(#loc41)
+    %tmp2_18 = arith.extf %tmp2_17 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc42)
+    %tmp3 = arith.mulf %tmp1_15, %tmp2_18 : tensor<1024xf32> loc(#loc43)
+    %tmp4 = arith.addf %tmp0_12, %tmp3 : tensor<1024xf32> loc(#loc44)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc19)
+    %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc19)
+    %2 = arith.truncf %tmp4 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc20)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>> loc(#loc20)
+    tt.return loc(#loc21)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":22:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":24:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:30)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:35)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:44)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:30)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:35)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:74)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:35)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:44)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":28:18)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":29:18)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:25)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:36)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:4)
+#loc27 = loc("xnumel"(#loc1))
+#loc28 = loc("xoffset"(#loc2))
+#loc29 = loc("xoffset"(#loc3))
+#loc30 = loc("xindex"(#loc4))
+#loc31 = loc("xindex"(#loc5))
+#loc32 = loc("xmask"(#loc6))
+#loc33 = loc("x0"(#loc7))
+#loc34 = loc("tmp0"(#loc8))
+#loc35 = loc("tmp0"(#loc9))
+#loc36 = loc("tmp0"(#loc10))
+#loc37 = loc("tmp1"(#loc11))
+#loc38 = loc("tmp1"(#loc12))
+#loc39 = loc("tmp1"(#loc13))
+#loc40 = loc("tmp2"(#loc14))
+#loc41 = loc("tmp2"(#loc15))
+#loc42 = loc("tmp2"(#loc16))
+#loc43 = loc("tmp3"(#loc17))
+#loc44 = loc("tmp4"(#loc18))
diff --git a/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.ttgir b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..2bd931d206dbaacc44db3109cd29764423901b3b
--- /dev/null
+++ b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.ttgir
@@ -0,0 +1,74 @@
+#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":18:0)
+#loc21 = loc("in_ptr0"(#loc))
+#loc22 = loc("in_ptr1"(#loc))
+#loc23 = loc("in_ptr2"(#loc))
+#loc24 = loc("out_ptr0"(#loc))
+#loc25 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused_add_mul_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4096> : tensor<1024xi32, #blocked> loc(#loc1)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc26)
+    %xoffset_0 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc27)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc28)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<1024xi32, #blocked> loc(#loc29)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<1024xi32, #blocked> loc(#loc29)
+    %x0 = arith.remsi %xindex_2, %cst : tensor<1024xi32, #blocked> loc(#loc30)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc31)
+    %tmp0_3 = tt.addptr %tmp0, %xindex_2 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc31)
+    %tmp0_4 = tt.load %tmp0_3 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc32)
+    %tmp0_5 = arith.extf %tmp0_4 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc33)
+    %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc34)
+    %tmp1_6 = tt.addptr %tmp1, %x0 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc34)
+    %tmp1_7 = tt.load %tmp1_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc35)
+    %tmp1_8 = arith.extf %tmp1_7 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc36)
+    %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc37)
+    %tmp2_9 = tt.addptr %tmp2, %xindex_2 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc37)
+    %tmp2_10 = tt.load %tmp2_9 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc38)
+    %tmp2_11 = arith.extf %tmp2_10 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc39)
+    %tmp3 = arith.mulf %tmp1_8, %tmp2_11 : tensor<1024xf32, #blocked> loc(#loc40)
+    %tmp4 = arith.addf %tmp0_5, %tmp3 : tensor<1024xf32, #blocked> loc(#loc41)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc18)
+    %1 = tt.addptr %0, %xindex_2 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc18)
+    %2 = arith.truncf %tmp4 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> loc(#loc19)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc19)
+    tt.return loc(#loc20)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":24:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:30)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:35)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:44)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:30)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:74)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:30)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:35)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:44)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":28:18)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":29:18)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:25)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:36)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:4)
+#loc26 = loc("xoffset"(#loc2))
+#loc27 = loc("xoffset"(#loc3))
+#loc28 = loc("xindex"(#loc4))
+#loc29 = loc("xindex"(#loc5))
+#loc30 = loc("x0"(#loc6))
+#loc31 = loc("tmp0"(#loc7))
+#loc32 = loc("tmp0"(#loc8))
+#loc33 = loc("tmp0"(#loc9))
+#loc34 = loc("tmp1"(#loc10))
+#loc35 = loc("tmp1"(#loc11))
+#loc36 = loc("tmp1"(#loc12))
+#loc37 = loc("tmp2"(#loc13))
+#loc38 = loc("tmp2"(#loc14))
+#loc39 = loc("tmp2"(#loc15))
+#loc40 = loc("tmp3"(#loc16))
+#loc41 = loc("tmp4"(#loc17))
diff --git a/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.ttir b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..cab525834fbe60a064cdc220ae6ad53066fcf24e
--- /dev/null
+++ b/triton/EQOEBZDPMDVSX6EJFLBNKY5DUKJXFLSS4SF4QQZQUN6AV3JLHKJQ/triton_poi_fused_add_mul_1.ttir
@@ -0,0 +1,73 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":18:0)
+#loc21 = loc("in_ptr0"(#loc))
+#loc22 = loc("in_ptr1"(#loc))
+#loc23 = loc("in_ptr2"(#loc))
+#loc24 = loc("out_ptr0"(#loc))
+#loc25 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_add_mul_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %x0 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc26)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc2)
+    %xoffset = tt.get_program_id x : i32 loc(#loc27)
+    %xoffset_0 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc28)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc29)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<1024xi32> loc(#loc30)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<1024xi32> loc(#loc30)
+    %x0_3 = arith.remsi %xindex_2, %x0 : tensor<1024xi32> loc(#loc26)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc31)
+    %tmp0_4 = tt.addptr %tmp0, %xindex_2 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc31)
+    %tmp0_5 = tt.load %tmp0_4 : tensor<1024x!tt.ptr<bf16>> loc(#loc32)
+    %tmp0_6 = arith.extf %tmp0_5 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc33)
+    %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc34)
+    %tmp1_7 = tt.addptr %tmp1, %x0_3 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc34)
+    %tmp1_8 = tt.load %tmp1_7 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc35)
+    %tmp1_9 = arith.extf %tmp1_8 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc36)
+    %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc37)
+    %tmp2_10 = tt.addptr %tmp2, %xindex_2 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc37)
+    %tmp2_11 = tt.load %tmp2_10 : tensor<1024x!tt.ptr<bf16>> loc(#loc38)
+    %tmp2_12 = arith.extf %tmp2_11 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc39)
+    %tmp3 = arith.mulf %tmp1_9, %tmp2_12 : tensor<1024xf32> loc(#loc40)
+    %tmp4 = arith.addf %tmp0_6, %tmp3 : tensor<1024xf32> loc(#loc41)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc18)
+    %1 = tt.addptr %0, %xindex_2 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc18)
+    %2 = arith.truncf %tmp4 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc19)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>> loc(#loc19)
+    tt.return loc(#loc20)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":24:19)
+#loc2 = loc(unknown)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":20:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":20:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":21:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":21:23)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:30)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:35)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:44)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:30)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:74)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:30)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:35)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:44)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":28:18)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":29:18)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:25)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:36)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:4)
+#loc26 = loc("x0"(#loc1))
+#loc27 = loc("xoffset"(#loc3))
+#loc28 = loc("xoffset"(#loc4))
+#loc29 = loc("xindex"(#loc5))
+#loc30 = loc("xindex"(#loc6))
+#loc31 = loc("tmp0"(#loc7))
+#loc32 = loc("tmp0"(#loc8))
+#loc33 = loc("tmp0"(#loc9))
+#loc34 = loc("tmp1"(#loc10))
+#loc35 = loc("tmp1"(#loc11))
+#loc36 = loc("tmp1"(#loc12))
+#loc37 = loc("tmp2"(#loc13))
+#loc38 = loc("tmp2"(#loc14))
+#loc39 = loc("tmp2"(#loc15))
+#loc40 = loc("tmp3"(#loc16))
+#loc41 = loc("tmp4"(#loc17))
diff --git a/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..b3029a86141c66d076edeed93f1ae540e3f700b5
--- /dev/null
+++ b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused_add_mul_native_layer_norm_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.source", "triton_red_fused_add_mul_native_layer_norm_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.ttir", "triton_red_fused_add_mul_native_layer_norm_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir", "triton_red_fused_add_mul_native_layer_norm_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.llir", "triton_red_fused_add_mul_native_layer_norm_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.ptx", "triton_red_fused_add_mul_native_layer_norm_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.cubin", "triton_red_fused_add_mul_native_layer_norm_0.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.json"}}
\ No newline at end of file
diff --git a/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.cubin b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..68bd792892dae5d1d40cccf5bea7af9065ebbefe
Binary files /dev/null and b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.cubin differ
diff --git a/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.json b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..312fa5e84bd5d67a7574ac12a3ccaba6ed604438
--- /dev/null
+++ b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.json
@@ -0,0 +1 @@
+{"hash": "3faa6d7a76d0dd5fc76fbd5e51dfc7c8aa82497dce159b2e965670e6889e6c97", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 192, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_add_mul_native_layer_norm_0"}
\ No newline at end of file
diff --git a/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.llir b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..f7820abb34d8dd7637be0088fdf9db884a3c3fd8
--- /dev/null
+++ b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.llir
@@ -0,0 +1,620 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external addrspace(3) global [0 x i8], align 16
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused_add_mul_native_layer_norm_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 {
+__nv_rsqrtf.exit:
+  %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8
+  %12 = icmp samesign ult i32 %11, 256, !dbg !9
+  %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %14 = shl nuw nsw i32 %13, 2, !dbg !10
+  %15 = and i32 %14, 2044, !dbg !10
+  %16 = shl i32 %11, 12, !dbg !11
+  %17 = zext nneg i32 %15 to i64, !dbg !12
+  %18 = sext i32 %16 to i64, !dbg !12
+  %19 = or disjoint i64 %17, %18, !dbg !13
+  %20 = getelementptr bfloat, ptr addrspace(1) %0, i64 %19, !dbg !14
+  %21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !15
+  %22 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %20, i64 %21, i1 %12) #6, !dbg !15
+  %23 = getelementptr bfloat, ptr addrspace(1) %1, i64 %17, !dbg !16
+  %24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17
+  %25 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %23, i64 %24, i1 true) #6, !dbg !17
+  %26 = getelementptr bfloat, ptr addrspace(1) %2, i64 %19, !dbg !18
+  %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !19
+  %28 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %26, i64 %27, i1 %12) #6, !dbg !19
+  %29 = extractvalue { i32, i32 } %25, 1, !dbg !17
+  %30 = bitcast i32 %29 to <2 x bfloat>, !dbg !17
+  %31 = extractvalue { i32, i32 } %28, 1, !dbg !19
+  %32 = bitcast i32 %31 to <2 x bfloat>, !dbg !19
+  %33 = extractvalue { i32, i32 } %22, 1, !dbg !15
+  %34 = bitcast i32 %33 to <2 x bfloat>, !dbg !15
+  %35 = extractvalue { i32, i32 } %25, 0, !dbg !17
+  %36 = bitcast i32 %35 to <2 x bfloat>, !dbg !17
+  %37 = extractvalue { i32, i32 } %28, 0, !dbg !19
+  %38 = bitcast i32 %37 to <2 x bfloat>, !dbg !19
+  %39 = extractvalue { i32, i32 } %22, 0, !dbg !15
+  %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !15
+  %41 = getelementptr bfloat, ptr addrspace(1) %5, i64 %19, !dbg !20
+  %42 = fpext <2 x bfloat> %36 to <2 x float>, !dbg !21
+  %43 = fpext <2 x bfloat> %38 to <2 x float>, !dbg !22
+  %44 = fmul <2 x float> %42, %43, !dbg !23
+  %45 = fpext <2 x bfloat> %40 to <2 x float>, !dbg !24
+  %46 = fadd <2 x float> %44, %45, !dbg !25
+  %47 = extractelement <2 x float> %46, i64 0, !dbg !26
+  %48 = select i1 %12, float %47, float 0.000000e+00, !dbg !26
+  %49 = extractelement <2 x float> %46, i64 1, !dbg !26
+  %50 = select i1 %12, float %49, float 0.000000e+00, !dbg !26
+  %51 = fptrunc <2 x float> %46 to <2 x bfloat>, !dbg !27
+  %52 = fpext <2 x bfloat> %30 to <2 x float>, !dbg !21
+  %53 = fpext <2 x bfloat> %32 to <2 x float>, !dbg !22
+  %54 = fmul <2 x float> %52, %53, !dbg !23
+  %55 = fpext <2 x bfloat> %34 to <2 x float>, !dbg !24
+  %56 = fadd <2 x float> %54, %55, !dbg !25
+  %57 = extractelement <2 x float> %56, i64 0, !dbg !26
+  %58 = select i1 %12, float %57, float 0.000000e+00, !dbg !26
+  %59 = extractelement <2 x float> %56, i64 1, !dbg !26
+  %60 = select i1 %12, float %59, float 0.000000e+00, !dbg !26
+  %61 = fptrunc <2 x float> %56 to <2 x bfloat>, !dbg !27
+  %62 = bitcast <2 x bfloat> %51 to i32, !dbg !27
+  %63 = bitcast <2 x bfloat> %61 to i32, !dbg !27
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %62, i32 %63, ptr addrspace(1) %41, i1 %12) #6, !dbg !27
+  %64 = or disjoint i64 %17, 2048, !dbg !28
+  %65 = or disjoint i64 %64, %18, !dbg !13
+  %66 = getelementptr bfloat, ptr addrspace(1) %0, i64 %65, !dbg !14
+  %67 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !15
+  %68 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %66, i64 %67, i1 %12) #6, !dbg !15
+  %69 = extractvalue { i32, i32 } %68, 0, !dbg !15
+  %70 = bitcast i32 %69 to <2 x bfloat>, !dbg !15
+  %71 = extractvalue { i32, i32 } %68, 1, !dbg !15
+  %72 = bitcast i32 %71 to <2 x bfloat>, !dbg !15
+  %73 = getelementptr bfloat, ptr addrspace(1) %1, i64 %64, !dbg !16
+  %74 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17
+  %75 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %73, i64 %74, i1 true) #6, !dbg !17
+  %76 = extractvalue { i32, i32 } %75, 0, !dbg !17
+  %77 = bitcast i32 %76 to <2 x bfloat>, !dbg !17
+  %78 = extractvalue { i32, i32 } %75, 1, !dbg !17
+  %79 = bitcast i32 %78 to <2 x bfloat>, !dbg !17
+  %80 = getelementptr bfloat, ptr addrspace(1) %2, i64 %65, !dbg !18
+  %81 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !19
+  %82 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %80, i64 %81, i1 %12) #6, !dbg !19
+  %83 = extractvalue { i32, i32 } %82, 0, !dbg !19
+  %84 = bitcast i32 %83 to <2 x bfloat>, !dbg !19
+  %85 = extractvalue { i32, i32 } %82, 1, !dbg !19
+  %86 = bitcast i32 %85 to <2 x bfloat>, !dbg !19
+  %87 = select i1 %12, float 2.000000e+00, float 1.000000e+00, !dbg !29
+  %88 = select i1 %12, float 2.000000e+00, float 0.000000e+00, !dbg !29
+  %89 = select i1 %12, float 2.000000e+00, float 0.000000e+00, !dbg !29
+  %90 = select i1 %12, float 2.000000e+00, float 0.000000e+00, !dbg !29
+  %91 = select i1 %12, float 2.000000e+00, float 0.000000e+00, !dbg !29
+  %92 = getelementptr bfloat, ptr addrspace(1) %5, i64 %65, !dbg !20
+  %93 = fpext <2 x bfloat> %70 to <2 x float>, !dbg !24
+  %94 = fpext <2 x bfloat> %77 to <2 x float>, !dbg !21
+  %95 = fpext <2 x bfloat> %84 to <2 x float>, !dbg !22
+  %96 = fmul <2 x float> %94, %95, !dbg !23
+  %97 = fadd <2 x float> %96, %93, !dbg !25
+  %98 = extractelement <2 x float> %97, i64 0, !dbg !30
+  %99 = fsub float %98, %48, !dbg !35
+  %100 = tail call float @llvm.nvvm.div.full(float %99, float %87), !dbg !36
+  %101 = fadd float %48, %100, !dbg !37
+  %102 = fsub float %98, %101, !dbg !30
+  %103 = fmul float %99, %102, !dbg !38
+  %104 = fadd float %103, 0.000000e+00, !dbg !39
+  %105 = extractelement <2 x float> %97, i64 1, !dbg !30
+  %106 = fsub float %105, %50, !dbg !35
+  %107 = tail call float @llvm.nvvm.div.full(float %106, float %87), !dbg !36
+  %108 = fadd float %50, %107, !dbg !37
+  %109 = fsub float %105, %108, !dbg !30
+  %110 = fmul float %106, %109, !dbg !38
+  %111 = fadd float %110, 0.000000e+00, !dbg !39
+  %112 = select i1 %12, float %101, float 0.000000e+00, !dbg !26
+  %113 = select i1 %12, float %108, float 0.000000e+00, !dbg !26
+  %114 = fptrunc <2 x float> %97 to <2 x bfloat>, !dbg !27
+  %115 = fpext <2 x bfloat> %72 to <2 x float>, !dbg !24
+  %116 = fpext <2 x bfloat> %79 to <2 x float>, !dbg !21
+  %117 = fpext <2 x bfloat> %86 to <2 x float>, !dbg !22
+  %118 = fmul <2 x float> %116, %117, !dbg !23
+  %119 = fadd <2 x float> %118, %115, !dbg !25
+  %120 = extractelement <2 x float> %119, i64 0, !dbg !30
+  %121 = fsub float %120, %58, !dbg !35
+  %122 = tail call float @llvm.nvvm.div.full(float %121, float %87), !dbg !36
+  %123 = fadd float %58, %122, !dbg !37
+  %124 = fsub float %120, %123, !dbg !30
+  %125 = fmul float %121, %124, !dbg !38
+  %126 = fadd float %125, 0.000000e+00, !dbg !39
+  %127 = extractelement <2 x float> %119, i64 1, !dbg !30
+  %128 = fsub float %127, %60, !dbg !35
+  %129 = tail call float @llvm.nvvm.div.full(float %128, float %87), !dbg !36
+  %130 = fadd float %60, %129, !dbg !37
+  %131 = fsub float %127, %130, !dbg !30
+  %132 = fmul float %128, %131, !dbg !38
+  %133 = fadd float %132, 0.000000e+00, !dbg !39
+  %134 = select i1 %12, float %123, float 0.000000e+00, !dbg !26
+  %135 = select i1 %12, float %130, float 0.000000e+00, !dbg !26
+  %136 = select i1 %12, float %126, float 0.000000e+00, !dbg !40
+  %137 = select i1 %12, float %133, float 0.000000e+00, !dbg !40
+  %138 = fptrunc <2 x float> %119 to <2 x bfloat>, !dbg !27
+  %139 = bitcast <2 x bfloat> %114 to i32, !dbg !27
+  %140 = bitcast <2 x bfloat> %138 to i32, !dbg !27
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %139, i32 %140, ptr addrspace(1) %92, i1 %12) #6, !dbg !27
+  %141 = and i32 %13, 511, !dbg !10
+  %142 = and i32 %13, 31, !dbg !10
+  %143 = lshr i32 %141, 5, !dbg !10
+  %144 = fsub float %113, %112, !dbg !41
+  %145 = select i1 %12, float 4.000000e+00, float 0.000000e+00, !dbg !44
+  %146 = fcmp oeq float %145, 0.000000e+00, !dbg !45
+  %147 = tail call float @llvm.nvvm.div.full(float %89, float %145), !dbg !46
+  %148 = select i1 %146, float 0.000000e+00, float %147, !dbg !47
+  %149 = fmul float %144, %148, !dbg !48
+  %150 = fadd float %112, %149, !dbg !49
+  %151 = fadd float %104, %111, !dbg !50
+  %152 = select i1 %12, float %151, float 0.000000e+00, !dbg !50
+  %153 = fmul float %144, %144, !dbg !51
+  %154 = fmul float %153, %88, !dbg !52
+  %155 = fmul float %154, %148, !dbg !53
+  %156 = fadd float %152, %155, !dbg !54
+  %157 = fsub float %134, %150, !dbg !41
+  %158 = select i1 %12, float 6.000000e+00, float 0.000000e+00, !dbg !44
+  %159 = fcmp oeq float %158, 0.000000e+00, !dbg !45
+  %160 = tail call float @llvm.nvvm.div.full(float %90, float %158), !dbg !46
+  %161 = select i1 %159, float 0.000000e+00, float %160, !dbg !47
+  %162 = fmul float %161, %157, !dbg !48
+  %163 = fadd float %150, %162, !dbg !49
+  %164 = fadd float %136, %156, !dbg !50
+  %165 = fmul float %157, %157, !dbg !51
+  %166 = fmul float %145, %165, !dbg !52
+  %167 = fmul float %161, %166, !dbg !53
+  %168 = fadd float %164, %167, !dbg !54
+  %169 = fsub float %135, %163, !dbg !41
+  %170 = select i1 %12, float 8.000000e+00, float 0.000000e+00, !dbg !44
+  %171 = fcmp oeq float %170, 0.000000e+00, !dbg !45
+  %172 = tail call float @llvm.nvvm.div.full(float %91, float %170), !dbg !46
+  %173 = select i1 %171, float 0.000000e+00, float %172, !dbg !47
+  %174 = fmul float %173, %169, !dbg !48
+  %175 = fadd float %163, %174, !dbg !49
+  %176 = fadd float %137, %168, !dbg !50
+  %177 = fmul float %169, %169, !dbg !51
+  %178 = fmul float %158, %177, !dbg !52
+  %179 = fmul float %173, %178, !dbg !53
+  %180 = fadd float %176, %179, !dbg !54
+  %181 = bitcast float %175 to i32, !dbg !42
+  %182 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %181, i32 16, i32 31), !dbg !42
+  %183 = bitcast i32 %182 to float, !dbg !42
+  %184 = bitcast float %180 to i32, !dbg !42
+  %185 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %184, i32 16, i32 31), !dbg !42
+  %186 = bitcast i32 %185 to float, !dbg !42
+  %187 = bitcast float %170 to i32, !dbg !42
+  %188 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %187, i32 16, i32 31), !dbg !42
+  %189 = bitcast i32 %188 to float, !dbg !42
+  %190 = fsub float %183, %175, !dbg !41
+  %191 = fadd float %170, %189, !dbg !44
+  %192 = fcmp oeq float %191, 0.000000e+00, !dbg !45
+  %193 = tail call float @llvm.nvvm.div.full(float %189, float %191), !dbg !46
+  %194 = select i1 %192, float 0.000000e+00, float %193, !dbg !47
+  %195 = fmul float %194, %190, !dbg !48
+  %196 = fadd float %175, %195, !dbg !49
+  %197 = fadd float %180, %186, !dbg !50
+  %198 = fmul float %190, %190, !dbg !51
+  %199 = fmul float %170, %198, !dbg !52
+  %200 = fmul float %194, %199, !dbg !53
+  %201 = fadd float %197, %200, !dbg !54
+  %202 = bitcast float %196 to i32, !dbg !42
+  %203 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %202, i32 8, i32 31), !dbg !42
+  %204 = bitcast i32 %203 to float, !dbg !42
+  %205 = bitcast float %201 to i32, !dbg !42
+  %206 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %205, i32 8, i32 31), !dbg !42
+  %207 = bitcast i32 %206 to float, !dbg !42
+  %208 = bitcast float %191 to i32, !dbg !42
+  %209 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %208, i32 8, i32 31), !dbg !42
+  %210 = bitcast i32 %209 to float, !dbg !42
+  %211 = fsub float %204, %196, !dbg !41
+  %212 = fadd float %191, %210, !dbg !44
+  %213 = fcmp oeq float %212, 0.000000e+00, !dbg !45
+  %214 = tail call float @llvm.nvvm.div.full(float %210, float %212), !dbg !46
+  %215 = select i1 %213, float 0.000000e+00, float %214, !dbg !47
+  %216 = fmul float %211, %215, !dbg !48
+  %217 = fadd float %196, %216, !dbg !49
+  %218 = fadd float %201, %207, !dbg !50
+  %219 = fmul float %211, %211, !dbg !51
+  %220 = fmul float %191, %219, !dbg !52
+  %221 = fmul float %215, %220, !dbg !53
+  %222 = fadd float %218, %221, !dbg !54
+  %223 = bitcast float %217 to i32, !dbg !42
+  %224 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %223, i32 4, i32 31), !dbg !42
+  %225 = bitcast i32 %224 to float, !dbg !42
+  %226 = bitcast float %222 to i32, !dbg !42
+  %227 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %226, i32 4, i32 31), !dbg !42
+  %228 = bitcast i32 %227 to float, !dbg !42
+  %229 = bitcast float %212 to i32, !dbg !42
+  %230 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %229, i32 4, i32 31), !dbg !42
+  %231 = bitcast i32 %230 to float, !dbg !42
+  %232 = fsub float %225, %217, !dbg !41
+  %233 = fadd float %212, %231, !dbg !44
+  %234 = fcmp oeq float %233, 0.000000e+00, !dbg !45
+  %235 = tail call float @llvm.nvvm.div.full(float %231, float %233), !dbg !46
+  %236 = select i1 %234, float 0.000000e+00, float %235, !dbg !47
+  %237 = fmul float %232, %236, !dbg !48
+  %238 = fadd float %217, %237, !dbg !49
+  %239 = fadd float %222, %228, !dbg !50
+  %240 = fmul float %232, %232, !dbg !51
+  %241 = fmul float %212, %240, !dbg !52
+  %242 = fmul float %236, %241, !dbg !53
+  %243 = fadd float %239, %242, !dbg !54
+  %244 = bitcast float %238 to i32, !dbg !42
+  %245 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %244, i32 2, i32 31), !dbg !42
+  %246 = bitcast i32 %245 to float, !dbg !42
+  %247 = bitcast float %243 to i32, !dbg !42
+  %248 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %247, i32 2, i32 31), !dbg !42
+  %249 = bitcast i32 %248 to float, !dbg !42
+  %250 = bitcast float %233 to i32, !dbg !42
+  %251 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %250, i32 2, i32 31), !dbg !42
+  %252 = bitcast i32 %251 to float, !dbg !42
+  %253 = fsub float %246, %238, !dbg !41
+  %254 = fadd float %233, %252, !dbg !44
+  %255 = fcmp oeq float %254, 0.000000e+00, !dbg !45
+  %256 = tail call float @llvm.nvvm.div.full(float %252, float %254), !dbg !46
+  %257 = select i1 %255, float 0.000000e+00, float %256, !dbg !47
+  %258 = fmul float %253, %257, !dbg !48
+  %259 = fadd float %238, %258, !dbg !49
+  %260 = fadd float %243, %249, !dbg !50
+  %261 = fmul float %253, %253, !dbg !51
+  %262 = fmul float %233, %261, !dbg !52
+  %263 = fmul float %257, %262, !dbg !53
+  %264 = fadd float %260, %263, !dbg !54
+  %265 = bitcast float %259 to i32, !dbg !42
+  %266 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %265, i32 1, i32 31), !dbg !42
+  %267 = bitcast i32 %266 to float, !dbg !42
+  %268 = bitcast float %264 to i32, !dbg !42
+  %269 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %268, i32 1, i32 31), !dbg !42
+  %270 = bitcast i32 %269 to float, !dbg !42
+  %271 = bitcast float %254 to i32, !dbg !42
+  %272 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %271, i32 1, i32 31), !dbg !42
+  %273 = bitcast i32 %272 to float, !dbg !42
+  %274 = fsub float %267, %259, !dbg !41
+  %275 = fadd float %254, %273, !dbg !44
+  %276 = fcmp oeq float %275, 0.000000e+00, !dbg !45
+  %277 = tail call float @llvm.nvvm.div.full(float %273, float %275), !dbg !46
+  %278 = select i1 %276, float 0.000000e+00, float %277, !dbg !47
+  %279 = fmul float %274, %278, !dbg !48
+  %280 = fadd float %259, %279, !dbg !49
+  %281 = fadd float %264, %270, !dbg !50
+  %282 = fmul float %274, %274, !dbg !51
+  %283 = fmul float %254, %282, !dbg !52
+  %284 = fmul float %278, %283, !dbg !53
+  %285 = fadd float %281, %284, !dbg !54
+  %286 = icmp eq i32 %142, 0, !dbg !42
+  %287 = getelementptr float, ptr addrspace(3) @global_smem, i32 %143, !dbg !42
+  %288 = bitcast float %280 to <1 x i32>, !dbg !42
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %287, <1 x i32> %288, i1 %286) #6, !dbg !42
+  %289 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %143, !dbg !42
+  %290 = bitcast float %285 to <1 x i32>, !dbg !42
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %289, <1 x i32> %290, i1 %286) #6, !dbg !42
+  %291 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %143, !dbg !42
+  %292 = bitcast float %275 to <1 x i32>, !dbg !42
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %291, <1 x i32> %292, i1 %286) #6, !dbg !42
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !42
+  %293 = icmp samesign ult i32 %141, 16, !dbg !42
+  %294 = getelementptr float, ptr addrspace(3) @global_smem, i32 %141, !dbg !42
+  %295 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %294, i1 %293) #6, !dbg !42
+  %296 = bitcast i32 %295 to float, !dbg !42
+  %297 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %141, !dbg !42
+  %298 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %297, i1 %293) #6, !dbg !42
+  %299 = bitcast i32 %298 to float, !dbg !42
+  %300 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %141, !dbg !42
+  %301 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %300, i1 %293) #6, !dbg !42
+  %302 = bitcast i32 %301 to float, !dbg !42
+  %303 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %295, i32 8, i32 31), !dbg !42
+  %304 = bitcast i32 %303 to float, !dbg !42
+  %305 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %298, i32 8, i32 31), !dbg !42
+  %306 = bitcast i32 %305 to float, !dbg !42
+  %307 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %301, i32 8, i32 31), !dbg !42
+  %308 = bitcast i32 %307 to float, !dbg !42
+  %309 = fsub float %304, %296, !dbg !41
+  %310 = fadd float %302, %308, !dbg !44
+  %311 = fcmp oeq float %310, 0.000000e+00, !dbg !45
+  %312 = tail call float @llvm.nvvm.div.full(float %308, float %310), !dbg !46
+  %313 = select i1 %311, float 0.000000e+00, float %312, !dbg !47
+  %314 = fmul float %309, %313, !dbg !48
+  %315 = fadd float %314, %296, !dbg !49
+  %316 = fadd float %299, %306, !dbg !50
+  %317 = fmul float %309, %309, !dbg !51
+  %318 = fmul float %317, %302, !dbg !52
+  %319 = fmul float %318, %313, !dbg !53
+  %320 = fadd float %316, %319, !dbg !54
+  %321 = bitcast float %315 to i32, !dbg !42
+  %322 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %321, i32 4, i32 31), !dbg !42
+  %323 = bitcast i32 %322 to float, !dbg !42
+  %324 = bitcast float %320 to i32, !dbg !42
+  %325 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %324, i32 4, i32 31), !dbg !42
+  %326 = bitcast i32 %325 to float, !dbg !42
+  %327 = bitcast float %310 to i32, !dbg !42
+  %328 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %327, i32 4, i32 31), !dbg !42
+  %329 = bitcast i32 %328 to float, !dbg !42
+  %330 = fsub float %323, %315, !dbg !41
+  %331 = fadd float %310, %329, !dbg !44
+  %332 = fcmp oeq float %331, 0.000000e+00, !dbg !45
+  %333 = tail call float @llvm.nvvm.div.full(float %329, float %331), !dbg !46
+  %334 = select i1 %332, float 0.000000e+00, float %333, !dbg !47
+  %335 = fmul float %330, %334, !dbg !48
+  %336 = fadd float %315, %335, !dbg !49
+  %337 = fadd float %320, %326, !dbg !50
+  %338 = fmul float %330, %330, !dbg !51
+  %339 = fmul float %310, %338, !dbg !52
+  %340 = fmul float %334, %339, !dbg !53
+  %341 = fadd float %337, %340, !dbg !54
+  %342 = bitcast float %336 to i32, !dbg !42
+  %343 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %342, i32 2, i32 31), !dbg !42
+  %344 = bitcast i32 %343 to float, !dbg !42
+  %345 = bitcast float %341 to i32, !dbg !42
+  %346 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %345, i32 2, i32 31), !dbg !42
+  %347 = bitcast i32 %346 to float, !dbg !42
+  %348 = bitcast float %331 to i32, !dbg !42
+  %349 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %348, i32 2, i32 31), !dbg !42
+  %350 = bitcast i32 %349 to float, !dbg !42
+  %351 = fsub float %344, %336, !dbg !41
+  %352 = fadd float %331, %350, !dbg !44
+  %353 = fcmp oeq float %352, 0.000000e+00, !dbg !45
+  %354 = tail call float @llvm.nvvm.div.full(float %350, float %352), !dbg !46
+  %355 = select i1 %353, float 0.000000e+00, float %354, !dbg !47
+  %356 = fmul float %351, %355, !dbg !48
+  %357 = fadd float %336, %356, !dbg !49
+  %358 = fadd float %341, %347, !dbg !50
+  %359 = fmul float %351, %351, !dbg !51
+  %360 = fmul float %331, %359, !dbg !52
+  %361 = fmul float %355, %360, !dbg !53
+  %362 = fadd float %358, %361, !dbg !54
+  %363 = bitcast float %357 to i32, !dbg !42
+  %364 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %363, i32 1, i32 31), !dbg !42
+  %365 = bitcast i32 %364 to float, !dbg !42
+  %366 = bitcast float %362 to i32, !dbg !42
+  %367 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %366, i32 1, i32 31), !dbg !42
+  %368 = bitcast i32 %367 to float, !dbg !42
+  %369 = bitcast float %352 to i32, !dbg !42
+  %370 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %369, i32 1, i32 31), !dbg !42
+  %371 = bitcast i32 %370 to float, !dbg !42
+  %372 = fsub float %365, %357, !dbg !41
+  %373 = fadd float %352, %371, !dbg !44
+  %374 = fcmp oeq float %373, 0.000000e+00, !dbg !45
+  %375 = tail call float @llvm.nvvm.div.full(float %371, float %373), !dbg !46
+  %376 = select i1 %374, float 0.000000e+00, float %375, !dbg !47
+  %377 = fmul float %372, %376, !dbg !48
+  %378 = fadd float %357, %377, !dbg !49
+  %379 = fadd float %362, %368, !dbg !50
+  %380 = fmul float %372, %372, !dbg !51
+  %381 = fmul float %352, %380, !dbg !52
+  %382 = fmul float %376, %381, !dbg !53
+  %383 = fadd float %379, %382, !dbg !54
+  %384 = and i32 %13, 15, !dbg !42
+  %385 = icmp eq i32 %384, 0, !dbg !42
+  %386 = and i1 %293, %385, !dbg !42
+  %387 = bitcast float %378 to <1 x i32>, !dbg !42
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %294, <1 x i32> %387, i1 %386) #6, !dbg !42
+  %388 = bitcast float %383 to <1 x i32>, !dbg !42
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %297, <1 x i32> %388, i1 %386) #6, !dbg !42
+  %389 = bitcast float %373 to <1 x i32>, !dbg !42
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %300, <1 x i32> %389, i1 %386) #6, !dbg !42
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !42
+  %390 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !42
+  %391 = load float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !42
+  %392 = tail call float @llvm.nvvm.div.full(float %391, float 4.096000e+03), !dbg !55
+  %393 = fadd float %392, 0x3EB0C6F7A0000000, !dbg !56
+  %394 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !57
+  %395 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !57
+  %396 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !57
+  %397 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !57
+  %.not.i15 = icmp eq i32 %397, 0, !dbg !57
+  br i1 %.not.i15, label %400, label %398, !dbg !57
+
+398:                                              ; preds = %__nv_rsqrtf.exit
+  %399 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %393), !dbg !57
+  br label %__nv_rsqrtf.exit17, !dbg !57
+
+400:                                              ; preds = %__nv_rsqrtf.exit
+  %401 = tail call float @llvm.nvvm.rsqrt.approx.f(float %393), !dbg !57
+  br label %__nv_rsqrtf.exit17, !dbg !57
+
+__nv_rsqrtf.exit17:                               ; preds = %398, %400
+  %.0.i16 = phi float [ %399, %398 ], [ %401, %400 ], !dbg !57
+  %402 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !58
+  %403 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %41, i64 %402, i1 %12) #6, !dbg !58
+  %404 = extractvalue { i32, i32 } %403, 0, !dbg !58
+  %405 = bitcast i32 %404 to <2 x bfloat>, !dbg !58
+  %406 = extractvalue { i32, i32 } %403, 1, !dbg !58
+  %407 = bitcast i32 %406 to <2 x bfloat>, !dbg !58
+  %408 = getelementptr bfloat, ptr addrspace(1) %3, i64 %17, !dbg !59
+  %409 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !60
+  %410 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %408, i64 %409, i1 true) #6, !dbg !60
+  %411 = extractvalue { i32, i32 } %410, 0, !dbg !60
+  %412 = bitcast i32 %411 to <2 x bfloat>, !dbg !60
+  %413 = extractvalue { i32, i32 } %410, 1, !dbg !60
+  %414 = bitcast i32 %413 to <2 x bfloat>, !dbg !60
+  %415 = getelementptr bfloat, ptr addrspace(1) %4, i64 %17, !dbg !61
+  %416 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !62
+  %417 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %415, i64 %416, i1 true) #6, !dbg !62
+  %418 = extractvalue { i32, i32 } %417, 0, !dbg !62
+  %419 = bitcast i32 %418 to <2 x bfloat>, !dbg !62
+  %420 = extractvalue { i32, i32 } %417, 1, !dbg !62
+  %421 = bitcast i32 %420 to <2 x bfloat>, !dbg !62
+  %422 = getelementptr bfloat, ptr addrspace(1) %6, i64 %19, !dbg !63
+  %423 = fpext <2 x bfloat> %405 to <2 x float>, !dbg !64
+  %424 = fpext <2 x bfloat> %412 to <2 x float>, !dbg !65
+  %425 = fpext <2 x bfloat> %419 to <2 x float>, !dbg !66
+  %426 = insertelement <2 x float> poison, float %390, i64 0, !dbg !67
+  %427 = shufflevector <2 x float> %426, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !67
+  %428 = fsub <2 x float> %423, %427, !dbg !67
+  %429 = insertelement <2 x float> poison, float %.0.i16, i64 0, !dbg !68
+  %430 = shufflevector <2 x float> %429, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !68
+  %431 = fmul <2 x float> %430, %428, !dbg !68
+  %432 = fadd <2 x float> %424, splat (float 1.000000e+00), !dbg !69
+  %433 = fmul <2 x float> %431, %432, !dbg !70
+  %434 = fadd <2 x float> %433, %425, !dbg !71
+  %435 = fptrunc <2 x float> %434 to <2 x bfloat>, !dbg !72
+  %436 = fpext <2 x bfloat> %407 to <2 x float>, !dbg !64
+  %437 = fpext <2 x bfloat> %414 to <2 x float>, !dbg !65
+  %438 = fpext <2 x bfloat> %421 to <2 x float>, !dbg !66
+  %439 = fsub <2 x float> %436, %427, !dbg !67
+  %440 = fmul <2 x float> %430, %439, !dbg !68
+  %441 = fadd <2 x float> %437, splat (float 1.000000e+00), !dbg !69
+  %442 = fmul <2 x float> %440, %441, !dbg !70
+  %443 = fadd <2 x float> %442, %438, !dbg !71
+  %444 = fptrunc <2 x float> %443 to <2 x bfloat>, !dbg !72
+  %445 = bitcast <2 x bfloat> %435 to i32, !dbg !72
+  %446 = bitcast <2 x bfloat> %444 to i32, !dbg !72
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %445, i32 %446, ptr addrspace(1) %422, i1 %12) #6, !dbg !72
+  %447 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !58
+  %448 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %92, i64 %447, i1 %12) #6, !dbg !58
+  %449 = extractvalue { i32, i32 } %448, 0, !dbg !58
+  %450 = bitcast i32 %449 to <2 x bfloat>, !dbg !58
+  %451 = extractvalue { i32, i32 } %448, 1, !dbg !58
+  %452 = bitcast i32 %451 to <2 x bfloat>, !dbg !58
+  %453 = getelementptr bfloat, ptr addrspace(1) %3, i64 %64, !dbg !59
+  %454 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !60
+  %455 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %453, i64 %454, i1 true) #6, !dbg !60
+  %456 = extractvalue { i32, i32 } %455, 0, !dbg !60
+  %457 = bitcast i32 %456 to <2 x bfloat>, !dbg !60
+  %458 = extractvalue { i32, i32 } %455, 1, !dbg !60
+  %459 = bitcast i32 %458 to <2 x bfloat>, !dbg !60
+  %460 = getelementptr bfloat, ptr addrspace(1) %4, i64 %64, !dbg !61
+  %461 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !62
+  %462 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %460, i64 %461, i1 true) #6, !dbg !62
+  %463 = extractvalue { i32, i32 } %462, 0, !dbg !62
+  %464 = bitcast i32 %463 to <2 x bfloat>, !dbg !62
+  %465 = extractvalue { i32, i32 } %462, 1, !dbg !62
+  %466 = bitcast i32 %465 to <2 x bfloat>, !dbg !62
+  %467 = getelementptr bfloat, ptr addrspace(1) %6, i64 %65, !dbg !63
+  %468 = fpext <2 x bfloat> %450 to <2 x float>, !dbg !64
+  %469 = fpext <2 x bfloat> %457 to <2 x float>, !dbg !65
+  %470 = fpext <2 x bfloat> %464 to <2 x float>, !dbg !66
+  %471 = fsub <2 x float> %468, %427, !dbg !67
+  %472 = fmul <2 x float> %430, %471, !dbg !68
+  %473 = fadd <2 x float> %469, splat (float 1.000000e+00), !dbg !69
+  %474 = fmul <2 x float> %472, %473, !dbg !70
+  %475 = fadd <2 x float> %474, %470, !dbg !71
+  %476 = fptrunc <2 x float> %475 to <2 x bfloat>, !dbg !72
+  %477 = fpext <2 x bfloat> %452 to <2 x float>, !dbg !64
+  %478 = fpext <2 x bfloat> %459 to <2 x float>, !dbg !65
+  %479 = fpext <2 x bfloat> %466 to <2 x float>, !dbg !66
+  %480 = fsub <2 x float> %477, %427, !dbg !67
+  %481 = fmul <2 x float> %430, %480, !dbg !68
+  %482 = fadd <2 x float> %478, splat (float 1.000000e+00), !dbg !69
+  %483 = fmul <2 x float> %481, %482, !dbg !70
+  %484 = fadd <2 x float> %483, %479, !dbg !71
+  %485 = fptrunc <2 x float> %484 to <2 x bfloat>, !dbg !72
+  %486 = bitcast <2 x bfloat> %476 to i32, !dbg !72
+  %487 = bitcast <2 x bfloat> %485 to i32, !dbg !72
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %486, i32 %487, ptr addrspace(1) %467, i1 %12) #6, !dbg !72
+  ret void, !dbg !73
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #2
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #2
+
+attributes #0 = { nounwind "nvvm.reqntid"="512" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #4 = { convergent nocallback nounwind }
+attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!5 = distinct !DISubprogram(name: "triton_red_fused_add_mul_native_layer_norm_0", linkageName: "triton_red_fused_add_mul_native_layer_norm_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 23, column: 28, scope: !5)
+!9 = !DILocation(line: 25, column: 21, scope: !5)
+!10 = !DILocation(line: 26, column: 37, scope: !5)
+!11 = !DILocation(line: 38, column: 46, scope: !5)
+!12 = !DILocation(line: 32, column: 43, scope: !5)
+!13 = !DILocation(line: 38, column: 41, scope: !5)
+!14 = !DILocation(line: 38, column: 34, scope: !5)
+!15 = !DILocation(line: 38, column: 51, scope: !5)
+!16 = !DILocation(line: 39, column: 34, scope: !5)
+!17 = !DILocation(line: 39, column: 41, scope: !5)
+!18 = !DILocation(line: 40, column: 34, scope: !5)
+!19 = !DILocation(line: 40, column: 51, scope: !5)
+!20 = !DILocation(line: 51, column: 29, scope: !5)
+!21 = !DILocation(line: 39, column: 94, scope: !5)
+!22 = !DILocation(line: 40, column: 113, scope: !5)
+!23 = !DILocation(line: 41, column: 22, scope: !5)
+!24 = !DILocation(line: 38, column: 113, scope: !5)
+!25 = !DILocation(line: 42, column: 22, scope: !5)
+!26 = !DILocation(line: 48, column: 62, scope: !5)
+!27 = !DILocation(line: 51, column: 52, scope: !5)
+!28 = !DILocation(line: 33, column: 31, scope: !5)
+!29 = !DILocation(line: 50, column: 66, scope: !5)
+!30 = !DILocation(line: 225, column: 39, scope: !31, inlinedAt: !33)
+!31 = distinct !DILexicalBlockFile(scope: !5, file: !32, discriminator: 0)
+!32 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime")
+!33 = !DILocation(line: 46, column: 51, scope: !34)
+!34 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0)
+!35 = !DILocation(line: 222, column: 24, scope: !31, inlinedAt: !33)
+!36 = !DILocation(line: 224, column: 34, scope: !31, inlinedAt: !33)
+!37 = !DILocation(line: 224, column: 26, scope: !31, inlinedAt: !33)
+!38 = !DILocation(line: 225, column: 31, scope: !31, inlinedAt: !33)
+!39 = !DILocation(line: 225, column: 22, scope: !31, inlinedAt: !33)
+!40 = !DILocation(line: 49, column: 58, scope: !5)
+!41 = !DILocation(line: 231, column: 21, scope: !31, inlinedAt: !42)
+!42 = !DILocation(line: 243, column: 46, scope: !31, inlinedAt: !43)
+!43 = !DILocation(line: 52, column: 80, scope: !34)
+!44 = !DILocation(line: 232, column: 28, scope: !31, inlinedAt: !42)
+!45 = !DILocation(line: 233, column: 39, scope: !31, inlinedAt: !42)
+!46 = !DILocation(line: 233, column: 60, scope: !31, inlinedAt: !42)
+!47 = !DILocation(line: 233, column: 49, scope: !31, inlinedAt: !42)
+!48 = !DILocation(line: 235, column: 25, scope: !31, inlinedAt: !42)
+!49 = !DILocation(line: 235, column: 17, scope: !31, inlinedAt: !42)
+!50 = !DILocation(line: 236, column: 15, scope: !31, inlinedAt: !42)
+!51 = !DILocation(line: 236, column: 30, scope: !31, inlinedAt: !42)
+!52 = !DILocation(line: 236, column: 38, scope: !31, inlinedAt: !42)
+!53 = !DILocation(line: 236, column: 49, scope: !31, inlinedAt: !42)
+!54 = !DILocation(line: 236, column: 22, scope: !31, inlinedAt: !42)
+!55 = !DILocation(line: 68, column: 25, scope: !5)
+!56 = !DILocation(line: 70, column: 24, scope: !5)
+!57 = !DILocation(line: 71, column: 32, scope: !5)
+!58 = !DILocation(line: 62, column: 53, scope: !5)
+!59 = !DILocation(line: 63, column: 35, scope: !5)
+!60 = !DILocation(line: 63, column: 42, scope: !5)
+!61 = !DILocation(line: 64, column: 35, scope: !5)
+!62 = !DILocation(line: 64, column: 42, scope: !5)
+!63 = !DILocation(line: 78, column: 29, scope: !5)
+!64 = !DILocation(line: 62, column: 115, scope: !5)
+!65 = !DILocation(line: 63, column: 95, scope: !5)
+!66 = !DILocation(line: 64, column: 95, scope: !5)
+!67 = !DILocation(line: 66, column: 24, scope: !5)
+!68 = !DILocation(line: 72, column: 24, scope: !5)
+!69 = !DILocation(line: 75, column: 24, scope: !5)
+!70 = !DILocation(line: 76, column: 24, scope: !5)
+!71 = !DILocation(line: 77, column: 24, scope: !5)
+!72 = !DILocation(line: 78, column: 53, scope: !5)
+!73 = !DILocation(line: 56, column: 4, scope: !5)
diff --git a/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.ptx b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..874348e55c4cecf9b2dd0a4905baaf3c27563c33
--- /dev/null
+++ b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.ptx
@@ -0,0 +1,1191 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused_add_mul_native_layer_norm_0 // -- Begin function triton_red_fused_add_mul_native_layer_norm_0
+.extern .shared .align 16 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
+                                        // @triton_red_fused_add_mul_native_layer_norm_0
+.visible .entry triton_red_fused_add_mul_native_layer_norm_0(
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_1,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_4,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_5,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_6,
+	.param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_7,
+	.param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_8,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_9,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_10
+)
+.reqntid 512
+{
+	.reg .pred 	%p<19>;
+	.reg .b16 	%rs<49>;
+	.reg .b32 	%r<317>;
+	.reg .b64 	%rd<39>;
+	.loc	1 18 0                          // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:18:0
+
+// %bb.0:                               // %__nv_rsqrtf.exit
+	ld.param.b64 	%rd27, [triton_red_fused_add_mul_native_layer_norm_0_param_0];
+	ld.param.b64 	%rd28, [triton_red_fused_add_mul_native_layer_norm_0_param_1];
+$L__tmp0:
+	.loc	1 23 28                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:23:28
+	mov.u32 	%r49, %ctaid.x;
+	.loc	1 25 21                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:25:21
+	setp.lt.u32 	%p1, %r49, 256;
+	ld.param.b64 	%rd29, [triton_red_fused_add_mul_native_layer_norm_0_param_2];
+	ld.param.b64 	%rd30, [triton_red_fused_add_mul_native_layer_norm_0_param_3];
+	.loc	1 26 37                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:26:37
+	mov.u32 	%r50, %tid.x;
+	shl.b32 	%r51, %r50, 2;
+	ld.param.b64 	%rd31, [triton_red_fused_add_mul_native_layer_norm_0_param_4];
+	and.b32 	%r52, %r51, 2044;
+	ld.param.b64 	%rd32, [triton_red_fused_add_mul_native_layer_norm_0_param_5];
+	.loc	1 38 46                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:46
+	shl.b32 	%r53, %r49, 12;
+	ld.param.b64 	%rd33, [triton_red_fused_add_mul_native_layer_norm_0_param_6];
+	.loc	1 32 43                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:32:43
+	cvt.u64.u32 	%rd34, %r52;
+	cvt.s64.s32 	%rd35, %r53;
+	.loc	1 38 41                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:41
+	or.b64 	%rd36, %rd34, %rd35;
+	.loc	1 38 34                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:34
+	shl.b64 	%rd37, %rd36, 1;
+	add.s64 	%rd1, %rd27, %rd37;
+	.loc	1 38 51                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:51
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r3, 0;
+	// begin inline asm
+	mov.u32 %r1, %r3;
+	mov.u32 %r2, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	.loc	1 39 34                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:34
+	mul.wide.u32 	%rd38, %r52, 2;
+	add.s64 	%rd3, %rd28, %rd38;
+	.loc	1 39 41                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:41
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0;
+	// end inline asm
+	mov.pred 	%p2, -1;
+	// begin inline asm
+	mov.u32 %r4, %r3;
+	mov.u32 %r5, %r3;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r4, %r5 }, [ %rd3 + 0 ], %rd4;
+	// end inline asm
+	.loc	1 40 34                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:34
+	add.s64 	%rd5, %rd29, %rd37;
+	.loc	1 40 51                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:51
+	// begin inline asm
+	mov.u64 %rd6, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd6, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r6, %r3;
+	mov.u32 %r7, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r6, %r7 }, [ %rd5 + 0 ], %rd6;
+	// end inline asm
+	.loc	1 51 29                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:51:29
+	add.s64 	%rd7, %rd32, %rd37;
+	.loc	1 39 94                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:94
+	mov.b32 	{%rs1, %rs2}, %r4;
+	cvt.f32.bf16 	%r54, %rs1;
+	cvt.f32.bf16 	%r55, %rs2;
+	.loc	1 40 113                        // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:113
+	mov.b32 	{%rs3, %rs4}, %r6;
+	cvt.f32.bf16 	%r56, %rs3;
+	cvt.f32.bf16 	%r57, %rs4;
+	.loc	1 38 113                        // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:113
+	mov.b32 	{%rs5, %rs6}, %r1;
+	cvt.f32.bf16 	%r58, %rs5;
+	cvt.f32.bf16 	%r59, %rs6;
+	.loc	1 42 22                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:42:22
+	fma.rn.f32 	%r60, %r55, %r57, %r59;
+	fma.rn.f32 	%r61, %r54, %r56, %r58;
+	.loc	1 48 62                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:48:62
+	selp.f32 	%r62, %r61, 0f00000000, %p1;
+	selp.f32 	%r63, %r60, 0f00000000, %p1;
+	.loc	1 51 52                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:51:52
+	cvt.rn.bf16x2.f32 	%r8, %r60, %r61;
+	.loc	1 39 94                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:94
+	mov.b32 	{%rs7, %rs8}, %r5;
+	cvt.f32.bf16 	%r64, %rs7;
+	cvt.f32.bf16 	%r65, %rs8;
+	.loc	1 40 113                        // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:113
+	mov.b32 	{%rs9, %rs10}, %r7;
+	cvt.f32.bf16 	%r66, %rs9;
+	cvt.f32.bf16 	%r67, %rs10;
+	.loc	1 38 113                        // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:113
+	mov.b32 	{%rs11, %rs12}, %r2;
+	cvt.f32.bf16 	%r68, %rs11;
+	cvt.f32.bf16 	%r69, %rs12;
+	.loc	1 42 22                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:42:22
+	fma.rn.f32 	%r70, %r65, %r67, %r69;
+	fma.rn.f32 	%r71, %r64, %r66, %r68;
+	.loc	1 48 62                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:48:62
+	selp.f32 	%r72, %r71, 0f00000000, %p1;
+	selp.f32 	%r73, %r70, 0f00000000, %p1;
+	.loc	1 51 52                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:51:52
+	cvt.rn.bf16x2.f32 	%r9, %r70, %r71;
+	// begin inline asm
+	@%p1 st.global.v2.b32 [ %rd7 + 0 ], { %r8, %r9 };
+	// end inline asm
+	.loc	1 38 34                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:34
+	add.s64 	%rd8, %rd1, 4096;
+	.loc	1 38 51                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:51
+	// begin inline asm
+	mov.u64 %rd9, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd9, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r10, %r3;
+	mov.u32 %r11, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r10, %r11 }, [ %rd8 + 0 ], %rd9;
+	// end inline asm
+	.loc	1 39 34                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:34
+	add.s64 	%rd10, %rd3, 4096;
+	.loc	1 39 41                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:41
+	// begin inline asm
+	mov.u64 %rd11, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd11, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r12, %r3;
+	mov.u32 %r13, %r3;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r12, %r13 }, [ %rd10 + 0 ], %rd11;
+	// end inline asm
+	.loc	1 40 34                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:34
+	add.s64 	%rd12, %rd5, 4096;
+	.loc	1 40 51                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:51
+	// begin inline asm
+	mov.u64 %rd13, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd13, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r14, %r3;
+	mov.u32 %r15, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r14, %r15 }, [ %rd12 + 0 ], %rd13;
+	// end inline asm
+	.loc	1 50 66                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:50:66
+	selp.f32 	%r74, 0f40000000, 0f3F800000, %p1;
+	selp.f32 	%r75, 0f40000000, 0f00000000, %p1;
+	.loc	1 51 29                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:51:29
+	add.s64 	%rd14, %rd7, 4096;
+	.loc	1 38 113                        // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:113
+	mov.b32 	{%rs13, %rs14}, %r10;
+	cvt.f32.bf16 	%r76, %rs13;
+	cvt.f32.bf16 	%r77, %rs14;
+	.loc	1 39 94                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:94
+	mov.b32 	{%rs15, %rs16}, %r12;
+	cvt.f32.bf16 	%r78, %rs15;
+	cvt.f32.bf16 	%r79, %rs16;
+	.loc	1 40 113                        // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:113
+	mov.b32 	{%rs17, %rs18}, %r14;
+	cvt.f32.bf16 	%r80, %rs17;
+	cvt.f32.bf16 	%r81, %rs18;
+	.loc	1 42 22                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:42:22
+	fma.rn.f32 	%r82, %r79, %r81, %r77;
+	fma.rn.f32 	%r83, %r78, %r80, %r76;
+$L__tmp1:
+	.loc	2 222 24                        // triton_helpers.py:222:24 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ]
+	sub.f32 	%r84, %r83, %r62;
+	.loc	2 224 34                        // triton_helpers.py:224:34 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ]
+	div.full.f32 	%r85, %r84, %r74;
+	.loc	2 224 26                        // triton_helpers.py:224:26 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ]
+	add.f32 	%r86, %r62, %r85;
+	.loc	2 225 39                        // triton_helpers.py:225:39 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ]
+	sub.f32 	%r87, %r83, %r86;
+	.loc	2 225 22                        // triton_helpers.py:225:22 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ]
+	fma.rn.f32 	%r88, %r84, %r87, 0f00000000;
+	.loc	2 222 24                        // triton_helpers.py:222:24 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ]
+	sub.f32 	%r89, %r82, %r63;
+	.loc	2 224 34                        // triton_helpers.py:224:34 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ]
+	div.full.f32 	%r90, %r89, %r74;
+	.loc	2 224 26                        // triton_helpers.py:224:26 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ]
+	add.f32 	%r91, %r63, %r90;
+	.loc	2 225 39                        // triton_helpers.py:225:39 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ]
+	sub.f32 	%r92, %r82, %r91;
+	.loc	2 225 22                        // triton_helpers.py:225:22 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ]
+	fma.rn.f32 	%r93, %r89, %r92, 0f00000000;
+$L__tmp2:
+	.loc	1 48 62                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:48:62
+	selp.f32 	%r94, %r86, 0f00000000, %p1;
+	selp.f32 	%r95, %r91, 0f00000000, %p1;
+	.loc	1 51 52                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:51:52
+	cvt.rn.bf16x2.f32 	%r16, %r82, %r83;
+	.loc	1 38 113                        // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:113
+	mov.b32 	{%rs19, %rs20}, %r11;
+	cvt.f32.bf16 	%r96, %rs19;
+	cvt.f32.bf16 	%r97, %rs20;
+	.loc	1 39 94                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:94
+	mov.b32 	{%rs21, %rs22}, %r13;
+	cvt.f32.bf16 	%r98, %rs21;
+	cvt.f32.bf16 	%r99, %rs22;
+	.loc	1 40 113                        // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:113
+	mov.b32 	{%rs23, %rs24}, %r15;
+	cvt.f32.bf16 	%r100, %rs23;
+	cvt.f32.bf16 	%r101, %rs24;
+	.loc	1 42 22                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:42:22
+	fma.rn.f32 	%r102, %r99, %r101, %r97;
+	fma.rn.f32 	%r103, %r98, %r100, %r96;
+$L__tmp3:
+	.loc	2 222 24                        // triton_helpers.py:222:24 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ]
+	sub.f32 	%r104, %r103, %r72;
+	.loc	2 224 34                        // triton_helpers.py:224:34 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ]
+	div.full.f32 	%r105, %r104, %r74;
+	.loc	2 224 26                        // triton_helpers.py:224:26 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ]
+	add.f32 	%r106, %r72, %r105;
+	.loc	2 225 39                        // triton_helpers.py:225:39 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ]
+	sub.f32 	%r107, %r103, %r106;
+	.loc	2 225 22                        // triton_helpers.py:225:22 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ]
+	fma.rn.f32 	%r108, %r104, %r107, 0f00000000;
+	.loc	2 222 24                        // triton_helpers.py:222:24 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ]
+	sub.f32 	%r109, %r102, %r73;
+	.loc	2 224 34                        // triton_helpers.py:224:34 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ]
+	div.full.f32 	%r110, %r109, %r74;
+	.loc	2 224 26                        // triton_helpers.py:224:26 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ]
+	add.f32 	%r111, %r73, %r110;
+	.loc	2 225 39                        // triton_helpers.py:225:39 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ]
+	sub.f32 	%r112, %r102, %r111;
+	.loc	2 225 22                        // triton_helpers.py:225:22 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:46:51 ]
+	fma.rn.f32 	%r113, %r109, %r112, 0f00000000;
+$L__tmp4:
+	.loc	1 48 62                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:48:62
+	selp.f32 	%r114, %r106, 0f00000000, %p1;
+	selp.f32 	%r115, %r111, 0f00000000, %p1;
+	.loc	1 49 58                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:49:58
+	selp.f32 	%r116, %r108, 0f00000000, %p1;
+	selp.f32 	%r117, %r113, 0f00000000, %p1;
+	.loc	1 51 52                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:51:52
+	cvt.rn.bf16x2.f32 	%r17, %r102, %r103;
+	// begin inline asm
+	@%p1 st.global.v2.b32 [ %rd14 + 0 ], { %r16, %r17 };
+	// end inline asm
+	.loc	1 26 37                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:26:37
+	and.b32 	%r118, %r50, 511;
+	and.b32 	%r119, %r50, 31;
+$L__tmp5:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r120, %r95, %r94;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r121, 0f40800000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p6, %r121, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r122, %r75, %r121;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r123, 0f00000000, %r122, %p6;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r124, %r120, %r123, %r94;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r125, %r88, %r93;
+	selp.f32 	%r126, %r125, 0f00000000, %p1;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r127, %r120, %r120;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r128, %r127, %r75;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r129, %r128, %r123, %r126;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r130, %r114, %r124;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r131, 0f40C00000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p7, %r131, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r132, %r75, %r131;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r133, 0f00000000, %r132, %p7;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r134, %r133, %r130, %r124;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r135, %r116, %r129;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r136, %r130, %r130;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r137, %r121, %r136;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r138, %r133, %r137, %r135;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r139, %r115, %r134;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r140, 0f41000000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p8, %r140, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r141, %r75, %r140;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r142, 0f00000000, %r141, %p8;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r143, %r142, %r139, %r134;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r144, %r117, %r138;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r145, %r139, %r139;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r146, %r131, %r145;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r147, %r142, %r146, %r144;
+$L__tmp6:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ]
+	shfl.sync.bfly.b32 	%r148, %r143, 16, 31, -1;
+	shfl.sync.bfly.b32 	%r149, %r147, 16, 31, -1;
+	shfl.sync.bfly.b32 	%r150, %r140, 16, 31, -1;
+$L__tmp7:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r151, %r148, %r143;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r152, %r140, %r150;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p9, %r152, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r153, %r150, %r152;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r154, 0f00000000, %r153, %p9;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r155, %r154, %r151, %r143;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r156, %r147, %r149;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r157, %r151, %r151;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r158, %r140, %r157;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r159, %r154, %r158, %r156;
+$L__tmp8:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ]
+	shfl.sync.bfly.b32 	%r160, %r155, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r161, %r159, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r162, %r152, 8, 31, -1;
+$L__tmp9:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r163, %r160, %r155;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r164, %r152, %r162;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p10, %r164, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r165, %r162, %r164;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r166, 0f00000000, %r165, %p10;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r167, %r163, %r166, %r155;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r168, %r159, %r161;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r169, %r163, %r163;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r170, %r152, %r169;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r171, %r166, %r170, %r168;
+$L__tmp10:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ]
+	shfl.sync.bfly.b32 	%r172, %r167, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r173, %r171, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r174, %r164, 4, 31, -1;
+$L__tmp11:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r175, %r172, %r167;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r176, %r164, %r174;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p11, %r176, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r177, %r174, %r176;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r178, 0f00000000, %r177, %p11;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r179, %r175, %r178, %r167;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r180, %r171, %r173;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r181, %r175, %r175;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r182, %r164, %r181;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r183, %r178, %r182, %r180;
+$L__tmp12:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ]
+	shfl.sync.bfly.b32 	%r184, %r179, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r185, %r183, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r186, %r176, 2, 31, -1;
+$L__tmp13:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r187, %r184, %r179;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r188, %r176, %r186;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p12, %r188, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r189, %r186, %r188;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r190, 0f00000000, %r189, %p12;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r191, %r187, %r190, %r179;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r192, %r183, %r185;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r193, %r187, %r187;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r194, %r176, %r193;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r195, %r190, %r194, %r192;
+$L__tmp14:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ]
+	shfl.sync.bfly.b32 	%r196, %r191, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r197, %r195, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r198, %r188, 1, 31, -1;
+$L__tmp15:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r199, %r196, %r191;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r23, %r188, %r198;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p13, %r23, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r200, %r198, %r23;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r201, 0f00000000, %r200, %p13;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r19, %r199, %r201, %r191;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r202, %r195, %r197;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r203, %r199, %r199;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r204, %r188, %r203;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r21, %r201, %r204, %r202;
+$L__tmp16:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ]
+	setp.eq.b32 	%p3, %r119, 0;
+	shr.u32 	%r205, %r50, 3;
+	and.b32 	%r206, %r205, 60;
+	mov.b32 	%r207, global_smem;
+	add.s32 	%r18, %r207, %r206;
+	// begin inline asm
+	@%p3 st.shared.b32 [ %r18 + 0 ], %r19;
+	// end inline asm
+	add.s32 	%r20, %r18, 64;
+	// begin inline asm
+	@%p3 st.shared.b32 [ %r20 + 0 ], %r21;
+	// end inline asm
+	add.s32 	%r22, %r18, 128;
+	// begin inline asm
+	@%p3 st.shared.b32 [ %r22 + 0 ], %r23;
+	// end inline asm
+	bar.sync 	0;
+	setp.lt.u32 	%p4, %r118, 16;
+	shl.b32 	%r208, %r118, 2;
+	add.s32 	%r25, %r207, %r208;
+	// begin inline asm
+	@%p4 ld.shared.b32 %r24, [ %r25 + 0 ];
+	// end inline asm
+	add.s32 	%r27, %r25, 64;
+	// begin inline asm
+	@%p4 ld.shared.b32 %r26, [ %r27 + 0 ];
+	// end inline asm
+	add.s32 	%r29, %r25, 128;
+	// begin inline asm
+	@%p4 ld.shared.b32 %r28, [ %r29 + 0 ];
+	// end inline asm
+	shfl.sync.bfly.b32 	%r209, %r24, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r210, %r26, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r211, %r28, 8, 31, -1;
+$L__tmp17:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r212, %r209, %r24;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r213, %r28, %r211;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p14, %r213, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r214, %r211, %r213;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r215, 0f00000000, %r214, %p14;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r216, %r212, %r215, %r24;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r217, %r26, %r210;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r218, %r212, %r212;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r219, %r218, %r28;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r220, %r219, %r215, %r217;
+$L__tmp18:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ]
+	shfl.sync.bfly.b32 	%r221, %r216, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r222, %r220, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r223, %r213, 4, 31, -1;
+$L__tmp19:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r224, %r221, %r216;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r225, %r213, %r223;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p15, %r225, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r226, %r223, %r225;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r227, 0f00000000, %r226, %p15;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r228, %r224, %r227, %r216;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r229, %r220, %r222;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r230, %r224, %r224;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r231, %r213, %r230;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r232, %r227, %r231, %r229;
+$L__tmp20:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ]
+	shfl.sync.bfly.b32 	%r233, %r228, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r234, %r232, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r235, %r225, 2, 31, -1;
+$L__tmp21:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r236, %r233, %r228;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r237, %r225, %r235;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p16, %r237, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r238, %r235, %r237;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r239, 0f00000000, %r238, %p16;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r240, %r236, %r239, %r228;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r241, %r232, %r234;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r242, %r236, %r236;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r243, %r225, %r242;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r244, %r239, %r243, %r241;
+$L__tmp22:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ]
+	shfl.sync.bfly.b32 	%r245, %r240, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r246, %r244, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r247, %r237, 1, 31, -1;
+$L__tmp23:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r248, %r245, %r240;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r32, %r237, %r247;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p17, %r32, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r249, %r247, %r32;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r250, 0f00000000, %r249, %p17;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r30, %r248, %r250, %r240;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r251, %r244, %r246;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r252, %r248, %r248;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r253, %r237, %r252;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r31, %r250, %r253, %r251;
+$L__tmp24:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ]
+	and.b32 	%r254, %r50, 15;
+	setp.eq.b32 	%p18, %r254, 0;
+	and.pred 	%p5, %p4, %p18;
+	// begin inline asm
+	@%p5 st.shared.b32 [ %r25 + 0 ], %r30;
+	// end inline asm
+	// begin inline asm
+	@%p5 st.shared.b32 [ %r27 + 0 ], %r31;
+	// end inline asm
+	// begin inline asm
+	@%p5 st.shared.b32 [ %r29 + 0 ], %r32;
+	// end inline asm
+	bar.sync 	0;
+	ld.shared.b32 	%r255, [global_smem];
+	ld.shared.b32 	%r256, [global_smem+64];
+	mov.b32 	%r257, 0f45800000;
+$L__tmp25:
+	.loc	1 68 25                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:68:25
+	div.full.f32 	%r258, %r256, %r257;
+	.loc	1 70 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:70:24
+	add.f32 	%r259, %r258, 0f358637BD;
+	.loc	1 71 32                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:71:32
+	rsqrt.approx.ftz.f32 	%r260, %r259;
+	.loc	1 62 53                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:62:53
+	// begin inline asm
+	mov.u64 %rd15, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd15, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r33, %r3;
+	mov.u32 %r34, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r33, %r34 }, [ %rd7 + 0 ], %rd15;
+	// end inline asm
+	.loc	1 63 35                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:35
+	add.s64 	%rd16, %rd30, %rd38;
+	.loc	1 63 42                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:42
+	// begin inline asm
+	mov.u64 %rd17, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r35, %r3;
+	mov.u32 %r36, %r3;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r35, %r36 }, [ %rd16 + 0 ], %rd17;
+	// end inline asm
+	.loc	1 64 35                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:35
+	add.s64 	%rd18, %rd31, %rd38;
+	.loc	1 64 42                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:42
+	// begin inline asm
+	mov.u64 %rd19, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd19, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r37, %r3;
+	mov.u32 %r38, %r3;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r37, %r38 }, [ %rd18 + 0 ], %rd19;
+	// end inline asm
+	.loc	1 78 29                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:78:29
+	add.s64 	%rd20, %rd33, %rd37;
+	.loc	1 62 115                        // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:62:115
+	mov.b32 	{%rs25, %rs26}, %r33;
+	cvt.f32.bf16 	%r261, %rs26;
+	cvt.f32.bf16 	%r262, %rs25;
+	.loc	1 63 95                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:95
+	mov.b32 	{%rs27, %rs28}, %r35;
+	cvt.f32.bf16 	%r263, %rs27;
+	cvt.f32.bf16 	%r264, %rs28;
+	.loc	1 64 95                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:95
+	mov.b32 	{%rs29, %rs30}, %r37;
+	cvt.f32.bf16 	%r265, %rs30;
+	cvt.f32.bf16 	%r266, %rs29;
+	.loc	1 66 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:66:24
+	sub.f32 	%r267, %r262, %r255;
+	sub.f32 	%r268, %r261, %r255;
+	.loc	1 72 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:72:24
+	mul.f32 	%r269, %r260, %r268;
+	mul.f32 	%r270, %r260, %r267;
+	.loc	1 75 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:75:24
+	add.f32 	%r271, %r264, 0f3F800000;
+	add.f32 	%r272, %r263, 0f3F800000;
+	.loc	1 77 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:77:24
+	fma.rn.f32 	%r273, %r270, %r272, %r266;
+	fma.rn.f32 	%r274, %r269, %r271, %r265;
+	.loc	1 78 53                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:78:53
+	cvt.rn.bf16x2.f32 	%r39, %r274, %r273;
+	.loc	1 62 115                        // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:62:115
+	mov.b32 	{%rs31, %rs32}, %r34;
+	cvt.f32.bf16 	%r275, %rs32;
+	cvt.f32.bf16 	%r276, %rs31;
+	.loc	1 63 95                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:95
+	mov.b32 	{%rs33, %rs34}, %r36;
+	cvt.f32.bf16 	%r277, %rs33;
+	cvt.f32.bf16 	%r278, %rs34;
+	.loc	1 64 95                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:95
+	mov.b32 	{%rs35, %rs36}, %r38;
+	cvt.f32.bf16 	%r279, %rs36;
+	cvt.f32.bf16 	%r280, %rs35;
+	.loc	1 66 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:66:24
+	sub.f32 	%r281, %r276, %r255;
+	sub.f32 	%r282, %r275, %r255;
+	.loc	1 72 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:72:24
+	mul.f32 	%r283, %r260, %r282;
+	mul.f32 	%r284, %r260, %r281;
+	.loc	1 75 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:75:24
+	add.f32 	%r285, %r278, 0f3F800000;
+	add.f32 	%r286, %r277, 0f3F800000;
+	.loc	1 77 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:77:24
+	fma.rn.f32 	%r287, %r284, %r286, %r280;
+	fma.rn.f32 	%r288, %r283, %r285, %r279;
+	.loc	1 78 53                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:78:53
+	cvt.rn.bf16x2.f32 	%r40, %r288, %r287;
+	// begin inline asm
+	@%p1 st.global.v2.b32 [ %rd20 + 0 ], { %r39, %r40 };
+	// end inline asm
+	.loc	1 62 53                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:62:53
+	// begin inline asm
+	mov.u64 %rd21, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd21, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r41, %r3;
+	mov.u32 %r42, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r41, %r42 }, [ %rd14 + 0 ], %rd21;
+	// end inline asm
+	.loc	1 63 35                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:35
+	add.s64 	%rd22, %rd16, 4096;
+	.loc	1 63 42                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:42
+	// begin inline asm
+	mov.u64 %rd23, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd23, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r43, %r3;
+	mov.u32 %r44, %r3;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r43, %r44 }, [ %rd22 + 0 ], %rd23;
+	// end inline asm
+	.loc	1 64 35                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:35
+	add.s64 	%rd24, %rd18, 4096;
+	.loc	1 64 42                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:42
+	// begin inline asm
+	mov.u64 %rd25, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd25, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r45, %r3;
+	mov.u32 %r46, %r3;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r45, %r46 }, [ %rd24 + 0 ], %rd25;
+	// end inline asm
+	.loc	1 78 29                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:78:29
+	add.s64 	%rd26, %rd20, 4096;
+	.loc	1 62 115                        // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:62:115
+	mov.b32 	{%rs37, %rs38}, %r41;
+	cvt.f32.bf16 	%r289, %rs38;
+	cvt.f32.bf16 	%r290, %rs37;
+	.loc	1 63 95                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:95
+	mov.b32 	{%rs39, %rs40}, %r43;
+	cvt.f32.bf16 	%r291, %rs39;
+	cvt.f32.bf16 	%r292, %rs40;
+	.loc	1 64 95                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:95
+	mov.b32 	{%rs41, %rs42}, %r45;
+	cvt.f32.bf16 	%r293, %rs42;
+	cvt.f32.bf16 	%r294, %rs41;
+	.loc	1 66 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:66:24
+	sub.f32 	%r295, %r290, %r255;
+	sub.f32 	%r296, %r289, %r255;
+	.loc	1 72 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:72:24
+	mul.f32 	%r297, %r260, %r296;
+	mul.f32 	%r298, %r260, %r295;
+	.loc	1 75 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:75:24
+	add.f32 	%r299, %r292, 0f3F800000;
+	add.f32 	%r300, %r291, 0f3F800000;
+	.loc	1 77 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:77:24
+	fma.rn.f32 	%r301, %r298, %r300, %r294;
+	fma.rn.f32 	%r302, %r297, %r299, %r293;
+	.loc	1 78 53                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:78:53
+	cvt.rn.bf16x2.f32 	%r47, %r302, %r301;
+	.loc	1 62 115                        // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:62:115
+	mov.b32 	{%rs43, %rs44}, %r42;
+	cvt.f32.bf16 	%r303, %rs44;
+	cvt.f32.bf16 	%r304, %rs43;
+	.loc	1 63 95                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:95
+	mov.b32 	{%rs45, %rs46}, %r44;
+	cvt.f32.bf16 	%r305, %rs45;
+	cvt.f32.bf16 	%r306, %rs46;
+	.loc	1 64 95                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:95
+	mov.b32 	{%rs47, %rs48}, %r46;
+	cvt.f32.bf16 	%r307, %rs48;
+	cvt.f32.bf16 	%r308, %rs47;
+	.loc	1 66 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:66:24
+	sub.f32 	%r309, %r304, %r255;
+	sub.f32 	%r310, %r303, %r255;
+	.loc	1 72 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:72:24
+	mul.f32 	%r311, %r260, %r310;
+	mul.f32 	%r312, %r260, %r309;
+	.loc	1 75 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:75:24
+	add.f32 	%r313, %r306, 0f3F800000;
+	add.f32 	%r314, %r305, 0f3F800000;
+	.loc	1 77 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:77:24
+	fma.rn.f32 	%r315, %r312, %r314, %r308;
+	fma.rn.f32 	%r316, %r311, %r313, %r307;
+	.loc	1 78 53                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:78:53
+	cvt.rn.bf16x2.f32 	%r48, %r316, %r315;
+	// begin inline asm
+	@%p1 st.global.v2.b32 [ %rd26 + 0 ], { %r47, %r48 };
+	// end inline asm
+	.loc	1 56 4                          // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:56:4
+	ret;
+$L__tmp26:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 367                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x168 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 97
+.b8 51
+.b8 109
+.b8 101
+.b8 110
+.b8 108
+.b8 102
+.b8 117
+.b8 108
+.b8 100
+.b8 116
+.b8 104
+.b8 103
+.b8 109
+.b8 110
+.b8 99
+.b8 102
+.b8 112
+.b8 106
+.b8 107
+.b8 52
+.b8 53
+.b8 50
+.b8 120
+.b8 107
+.b8 114
+.b8 111
+.b8 115
+.b8 55
+.b8 105
+.b8 100
+.b8 114
+.b8 109
+.b8 105
+.b8 108
+.b8 54
+.b8 112
+.b8 99
+.b8 111
+.b8 101
+.b8 105
+.b8 103
+.b8 114
+.b8 97
+.b8 121
+.b8 109
+.b8 99
+.b8 103
+.b8 52
+.b8 101
+.b8 54
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 97
+.b8 51
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x2f DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 97
+.b8 100
+.b8 100
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 110
+.b8 97
+.b8 116
+.b8 105
+.b8 118
+.b8 101
+.b8 95
+.b8 108
+.b8 97
+.b8 121
+.b8 101
+.b8 114
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x113:0x5f DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x128:0x18 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp4                           // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 46                                  // DW_AT_call_line
+.b8 51                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x140:0x31 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp5                           // DW_AT_low_pc
+.b64 $L__tmp25                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 52                                  // DW_AT_call_line
+.b8 80                                  // DW_AT_call_column
+.b8 4                                   // Abbrev [4] 0x158:0x18 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp5                           // DW_AT_low_pc
+.b64 $L__tmp24                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 243                                 // DW_AT_call_line
+.b8 46                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.source b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..5e6092ee3295c72d6f32f207fcd27da802f853c6
--- /dev/null
+++ b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.source
@@ -0,0 +1,486 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":18:0)
+#loc88 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":216:0)
+#loc101 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":133:0)
+#loc105 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":242:0)
+#loc107 = loc(unknown)
+#loc110 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":230:0)
+#loc125 = loc("in_ptr0"(#loc))
+#loc126 = loc("in_ptr1"(#loc))
+#loc127 = loc("in_ptr2"(#loc))
+#loc128 = loc("in_ptr3"(#loc))
+#loc129 = loc("in_ptr4"(#loc))
+#loc130 = loc("out_ptr0"(#loc))
+#loc131 = loc("out_ptr3"(#loc))
+#loc132 = loc("xnumel"(#loc))
+#loc133 = loc("r0_numel"(#loc))
+#loc201 = loc("value"(#loc88))
+#loc202 = loc("mean"(#loc88))
+#loc203 = loc("m2"(#loc88))
+#loc204 = loc("weight"(#loc88))
+#loc205 = loc("first_iteration"(#loc88))
+#loc215 = loc("input"(#loc101))
+#loc216 = loc("mean"(#loc105))
+#loc217 = loc("m2"(#loc105))
+#loc218 = loc("weight"(#loc105))
+#loc219 = loc("mean_1"(#loc110))
+#loc220 = loc("m2_1"(#loc110))
+#loc221 = loc("weight_1"(#loc110))
+#loc222 = loc("mean_2"(#loc110))
+#loc223 = loc("m2_2"(#loc110))
+#loc224 = loc("weight_2"(#loc110))
+#loc231 = loc("new_mean"(#loc201))
+module {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 256 : i32 loc(#loc134)
+    %r0_numel_1 = arith.constant 4096 : i32 loc(#loc135)
+    %xoffset = tt.get_program_id x : i32 loc(#loc136)
+    %xoffset_2 = arith.constant 1 : i32 loc(#loc137)
+    %xoffset_3 = arith.constant 1 : i32 loc(#loc137)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc137)
+    %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc138)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc139)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc140)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc140)
+    %xmask = arith.constant dense<256> : tensor<1x1xi32> loc(#loc141)
+    %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc141)
+    %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc142)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc143)
+    %tmp7_mean = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc144)
+    %tmp7_m2 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc145)
+    %tmp7_weight = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc146)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc14)
+    %c2048_i32 = arith.constant 2048 : i32 loc(#loc14)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14)
+    %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc14)
+    %3 = ub.poison : i32 loc(#loc14)
+    %tmp7_weight_10:3 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%tmp7_mean_13 = %tmp7_mean, %tmp7_m2_14 = %tmp7_m2, %tmp7_weight_15 = %tmp7_weight) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc148)
+      %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc148)
+      %r0_mask = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc149)
+      %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x2048xi32> loc(#loc149)
+      %tmp0 = arith.constant 4096 : i32 loc(#loc150)
+      %tmp0_18 = arith.constant 4096 : i32 loc(#loc150)
+      %tmp0_19 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc150)
+      %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc150)
+      %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc151)
+      %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x2048xi32> loc(#loc151)
+      %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc152)
+      %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc152)
+      %tmp0_25 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc153)
+      %tmp0_26 = arith.andi %r0_mask_17, %tmp0_25 : tensor<1x2048xi1> loc(#loc153)
+      %tmp0_27 = arith.constant 0.000000e+00 : f32 loc(#loc154)
+      %tmp0_28 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc154)
+      %tmp0_29 = arith.truncf %tmp0_28 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc154)
+      %tmp0_30 = tt.load %tmp0_24, %tmp0_26, %tmp0_29 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>> loc(#loc154)
+      %tmp0_31 = arith.extf %tmp0_30 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc155)
+      %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc156)
+      %tmp1_32 = tt.addptr %tmp1, %r0_index_16 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc156)
+      %tmp1_33 = arith.constant 0.000000e+00 : f32 loc(#loc157)
+      %tmp1_34 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc157)
+      %tmp1_35 = arith.truncf %tmp1_34 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc157)
+      %tmp1_36 = tt.load %tmp1_32, %r0_mask_17, %tmp1_35 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc157)
+      %tmp1_37 = arith.extf %tmp1_36 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc158)
+      %tmp2 = arith.constant 4096 : i32 loc(#loc159)
+      %tmp2_38 = arith.constant 4096 : i32 loc(#loc159)
+      %tmp2_39 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc159)
+      %tmp2_40 = arith.muli %tmp2_39, %xindex_7 : tensor<1x1xi32> loc(#loc159)
+      %tmp2_41 = tt.broadcast %tmp2_40 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc160)
+      %tmp2_42 = arith.addi %r0_index_16, %tmp2_41 : tensor<1x2048xi32> loc(#loc160)
+      %tmp2_43 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc161)
+      %tmp2_44 = tt.addptr %tmp2_43, %tmp2_42 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc161)
+      %tmp2_45 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc162)
+      %tmp2_46 = arith.andi %r0_mask_17, %tmp2_45 : tensor<1x2048xi1> loc(#loc162)
+      %tmp2_47 = arith.constant 0.000000e+00 : f32 loc(#loc163)
+      %tmp2_48 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc163)
+      %tmp2_49 = arith.truncf %tmp2_48 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc163)
+      %tmp2_50 = tt.load %tmp2_44, %tmp2_46, %tmp2_49 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>> loc(#loc163)
+      %tmp2_51 = arith.extf %tmp2_50 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc164)
+      %tmp3 = arith.mulf %tmp1_37, %tmp2_51 : tensor<1x2048xf32> loc(#loc165)
+      %tmp4 = arith.addf %tmp0_31, %tmp3 : tensor<1x2048xf32> loc(#loc166)
+      %c0_i32_52 = arith.constant 0 : i32 loc(#loc34)
+      %9 = arith.cmpi eq, %r0_offset, %c0_i32_52 : i32 loc(#loc34)
+      %10:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_u1__(%tmp4, %tmp7_mean_13, %tmp7_m2_14, %tmp7_weight_15, %9) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>, i1) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) loc(#loc35)
+      %tmp7_mean_53 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc167)
+      %tmp7_mean_54 = arith.andi %r0_mask_17, %tmp7_mean_53 : tensor<1x2048xi1> loc(#loc167)
+      %tmp7_mean_55 = arith.select %tmp7_mean_54, %10#0, %tmp7_mean_13 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc168)
+      %tmp7_m2_56 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc169)
+      %tmp7_m2_57 = arith.andi %r0_mask_17, %tmp7_m2_56 : tensor<1x2048xi1> loc(#loc169)
+      %tmp7_m2_58 = arith.select %tmp7_m2_57, %10#1, %tmp7_m2_14 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc170)
+      %tmp7_weight_59 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc171)
+      %tmp7_weight_60 = arith.andi %r0_mask_17, %tmp7_weight_59 : tensor<1x2048xi1> loc(#loc171)
+      %tmp7_weight_61 = arith.select %tmp7_weight_60, %10#2, %tmp7_weight_15 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc172)
+      %c4096_i32 = arith.constant 4096 : i32 loc(#loc42)
+      %c4096_i32_62 = arith.constant 4096 : i32 loc(#loc42)
+      %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc42)
+      %11 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc42)
+      %12 = tt.broadcast %11 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc43)
+      %13 = arith.addi %r0_index_16, %12 : tensor<1x2048xi32> loc(#loc43)
+      %14 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc44)
+      %15 = tt.addptr %14, %13 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc44)
+      %16 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc45)
+      %17 = arith.andi %r0_mask_17, %16 : tensor<1x2048xi1> loc(#loc45)
+      %18 = arith.truncf %tmp4 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc46)
+      tt.store %15, %18, %17 : tensor<1x2048x!tt.ptr<bf16>> loc(#loc46)
+      scf.yield %tmp7_mean_55, %tmp7_m2_58, %tmp7_weight_61 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc47)
+    } loc(#loc237)
+    %4:3 = tt.call @"torch._inductor.runtime.triton_helpers.welford__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_1_"(%tmp7_weight_10#0, %tmp7_weight_10#1, %tmp7_weight_10#2) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc48)
+    %tmp7 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc173)
+    %tmp11 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc174)
+    %tmp12 = tt.expand_dims %4#2 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc175)
+    %c0_i32_11 = arith.constant 0 : i32 loc(#loc52)
+    %c2048_i32_12 = arith.constant 2048 : i32 loc(#loc52)
+    %5 = arith.bitcast %c0_i32_11 : i32 to i32 loc(#loc52)
+    %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc52)
+    %7 = arith.bitcast %c2048_i32_12 : i32 to i32 loc(#loc52)
+    %8 = ub.poison : i32 loc(#loc52)
+    scf.for %r0_offset = %5 to %6 step %7  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc176)
+      %r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc176)
+      %r0_mask = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc177)
+      %r0_mask_14 = arith.cmpi slt, %r0_index_13, %r0_mask : tensor<1x2048xi32> loc(#loc177)
+      %tmp13 = arith.constant 4096 : i32 loc(#loc178)
+      %tmp13_15 = arith.constant 4096 : i32 loc(#loc178)
+      %tmp13_16 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc178)
+      %tmp13_17 = arith.muli %tmp13_16, %xindex_7 : tensor<1x1xi32> loc(#loc178)
+      %tmp13_18 = tt.broadcast %tmp13_17 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc179)
+      %tmp13_19 = arith.addi %r0_index_13, %tmp13_18 : tensor<1x2048xi32> loc(#loc179)
+      %tmp13_20 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc180)
+      %tmp13_21 = tt.addptr %tmp13_20, %tmp13_19 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc180)
+      %tmp13_22 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc181)
+      %tmp13_23 = arith.andi %r0_mask_14, %tmp13_22 : tensor<1x2048xi1> loc(#loc181)
+      %tmp13_24 = arith.constant 0.000000e+00 : f32 loc(#loc182)
+      %tmp13_25 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc182)
+      %tmp13_26 = arith.truncf %tmp13_25 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc182)
+      %tmp13_27 = tt.load %tmp13_21, %tmp13_23, %tmp13_26 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>> loc(#loc182)
+      %tmp13_28 = arith.extf %tmp13_27 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc183)
+      %tmp23 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc184)
+      %tmp23_29 = tt.addptr %tmp23, %r0_index_13 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc184)
+      %tmp23_30 = arith.constant 0.000000e+00 : f32 loc(#loc185)
+      %tmp23_31 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc185)
+      %tmp23_32 = arith.truncf %tmp23_31 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc185)
+      %tmp23_33 = tt.load %tmp23_29, %r0_mask_14, %tmp23_32 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc185)
+      %tmp23_34 = arith.extf %tmp23_33 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc186)
+      %tmp27 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc187)
+      %tmp27_35 = tt.addptr %tmp27, %r0_index_13 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc187)
+      %tmp27_36 = arith.constant 0.000000e+00 : f32 loc(#loc188)
+      %tmp27_37 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc188)
+      %tmp27_38 = arith.truncf %tmp27_37 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc188)
+      %tmp27_39 = tt.load %tmp27_35, %r0_mask_14, %tmp27_38 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc188)
+      %tmp27_40 = arith.extf %tmp27_39 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc189)
+      %tmp15 = tt.broadcast %tmp7 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc190)
+      %tmp15_41 = arith.subf %tmp13_28, %tmp15 : tensor<1x2048xf32> loc(#loc190)
+      %tmp16 = arith.constant 4.096000e+03 : f32 loc(#loc191)
+      %tmp17 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc192)
+      %tmp17_42 = arith.divf %tmp11, %tmp17 : tensor<1x1xf32> loc(#loc192)
+      %tmp18 = arith.constant 9.99999997E-7 : f32 loc(#loc193)
+      %tmp19 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc194)
+      %tmp19_43 = arith.addf %tmp17_42, %tmp19 : tensor<1x1xf32> loc(#loc194)
+      %tmp20 = tt.extern_elementwise %tmp19_43 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc195)
+      %tmp21 = tt.broadcast %tmp20 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc196)
+      %tmp21_44 = arith.mulf %tmp15_41, %tmp21 : tensor<1x2048xf32> loc(#loc196)
+      %tmp24 = arith.constant 1.000000e+00 : f32 loc(#loc197)
+      %tmp25 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc198)
+      %tmp25_45 = arith.addf %tmp23_34, %tmp25 : tensor<1x2048xf32> loc(#loc198)
+      %tmp26 = arith.mulf %tmp21_44, %tmp25_45 : tensor<1x2048xf32> loc(#loc199)
+      %tmp28 = arith.addf %tmp26, %tmp27_40 : tensor<1x2048xf32> loc(#loc200)
+      %c4096_i32 = arith.constant 4096 : i32 loc(#loc78)
+      %c4096_i32_46 = arith.constant 4096 : i32 loc(#loc78)
+      %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc78)
+      %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc78)
+      %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc79)
+      %11 = arith.addi %r0_index_13, %10 : tensor<1x2048xi32> loc(#loc79)
+      %12 = tt.splat %out_ptr3 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc80)
+      %13 = tt.addptr %12, %11 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc80)
+      %14 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc81)
+      %15 = arith.andi %r0_mask_14, %14 : tensor<1x2048xi1> loc(#loc81)
+      %16 = arith.truncf %tmp28 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc82)
+      tt.store %13, %16, %15 : tensor<1x2048x!tt.ptr<bf16>> loc(#loc82)
+    } loc(#loc52)
+    tt.return loc(#loc83)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() -> tensor<1x2048xf32> attributes {noinline = false} {
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc85)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc85)
+    tt.return %cst_0 : tensor<1x2048xf32> loc(#loc86)
+  ^bb1:  // no predecessors
+    %0 = ub.poison : tensor<1x2048xf32> loc(#loc87)
+    tt.return %0 : tensor<1x2048xf32> loc(#loc87)
+  } loc(#loc84)
+  tt.func private @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_u1__(%new_mean: tensor<1x2048xf32> loc("new_mean"(#loc201)), %mean: tensor<1x2048xf32> loc("mean"(#loc88)), %m2: tensor<1x2048xf32> loc("m2"(#loc88)), %weight: tensor<1x2048xf32> loc("weight"(#loc88)), %first_iteration: i1 loc("first_iteration"(#loc88))) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) attributes {noinline = false} {
+    %0:3 = scf.if %first_iteration -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) {
+      %new_weight = arith.constant 1.000000e+00 : f32 loc(#loc206)
+      %new_weight_0 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc232)
+      %new_m2 = tt.call @triton.language.standard.zeros_like__fp32S1_2048S__(%m2) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc233)
+      scf.yield %new_m2, %new_mean, %new_weight_0 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc233)
+    } else {
+      %delta = arith.subf %new_mean, %mean : tensor<1x2048xf32> loc(#loc208)
+      %new_weight = arith.constant 1 : i32 loc(#loc209)
+      %new_weight_0 = arith.constant 1.000000e+00 : f32 loc(#loc209)
+      %new_weight_1 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc209)
+      %new_weight_2 = arith.addf %weight, %new_weight_1 : tensor<1x2048xf32> loc(#loc234)
+      %new_mean_3 = arith.divf %delta, %new_weight_2 : tensor<1x2048xf32> loc(#loc210)
+      %new_mean_4 = arith.addf %mean, %new_mean_3 : tensor<1x2048xf32> loc(#loc235)
+      %new_m2 = arith.subf %new_mean, %new_mean_4 : tensor<1x2048xf32> loc(#loc212)
+      %new_m2_5 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32> loc(#loc213)
+      %new_m2_6 = arith.addf %m2, %new_m2_5 : tensor<1x2048xf32> loc(#loc236)
+      scf.yield %new_m2_6, %new_mean_4, %new_weight_2 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc214)
+    } loc(#loc89)
+    tt.return %0#1, %0#0, %0#2 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc99)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1x2048xf32> loc(#loc100)
+    %2 = ub.poison : tensor<1x2048xf32> loc(#loc100)
+    %3 = ub.poison : tensor<1x2048xf32> loc(#loc100)
+    tt.return %1, %2, %3 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc100)
+  } loc(#loc88)
+  tt.func private @triton.language.standard.zeros_like__fp32S1_2048S__(%input: tensor<1x2048xf32> loc("input"(#loc101))) -> tensor<1x2048xf32> attributes {noinline = false} {
+    %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc102)
+    tt.return %0 : tensor<1x2048xf32> loc(#loc103)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1x2048xf32> loc(#loc104)
+    tt.return %1 : tensor<1x2048xf32> loc(#loc104)
+  } loc(#loc101)
+  tt.func private @"torch._inductor.runtime.triton_helpers.welford__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_1_"(%mean: tensor<1x2048xf32> loc("mean"(#loc105)), %m2: tensor<1x2048xf32> loc("m2"(#loc105)), %weight: tensor<1x2048xf32> loc("weight"(#loc105))) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} {
+    %0:3 = "tt.reduce"(%mean, %m2, %weight) <{axis = 1 : i32}> ({
+    ^bb0(%arg3: f32 loc(unknown), %arg4: f32 loc(unknown), %arg5: f32 loc(unknown), %arg6: f32 loc(unknown), %arg7: f32 loc(unknown), %arg8: f32 loc(unknown)):
+      %4:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (f32, f32, f32, f32, f32, f32) -> (f32, f32, f32) loc(#loc106)
+      tt.reduce.return %4#0, %4#1, %4#2 : f32, f32, f32 loc(#loc106)
+    }) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc106)
+    tt.return %0#0, %0#1, %0#2 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc108)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1xf32> loc(#loc109)
+    %2 = ub.poison : tensor<1xf32> loc(#loc109)
+    %3 = ub.poison : tensor<1xf32> loc(#loc109)
+    tt.return %1, %2, %3 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc109)
+  } loc(#loc105)
+  tt.func private @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%mean_1: f32 loc("mean_1"(#loc110)), %m2_1: f32 loc("m2_1"(#loc110)), %weight_1: f32 loc("weight_1"(#loc110)), %mean_2: f32 loc("mean_2"(#loc110)), %m2_2: f32 loc("m2_2"(#loc110)), %weight_2: f32 loc("weight_2"(#loc110))) -> (f32, f32, f32) attributes {noinline = false} {
+    %delta = arith.subf %mean_2, %mean_1 : f32 loc(#loc225)
+    %new_weight = arith.addf %weight_1, %weight_2 : f32 loc(#loc226)
+    %w2_over_w = arith.constant 0.000000e+00 : f32 loc(#loc227)
+    %w2_over_w_0 = arith.cmpf oeq, %new_weight, %w2_over_w : f32 loc(#loc227)
+    %w2_over_w_1 = arith.divf %weight_2, %new_weight : f32 loc(#loc228)
+    %w2_over_w_2 = arith.constant 0.000000e+00 : f32 loc(#loc229)
+    %w2_over_w_3 = arith.constant 0.000000e+00 : f32 loc(#loc229)
+    %w2_over_w_4 = arith.select %w2_over_w_0, %w2_over_w_3, %w2_over_w_1 : f32 loc(#loc229)
+    %0 = arith.mulf %delta, %w2_over_w_4 : f32 loc(#loc116)
+    %1 = arith.addf %mean_1, %0 : f32 loc(#loc117)
+    %2 = arith.addf %m2_1, %m2_2 : f32 loc(#loc118)
+    %3 = arith.mulf %delta, %delta : f32 loc(#loc119)
+    %4 = arith.mulf %3, %weight_1 : f32 loc(#loc120)
+    %5 = arith.mulf %4, %w2_over_w_4 : f32 loc(#loc121)
+    %6 = arith.addf %2, %5 : f32 loc(#loc122)
+    tt.return %1, %6, %new_weight : f32, f32, f32 loc(#loc123)
+  ^bb1:  // no predecessors
+    %7 = ub.poison : f32 loc(#loc124)
+    %8 = ub.poison : f32 loc(#loc124)
+    %9 = ub.poison : f32 loc(#loc124)
+    tt.return %7, %8, %9 : f32, f32, f32 loc(#loc124)
+  } loc(#loc110)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":25:21)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":29:45)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":30:43)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":31:47)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:46)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:34)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:61)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:51)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:113)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:34)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:41)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:94)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:46)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:41)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:34)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:61)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:51)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:113)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":41:22)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":42:22)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":46:62)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":46:51)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":48:39)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":48:62)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":49:37)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":49:58)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":50:41)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":50:66)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:41)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:36)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:29)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:62)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:52)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:8)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":52:80)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":53:16)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":54:17)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":55:18)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":56:43)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":57:31)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":58:29)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:48)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:43)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:36)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:63)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:53)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:115)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:35)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:42)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:95)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:35)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:42)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:95)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":66:24)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":67:16)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":68:25)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":69:16)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":70:24)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":71:32)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":72:24)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":74:16)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":75:24)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":76:24)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":77:24)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:41)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:36)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:29)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:63)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:53)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":56:4)
+#loc84 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":120:0)
+#loc85 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:31)
+#loc86 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:11)
+#loc87 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:4)
+#loc89 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7)
+#loc90 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":218:46)
+#loc91 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31)
+#loc92 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24)
+#loc93 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30)
+#loc94 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34)
+#loc95 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26)
+#loc96 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39)
+#loc97 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31)
+#loc98 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22)
+#loc99 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:11)
+#loc100 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:4)
+#loc102 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:30)
+#loc103 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:11)
+#loc104 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:4)
+#loc106 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc108 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:11)
+#loc109 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:4)
+#loc111 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc112 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc113 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc114 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc115 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc116 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc117 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc118 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc119 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc120 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc121 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc122 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc123 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:11)
+#loc124 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:4)
+#loc134 = loc("xnumel"(#loc1))
+#loc135 = loc("r0_numel"(#loc2))
+#loc136 = loc("xoffset"(#loc3))
+#loc137 = loc("xoffset"(#loc4))
+#loc138 = loc("xindex"(#loc5))
+#loc139 = loc("xindex"(#loc6))
+#loc140 = loc("xindex"(#loc7))
+#loc141 = loc("xmask"(#loc8))
+#loc142 = loc("r0_base"(#loc9))
+#loc143 = loc("r0_base"(#loc10))
+#loc144 = loc("tmp7_mean"(#loc11))
+#loc145 = loc("tmp7_m2"(#loc12))
+#loc146 = loc("tmp7_weight"(#loc13))
+#loc147 = loc("tmp7_mean"(#loc14))
+#loc148 = loc("r0_index"(#loc15))
+#loc149 = loc("r0_mask"(#loc16))
+#loc150 = loc("tmp0"(#loc17))
+#loc151 = loc("tmp0"(#loc18))
+#loc152 = loc("tmp0"(#loc19))
+#loc153 = loc("tmp0"(#loc20))
+#loc154 = loc("tmp0"(#loc21))
+#loc155 = loc("tmp0"(#loc22))
+#loc156 = loc("tmp1"(#loc23))
+#loc157 = loc("tmp1"(#loc24))
+#loc158 = loc("tmp1"(#loc25))
+#loc159 = loc("tmp2"(#loc26))
+#loc160 = loc("tmp2"(#loc27))
+#loc161 = loc("tmp2"(#loc28))
+#loc162 = loc("tmp2"(#loc29))
+#loc163 = loc("tmp2"(#loc30))
+#loc164 = loc("tmp2"(#loc31))
+#loc165 = loc("tmp3"(#loc32))
+#loc166 = loc("tmp4"(#loc33))
+#loc167 = loc("tmp7_mean"(#loc36))
+#loc168 = loc("tmp7_mean"(#loc37))
+#loc169 = loc("tmp7_m2"(#loc38))
+#loc170 = loc("tmp7_m2"(#loc39))
+#loc171 = loc("tmp7_weight"(#loc40))
+#loc172 = loc("tmp7_weight"(#loc41))
+#loc173 = loc("tmp7"(#loc49))
+#loc174 = loc("tmp11"(#loc50))
+#loc175 = loc("tmp12"(#loc51))
+#loc176 = loc("r0_index"(#loc53))
+#loc177 = loc("r0_mask"(#loc54))
+#loc178 = loc("tmp13"(#loc55))
+#loc179 = loc("tmp13"(#loc56))
+#loc180 = loc("tmp13"(#loc57))
+#loc181 = loc("tmp13"(#loc58))
+#loc182 = loc("tmp13"(#loc59))
+#loc183 = loc("tmp13"(#loc60))
+#loc184 = loc("tmp23"(#loc61))
+#loc185 = loc("tmp23"(#loc62))
+#loc186 = loc("tmp23"(#loc63))
+#loc187 = loc("tmp27"(#loc64))
+#loc188 = loc("tmp27"(#loc65))
+#loc189 = loc("tmp27"(#loc66))
+#loc190 = loc("tmp15"(#loc67))
+#loc191 = loc("tmp16"(#loc68))
+#loc192 = loc("tmp17"(#loc69))
+#loc193 = loc("tmp18"(#loc70))
+#loc194 = loc("tmp19"(#loc71))
+#loc195 = loc("tmp20"(#loc72))
+#loc196 = loc("tmp21"(#loc73))
+#loc197 = loc("tmp24"(#loc74))
+#loc198 = loc("tmp25"(#loc75))
+#loc199 = loc("tmp26"(#loc76))
+#loc200 = loc("tmp28"(#loc77))
+#loc206 = loc("new_weight"(#loc90))
+#loc207 = loc("new_m2"(#loc91))
+#loc208 = loc("delta"(#loc92))
+#loc209 = loc("new_weight"(#loc93))
+#loc210 = loc("new_mean"(#loc94))
+#loc211 = loc("new_mean"(#loc95))
+#loc212 = loc("new_m2"(#loc96))
+#loc213 = loc("new_m2"(#loc97))
+#loc214 = loc("new_m2"(#loc98))
+#loc225 = loc("delta"(#loc111))
+#loc226 = loc("new_weight"(#loc112))
+#loc227 = loc("w2_over_w"(#loc113))
+#loc228 = loc("w2_over_w"(#loc114))
+#loc229 = loc("w2_over_w"(#loc115))
+#loc230 = loc("tmp7_m2"(#loc147))
+#loc232 = loc("new_weight"(#loc206))
+#loc233 = loc("new_m2"(#loc207))
+#loc234 = loc("new_weight"(#loc209))
+#loc235 = loc("new_mean"(#loc211))
+#loc236 = loc("new_m2"(#loc214))
+#loc237 = loc("tmp7_weight"(#loc230))
diff --git a/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..2010544834ba256a5641bc71f4f9fb3c597503d9
--- /dev/null
+++ b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir
@@ -0,0 +1,296 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":18:0)
+#loc1 = loc(unknown)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":52:80)
+#loc80 = loc("in_ptr0"(#loc))
+#loc81 = loc("in_ptr1"(#loc))
+#loc82 = loc("in_ptr2"(#loc))
+#loc83 = loc("in_ptr3"(#loc))
+#loc84 = loc("in_ptr4"(#loc))
+#loc85 = loc("out_ptr0"(#loc))
+#loc86 = loc("out_ptr3"(#loc))
+#loc87 = loc("xnumel"(#loc))
+#loc88 = loc("r0_numel"(#loc))
+#loc122 = loc(callsite(#loc1 at #loc40))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4096> : tensor<1x2048xi32, #blocked> loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked> loc(#loc1)
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
+    %c2048_i32 = arith.constant 2048 : i32 loc(#loc1)
+    %c256_i32 = arith.constant 256 : i32 loc(#loc1)
+    %cst_1 = arith.constant 0.000000e+00 : f32 loc(#loc1)
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32, #blocked> loc(#loc1)
+    %cst_5 = arith.constant dense<4.096000e+03> : tensor<1x1xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc89)
+    %xmask = arith.cmpi slt, %xoffset, %c256_i32 : i32 loc(#loc90)
+    %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc91)
+    %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc91)
+    %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc92)
+    %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc151)
+    %tmp0_8 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc94)
+    %tmp0_9 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc152)
+    %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc96)
+    %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc97)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc11)
+    %tmp7_weight:3 = scf.for %tmp7_weight_10 = %c0_i32 to %c4096_i32 step %c2048_i32 iter_args(%arg10 = %cst_2, %arg11 = %cst_2, %arg12 = %cst_2) -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>)  : i32 {
+      %r0_index = tt.splat %tmp7_weight_10 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc99)
+      %r0_index_11 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32, #blocked> loc(#loc99)
+      %r0_mask = arith.cmpi slt, %r0_index_11, %cst : tensor<1x2048xi32, #blocked> loc(#loc100)
+      %tmp0_12 = arith.addi %r0_index_11, %tmp0_7 : tensor<1x2048xi32, #blocked> loc(#loc93)
+      %tmp0_13 = tt.addptr %tmp0_8, %tmp0_12 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc94)
+      %tmp0_14 = arith.andi %r0_mask, %tmp0_9 : tensor<1x2048xi1, #blocked> loc(#loc95)
+      %tmp0_15 = tt.load %tmp0_13, %tmp0_14, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc101)
+      %tmp0_16 = arith.extf %tmp0_15 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc102)
+      %tmp1_17 = tt.addptr %tmp1, %r0_index_11 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc96)
+      %tmp1_18 = tt.load %tmp1_17, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc103)
+      %tmp1_19 = arith.extf %tmp1_18 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc104)
+      %tmp2_20 = tt.addptr %tmp2, %tmp0_12 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc97)
+      %tmp2_21 = tt.load %tmp2_20, %tmp0_14, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc105)
+      %tmp2_22 = arith.extf %tmp2_21 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc106)
+      %tmp3 = arith.mulf %tmp1_19, %tmp2_22 : tensor<1x2048xf32, #blocked> loc(#loc107)
+      %tmp4 = arith.addf %tmp0_16, %tmp3 : tensor<1x2048xf32, #blocked> loc(#loc108)
+      %3 = arith.cmpi eq, %tmp7_weight_10, %c0_i32 : i32 loc(#loc23)
+      %4:3 = scf.if %3 -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) {
+        scf.yield %cst_2, %tmp4, %cst_3 : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc176)
+      } else {
+        %delta = arith.subf %tmp4, %arg10 : tensor<1x2048xf32, #blocked> loc(#loc155)
+        %new_weight = arith.addf %arg12, %cst_3 : tensor<1x2048xf32, #blocked> loc(#loc177)
+        %new_mean = arith.divf %delta, %new_weight : tensor<1x2048xf32, #blocked> loc(#loc157)
+        %new_mean_24 = arith.addf %arg10, %new_mean : tensor<1x2048xf32, #blocked> loc(#loc178)
+        %new_m2 = arith.subf %tmp4, %new_mean_24 : tensor<1x2048xf32, #blocked> loc(#loc159)
+        %new_m2_25 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32, #blocked> loc(#loc160)
+        %new_m2_26 = arith.addf %arg11, %new_m2_25 : tensor<1x2048xf32, #blocked> loc(#loc179)
+        scf.yield %new_m2_26, %new_mean_24, %new_weight : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc162)
+      } loc(#loc109)
+      %tmp7_mean = arith.select %tmp0_14, %4#1, %arg10 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc118)
+      %tmp7_m2 = arith.select %tmp0_14, %4#0, %arg11 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc119)
+      %tmp7_weight_23 = arith.select %tmp0_14, %4#2, %arg12 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc120)
+      %5 = tt.addptr %0, %tmp0_12 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc11)
+      %6 = arith.truncf %tmp4 : tensor<1x2048xf32, #blocked> to tensor<1x2048xbf16, #blocked> loc(#loc37)
+      tt.store %5, %6, %tmp0_14 : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc37)
+      scf.yield %tmp7_mean, %tmp7_m2, %tmp7_weight_23 : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc38)
+    } loc(#loc175)
+    %1:3 = "tt.reduce"(%tmp7_weight#0, %tmp7_weight#1, %tmp7_weight#2) <{axis = 1 : i32}> ({
+    ^bb0(%arg9: f32 loc(callsite(#loc1 at #loc40)), %arg10: f32 loc(callsite(#loc1 at #loc40)), %arg11: f32 loc(callsite(#loc1 at #loc40)), %arg12: f32 loc(callsite(#loc1 at #loc40)), %arg13: f32 loc(callsite(#loc1 at #loc40)), %arg14: f32 loc(callsite(#loc1 at #loc40))):
+      %delta = arith.subf %arg12, %arg9 : f32 loc(#loc163)
+      %new_weight = arith.addf %arg11, %arg14 : f32 loc(#loc164)
+      %w2_over_w = arith.cmpf oeq, %new_weight, %cst_1 : f32 loc(#loc165)
+      %w2_over_w_10 = arith.divf %arg14, %new_weight : f32 loc(#loc166)
+      %w2_over_w_11 = arith.select %w2_over_w, %cst_1, %w2_over_w_10 : f32 loc(#loc167)
+      %3 = arith.mulf %delta, %w2_over_w_11 : f32 loc(#loc168)
+      %4 = arith.addf %arg9, %3 : f32 loc(#loc169)
+      %5 = arith.addf %arg10, %arg13 : f32 loc(#loc170)
+      %6 = arith.mulf %delta, %delta : f32 loc(#loc171)
+      %7 = arith.mulf %6, %arg11 : f32 loc(#loc172)
+      %8 = arith.mulf %7, %w2_over_w_11 : f32 loc(#loc173)
+      %9 = arith.addf %5, %8 : f32 loc(#loc174)
+      tt.reduce.return %4, %9, %new_weight : f32, f32, f32 loc(#loc121)
+    }) : (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc121)
+    %tmp7 = tt.expand_dims %1#0 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc128)
+    %tmp11 = tt.expand_dims %1#1 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc129)
+    %tmp23 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc130)
+    %tmp27 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc131)
+    %tmp15 = tt.broadcast %tmp7 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc132)
+    %tmp17 = arith.divf %tmp11, %cst_5 : tensor<1x1xf32, #blocked> loc(#loc133)
+    %tmp19 = arith.addf %tmp17, %cst_4 : tensor<1x1xf32, #blocked> loc(#loc134)
+    %tmp20 = tt.extern_elementwise %tmp19 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked>) -> tensor<1x1xf32, #blocked> loc(#loc135)
+    %tmp21 = tt.broadcast %tmp20 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc136)
+    %2 = tt.splat %out_ptr3 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc62)
+    scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32, #blocked> loc(#loc137)
+      %r0_index_10 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32, #blocked> loc(#loc137)
+      %r0_mask = arith.cmpi slt, %r0_index_10, %cst : tensor<1x2048xi32, #blocked> loc(#loc138)
+      %tmp13 = arith.addi %r0_index_10, %tmp0_7 : tensor<1x2048xi32, #blocked> loc(#loc139)
+      %tmp13_11 = tt.addptr %0, %tmp13 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc140)
+      %tmp13_12 = arith.andi %r0_mask, %tmp0_9 : tensor<1x2048xi1, #blocked> loc(#loc141)
+      %tmp13_13 = tt.load %tmp13_11, %tmp13_12, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc142)
+      %tmp13_14 = arith.extf %tmp13_13 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc143)
+      %tmp23_15 = tt.addptr %tmp23, %r0_index_10 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc130)
+      %tmp23_16 = tt.load %tmp23_15, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc144)
+      %tmp23_17 = arith.extf %tmp23_16 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc145)
+      %tmp27_18 = tt.addptr %tmp27, %r0_index_10 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc131)
+      %tmp27_19 = tt.load %tmp27_18, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc146)
+      %tmp27_20 = arith.extf %tmp27_19 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc147)
+      %tmp15_21 = arith.subf %tmp13_14, %tmp15 : tensor<1x2048xf32, #blocked> loc(#loc132)
+      %tmp21_22 = arith.mulf %tmp15_21, %tmp21 : tensor<1x2048xf32, #blocked> loc(#loc136)
+      %tmp25 = arith.addf %tmp23_17, %cst_3 : tensor<1x2048xf32, #blocked> loc(#loc148)
+      %tmp26 = arith.mulf %tmp21_22, %tmp25 : tensor<1x2048xf32, #blocked> loc(#loc149)
+      %tmp28 = arith.addf %tmp26, %tmp27_20 : tensor<1x2048xf32, #blocked> loc(#loc150)
+      %3 = tt.addptr %2, %tmp13 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc62)
+      %4 = arith.truncf %tmp28 : tensor<1x2048xf32, #blocked> to tensor<1x2048xbf16, #blocked> loc(#loc78)
+      tt.store %3, %4, %tmp13_12 : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc78)
+    } loc(#loc63)
+    tt.return loc(#loc79)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":25:21)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":26:37)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:46)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:41)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:34)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:61)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:34)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:34)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:29)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":32:43)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":33:31)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":34:29)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:51)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:113)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:41)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:94)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:51)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:113)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":41:22)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":42:22)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":46:62)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":46:51)
+#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31)
+#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24)
+#loc28 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30)
+#loc29 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34)
+#loc30 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26)
+#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39)
+#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":48:62)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":49:58)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":50:66)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:52)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:8)
+#loc39 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc41 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc42 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc43 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc44 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc45 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc46 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc47 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc48 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc49 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc50 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc51 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc52 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":53:16)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":54:17)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:35)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:35)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":66:24)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":68:25)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":70:24)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":71:32)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":72:24)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:29)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":56:43)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":57:31)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":58:29)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:43)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:36)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:63)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:53)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:115)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:42)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:95)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:42)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:95)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":75:24)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":76:24)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":77:24)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:53)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":56:4)
+#loc89 = loc("xoffset"(#loc2))
+#loc90 = loc("xmask"(#loc3))
+#loc91 = loc("r0_base"(#loc4))
+#loc92 = loc("tmp0"(#loc5))
+#loc93 = loc("tmp0"(#loc6))
+#loc94 = loc("tmp0"(#loc7))
+#loc95 = loc("tmp0"(#loc8))
+#loc96 = loc("tmp1"(#loc9))
+#loc97 = loc("tmp2"(#loc10))
+#loc98 = loc("tmp7_mean"(#loc12))
+#loc99 = loc("r0_index"(#loc13))
+#loc100 = loc("r0_mask"(#loc14))
+#loc101 = loc("tmp0"(#loc15))
+#loc102 = loc("tmp0"(#loc16))
+#loc103 = loc("tmp1"(#loc17))
+#loc104 = loc("tmp1"(#loc18))
+#loc105 = loc("tmp2"(#loc19))
+#loc106 = loc("tmp2"(#loc20))
+#loc107 = loc("tmp3"(#loc21))
+#loc108 = loc("tmp4"(#loc22))
+#loc109 = loc(callsite(#loc24 at #loc25))
+#loc110 = loc("new_m2"(#loc26))
+#loc111 = loc("delta"(#loc27))
+#loc112 = loc("new_weight"(#loc28))
+#loc113 = loc("new_mean"(#loc29))
+#loc114 = loc("new_mean"(#loc30))
+#loc115 = loc("new_m2"(#loc31))
+#loc116 = loc("new_m2"(#loc32))
+#loc117 = loc("new_m2"(#loc33))
+#loc118 = loc("tmp7_mean"(#loc34))
+#loc119 = loc("tmp7_m2"(#loc35))
+#loc120 = loc("tmp7_weight"(#loc36))
+#loc121 = loc(callsite(#loc39 at #loc40))
+#loc123 = loc("delta"(#loc41))
+#loc124 = loc("new_weight"(#loc42))
+#loc125 = loc("w2_over_w"(#loc43))
+#loc126 = loc("w2_over_w"(#loc44))
+#loc127 = loc("w2_over_w"(#loc45))
+#loc128 = loc("tmp7"(#loc53))
+#loc129 = loc("tmp11"(#loc54))
+#loc130 = loc("tmp23"(#loc55))
+#loc131 = loc("tmp27"(#loc56))
+#loc132 = loc("tmp15"(#loc57))
+#loc133 = loc("tmp17"(#loc58))
+#loc134 = loc("tmp19"(#loc59))
+#loc135 = loc("tmp20"(#loc60))
+#loc136 = loc("tmp21"(#loc61))
+#loc137 = loc("r0_index"(#loc64))
+#loc138 = loc("r0_mask"(#loc65))
+#loc139 = loc("tmp13"(#loc66))
+#loc140 = loc("tmp13"(#loc67))
+#loc141 = loc("tmp13"(#loc68))
+#loc142 = loc("tmp13"(#loc69))
+#loc143 = loc("tmp13"(#loc70))
+#loc144 = loc("tmp23"(#loc71))
+#loc145 = loc("tmp23"(#loc72))
+#loc146 = loc("tmp27"(#loc73))
+#loc147 = loc("tmp27"(#loc74))
+#loc148 = loc("tmp25"(#loc75))
+#loc149 = loc("tmp26"(#loc76))
+#loc150 = loc("tmp28"(#loc77))
+#loc151 = loc(fused[#loc93, #loc92])
+#loc152 = loc(fused[#loc95, #loc90])
+#loc153 = loc("tmp7_m2"(#loc98))
+#loc154 = loc("new_m2"(#loc110))
+#loc155 = loc(callsite(#loc111 at #loc25))
+#loc156 = loc("new_weight"(#loc112))
+#loc157 = loc(callsite(#loc113 at #loc25))
+#loc158 = loc("new_mean"(#loc114))
+#loc159 = loc(callsite(#loc115 at #loc25))
+#loc160 = loc(callsite(#loc116 at #loc25))
+#loc161 = loc("new_m2"(#loc117))
+#loc162 = loc(callsite(#loc117 at #loc25))
+#loc163 = loc(callsite(#loc123 at #loc121))
+#loc164 = loc(callsite(#loc124 at #loc121))
+#loc165 = loc(callsite(#loc125 at #loc121))
+#loc166 = loc(callsite(#loc126 at #loc121))
+#loc167 = loc(callsite(#loc127 at #loc121))
+#loc168 = loc(callsite(#loc46 at #loc121))
+#loc169 = loc(callsite(#loc47 at #loc121))
+#loc170 = loc(callsite(#loc48 at #loc121))
+#loc171 = loc(callsite(#loc49 at #loc121))
+#loc172 = loc(callsite(#loc50 at #loc121))
+#loc173 = loc(callsite(#loc51 at #loc121))
+#loc174 = loc(callsite(#loc52 at #loc121))
+#loc175 = loc("tmp7_weight"(#loc153))
+#loc176 = loc(callsite(#loc154 at #loc25))
+#loc177 = loc(callsite(#loc156 at #loc25))
+#loc178 = loc(callsite(#loc158 at #loc25))
+#loc179 = loc(callsite(#loc161 at #loc25))
diff --git a/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.ttir b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..ff3805d522fe42e1a65b3b1a2d7892c8e94599f4
--- /dev/null
+++ b/triton/H6VG26TW2DOV7R3PXVPFDX6HZCVIESL5ZYKZWLUWKZYONCE6NSLQ/triton_red_fused_add_mul_native_layer_norm_0.ttir
@@ -0,0 +1,305 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":18:0)
+#loc2 = loc(unknown)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":52:80)
+#loc82 = loc("in_ptr0"(#loc))
+#loc83 = loc("in_ptr1"(#loc))
+#loc84 = loc("in_ptr2"(#loc))
+#loc85 = loc("in_ptr3"(#loc))
+#loc86 = loc("in_ptr4"(#loc))
+#loc87 = loc("out_ptr0"(#loc))
+#loc88 = loc("out_ptr3"(#loc))
+#loc89 = loc("xnumel"(#loc))
+#loc90 = loc("r0_numel"(#loc))
+#loc92 = loc(callsite(#loc2 at #loc3))
+module {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xmask = arith.constant 256 : i32 loc(#loc91)
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc92)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc2)
+    %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16> loc(#loc2)
+    %c2048_i32 = arith.constant 2048 : i32 loc(#loc2)
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc2)
+    %cst_2 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc2)
+    %cst_3 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc2)
+    %cst_4 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc2)
+    %cst_5 = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc2)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc2)
+    %xoffset = tt.get_program_id x : i32 loc(#loc93)
+    %xmask_6 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc91)
+    %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc94)
+    %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc95)
+    %tmp7_weight:3 = scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32 iter_args(%tmp7_mean = %cst_0, %tmp7_m2 = %cst_0, %tmp7_weight_8 = %cst_0) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc97)
+      %r0_index_9 = arith.addi %r0_index, %r0_base_7 : tensor<1x2048xi32> loc(#loc97)
+      %r0_mask = arith.cmpi slt, %r0_index_9, %cst_5 : tensor<1x2048xi32> loc(#loc98)
+      %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc99)
+      %tmp0_10 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32> loc(#loc156)
+      %tmp0_11 = arith.addi %r0_index_9, %tmp0_10 : tensor<1x2048xi32> loc(#loc100)
+      %tmp0_12 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc101)
+      %tmp0_13 = tt.addptr %tmp0_12, %tmp0_11 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc101)
+      %tmp0_14 = tt.splat %xmask_6 : i1 -> tensor<1x2048xi1> loc(#loc157)
+      %tmp0_15 = arith.andi %r0_mask, %tmp0_14 : tensor<1x2048xi1> loc(#loc102)
+      %tmp0_16 = tt.load %tmp0_13, %tmp0_15, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>> loc(#loc103)
+      %tmp0_17 = arith.extf %tmp0_16 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc104)
+      %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc105)
+      %tmp1_18 = tt.addptr %tmp1, %r0_index_9 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc105)
+      %tmp1_19 = tt.load %tmp1_18, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc106)
+      %tmp1_20 = arith.extf %tmp1_19 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc107)
+      %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc108)
+      %tmp2_21 = tt.addptr %tmp2, %tmp0_11 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc108)
+      %tmp2_22 = tt.load %tmp2_21, %tmp0_15, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>> loc(#loc109)
+      %tmp2_23 = arith.extf %tmp2_22 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc110)
+      %tmp3 = arith.mulf %tmp1_20, %tmp2_23 : tensor<1x2048xf32> loc(#loc111)
+      %tmp4 = arith.addf %tmp0_17, %tmp3 : tensor<1x2048xf32> loc(#loc112)
+      %1 = arith.cmpi eq, %r0_offset, %c0_i32 : i32 loc(#loc24)
+      %2:3 = scf.if %1 -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) {
+        scf.yield %cst_0, %tmp4, %cst_2 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc182)
+      } else {
+        %delta = arith.subf %tmp4, %tmp7_mean : tensor<1x2048xf32> loc(#loc159)
+        %new_weight = arith.addf %tmp7_weight_8, %cst_2 : tensor<1x2048xf32> loc(#loc183)
+        %new_mean = arith.divf %delta, %new_weight : tensor<1x2048xf32> loc(#loc161)
+        %new_mean_27 = arith.addf %tmp7_mean, %new_mean : tensor<1x2048xf32> loc(#loc184)
+        %new_m2 = arith.subf %tmp4, %new_mean_27 : tensor<1x2048xf32> loc(#loc163)
+        %new_m2_28 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32> loc(#loc164)
+        %new_m2_29 = arith.addf %tmp7_m2, %new_m2_28 : tensor<1x2048xf32> loc(#loc185)
+        scf.yield %new_m2_29, %new_mean_27, %new_weight : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc166)
+      } loc(#loc113)
+      %tmp7_mean_24 = arith.select %tmp0_15, %2#1, %tmp7_mean : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc122)
+      %tmp7_m2_25 = arith.select %tmp0_15, %2#0, %tmp7_m2 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc123)
+      %tmp7_weight_26 = arith.select %tmp0_15, %2#2, %tmp7_weight_8 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc124)
+      %3 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc38)
+      %4 = tt.addptr %3, %tmp0_11 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc38)
+      %5 = arith.truncf %tmp4 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc39)
+      tt.store %4, %5, %tmp0_15 : tensor<1x2048x!tt.ptr<bf16>> loc(#loc39)
+      scf.yield %tmp7_mean_24, %tmp7_m2_25, %tmp7_weight_26 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc40)
+    } loc(#loc181)
+    %0:3 = "tt.reduce"(%tmp7_weight#0, %tmp7_weight#1, %tmp7_weight#2) <{axis = 1 : i32}> ({
+    ^bb0(%arg9: f32 loc(callsite(#loc2 at #loc3)), %arg10: f32 loc(callsite(#loc2 at #loc3)), %arg11: f32 loc(callsite(#loc2 at #loc3)), %arg12: f32 loc(callsite(#loc2 at #loc3)), %arg13: f32 loc(callsite(#loc2 at #loc3)), %arg14: f32 loc(callsite(#loc2 at #loc3))):
+      %delta = arith.subf %arg12, %arg9 : f32 loc(#loc167)
+      %new_weight = arith.addf %arg11, %arg14 : f32 loc(#loc168)
+      %w2_over_w = arith.cmpf oeq, %new_weight, %cst : f32 loc(#loc169)
+      %w2_over_w_8 = arith.divf %arg14, %new_weight : f32 loc(#loc170)
+      %w2_over_w_9 = arith.select %w2_over_w, %cst, %w2_over_w_8 : f32 loc(#loc171)
+      %1 = arith.mulf %delta, %w2_over_w_9 : f32 loc(#loc172)
+      %2 = arith.addf %arg9, %1 : f32 loc(#loc173)
+      %3 = arith.addf %arg10, %arg13 : f32 loc(#loc174)
+      %4 = arith.mulf %delta, %delta : f32 loc(#loc175)
+      %5 = arith.mulf %4, %arg11 : f32 loc(#loc176)
+      %6 = arith.mulf %5, %w2_over_w_9 : f32 loc(#loc177)
+      %7 = arith.addf %3, %6 : f32 loc(#loc178)
+      tt.reduce.return %2, %7, %new_weight : f32, f32, f32 loc(#loc125)
+    }) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc125)
+    %tmp7 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc131)
+    %tmp11 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc132)
+    scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc133)
+      %r0_index_8 = arith.addi %r0_index, %r0_base_7 : tensor<1x2048xi32> loc(#loc133)
+      %r0_mask = arith.cmpi slt, %r0_index_8, %cst_5 : tensor<1x2048xi32> loc(#loc134)
+      %tmp13 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc135)
+      %tmp13_9 = tt.splat %tmp13 : i32 -> tensor<1x2048xi32> loc(#loc179)
+      %tmp13_10 = arith.addi %r0_index_8, %tmp13_9 : tensor<1x2048xi32> loc(#loc136)
+      %tmp13_11 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc137)
+      %tmp13_12 = tt.addptr %tmp13_11, %tmp13_10 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc137)
+      %tmp13_13 = tt.splat %xmask_6 : i1 -> tensor<1x2048xi1> loc(#loc180)
+      %tmp13_14 = arith.andi %r0_mask, %tmp13_13 : tensor<1x2048xi1> loc(#loc138)
+      %tmp13_15 = tt.load %tmp13_12, %tmp13_14, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>> loc(#loc139)
+      %tmp13_16 = arith.extf %tmp13_15 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc140)
+      %tmp23 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc141)
+      %tmp23_17 = tt.addptr %tmp23, %r0_index_8 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc141)
+      %tmp23_18 = tt.load %tmp23_17, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc142)
+      %tmp23_19 = arith.extf %tmp23_18 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc143)
+      %tmp27 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc144)
+      %tmp27_20 = tt.addptr %tmp27, %r0_index_8 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc144)
+      %tmp27_21 = tt.load %tmp27_20, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc145)
+      %tmp27_22 = arith.extf %tmp27_21 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc146)
+      %tmp15 = tt.broadcast %tmp7 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc147)
+      %tmp15_23 = arith.subf %tmp13_16, %tmp15 : tensor<1x2048xf32> loc(#loc147)
+      %tmp17 = arith.divf %tmp11, %cst_4 : tensor<1x1xf32> loc(#loc148)
+      %tmp19 = arith.addf %tmp17, %cst_3 : tensor<1x1xf32> loc(#loc149)
+      %tmp20 = tt.extern_elementwise %tmp19 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc150)
+      %tmp21 = tt.broadcast %tmp20 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc151)
+      %tmp21_24 = arith.mulf %tmp15_23, %tmp21 : tensor<1x2048xf32> loc(#loc151)
+      %tmp25 = arith.addf %tmp23_19, %cst_2 : tensor<1x2048xf32> loc(#loc152)
+      %tmp26 = arith.mulf %tmp21_24, %tmp25 : tensor<1x2048xf32> loc(#loc153)
+      %tmp28 = arith.addf %tmp26, %tmp27_22 : tensor<1x2048xf32> loc(#loc154)
+      %1 = tt.splat %out_ptr3 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc79)
+      %2 = tt.addptr %1, %tmp13_10 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc79)
+      %3 = arith.truncf %tmp28 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc80)
+      tt.store %2, %3, %tmp13_14 : tensor<1x2048x!tt.ptr<bf16>> loc(#loc80)
+    } loc(#loc56)
+    tt.return loc(#loc81)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":25:21)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":23:28)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":26:27)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":32:43)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":33:31)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":34:29)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:46)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:41)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:34)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:61)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:51)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:113)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:34)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:41)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:94)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:34)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:51)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:113)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":41:22)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":42:22)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":46:62)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":46:51)
+#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31)
+#loc28 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24)
+#loc29 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30)
+#loc30 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34)
+#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26)
+#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":48:62)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":49:58)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":50:66)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:29)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:52)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:8)
+#loc41 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc42 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc43 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc44 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc45 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc46 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc47 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc48 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc49 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc50 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc51 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc52 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc53 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":53:16)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":54:17)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":56:43)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":57:31)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":58:29)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:48)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:43)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:36)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:63)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:53)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:115)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:35)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:42)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:95)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:35)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:42)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:95)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":66:24)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":68:25)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":70:24)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":71:32)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":72:24)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":75:24)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":76:24)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":77:24)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:29)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:53)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":56:4)
+#loc91 = loc("xmask"(#loc1))
+#loc93 = loc("xoffset"(#loc4))
+#loc94 = loc("r0_base"(#loc5))
+#loc95 = loc("r0_base"(#loc6))
+#loc96 = loc("tmp7_mean"(#loc7))
+#loc97 = loc("r0_index"(#loc8))
+#loc98 = loc("r0_mask"(#loc9))
+#loc99 = loc("tmp0"(#loc10))
+#loc100 = loc("tmp0"(#loc11))
+#loc101 = loc("tmp0"(#loc12))
+#loc102 = loc("tmp0"(#loc13))
+#loc103 = loc("tmp0"(#loc14))
+#loc104 = loc("tmp0"(#loc15))
+#loc105 = loc("tmp1"(#loc16))
+#loc106 = loc("tmp1"(#loc17))
+#loc107 = loc("tmp1"(#loc18))
+#loc108 = loc("tmp2"(#loc19))
+#loc109 = loc("tmp2"(#loc20))
+#loc110 = loc("tmp2"(#loc21))
+#loc111 = loc("tmp3"(#loc22))
+#loc112 = loc("tmp4"(#loc23))
+#loc113 = loc(callsite(#loc25 at #loc26))
+#loc114 = loc("new_m2"(#loc27))
+#loc115 = loc("delta"(#loc28))
+#loc116 = loc("new_weight"(#loc29))
+#loc117 = loc("new_mean"(#loc30))
+#loc118 = loc("new_mean"(#loc31))
+#loc119 = loc("new_m2"(#loc32))
+#loc120 = loc("new_m2"(#loc33))
+#loc121 = loc("new_m2"(#loc34))
+#loc122 = loc("tmp7_mean"(#loc35))
+#loc123 = loc("tmp7_m2"(#loc36))
+#loc124 = loc("tmp7_weight"(#loc37))
+#loc125 = loc(callsite(#loc41 at #loc3))
+#loc126 = loc("delta"(#loc42))
+#loc127 = loc("new_weight"(#loc43))
+#loc128 = loc("w2_over_w"(#loc44))
+#loc129 = loc("w2_over_w"(#loc45))
+#loc130 = loc("w2_over_w"(#loc46))
+#loc131 = loc("tmp7"(#loc54))
+#loc132 = loc("tmp11"(#loc55))
+#loc133 = loc("r0_index"(#loc57))
+#loc134 = loc("r0_mask"(#loc58))
+#loc135 = loc("tmp13"(#loc59))
+#loc136 = loc("tmp13"(#loc60))
+#loc137 = loc("tmp13"(#loc61))
+#loc138 = loc("tmp13"(#loc62))
+#loc139 = loc("tmp13"(#loc63))
+#loc140 = loc("tmp13"(#loc64))
+#loc141 = loc("tmp23"(#loc65))
+#loc142 = loc("tmp23"(#loc66))
+#loc143 = loc("tmp23"(#loc67))
+#loc144 = loc("tmp27"(#loc68))
+#loc145 = loc("tmp27"(#loc69))
+#loc146 = loc("tmp27"(#loc70))
+#loc147 = loc("tmp15"(#loc71))
+#loc148 = loc("tmp17"(#loc72))
+#loc149 = loc("tmp19"(#loc73))
+#loc150 = loc("tmp20"(#loc74))
+#loc151 = loc("tmp21"(#loc75))
+#loc152 = loc("tmp25"(#loc76))
+#loc153 = loc("tmp26"(#loc77))
+#loc154 = loc("tmp28"(#loc78))
+#loc155 = loc("tmp7_m2"(#loc96))
+#loc156 = loc(fused[#loc100, #loc99])
+#loc157 = loc(fused[#loc102, #loc91])
+#loc158 = loc("new_m2"(#loc114))
+#loc159 = loc(callsite(#loc115 at #loc26))
+#loc160 = loc("new_weight"(#loc116))
+#loc161 = loc(callsite(#loc117 at #loc26))
+#loc162 = loc("new_mean"(#loc118))
+#loc163 = loc(callsite(#loc119 at #loc26))
+#loc164 = loc(callsite(#loc120 at #loc26))
+#loc165 = loc("new_m2"(#loc121))
+#loc166 = loc(callsite(#loc121 at #loc26))
+#loc167 = loc(callsite(#loc126 at #loc125))
+#loc168 = loc(callsite(#loc127 at #loc125))
+#loc169 = loc(callsite(#loc128 at #loc125))
+#loc170 = loc(callsite(#loc129 at #loc125))
+#loc171 = loc(callsite(#loc130 at #loc125))
+#loc172 = loc(callsite(#loc47 at #loc125))
+#loc173 = loc(callsite(#loc48 at #loc125))
+#loc174 = loc(callsite(#loc49 at #loc125))
+#loc175 = loc(callsite(#loc50 at #loc125))
+#loc176 = loc(callsite(#loc51 at #loc125))
+#loc177 = loc(callsite(#loc52 at #loc125))
+#loc178 = loc(callsite(#loc53 at #loc125))
+#loc179 = loc(fused[#loc136, #loc135])
+#loc180 = loc(fused[#loc138, #loc91])
+#loc181 = loc("tmp7_weight"(#loc155))
+#loc182 = loc(callsite(#loc158 at #loc26))
+#loc183 = loc(callsite(#loc160 at #loc26))
+#loc184 = loc(callsite(#loc162 at #loc26))
+#loc185 = loc(callsite(#loc165 at #loc26))
diff --git a/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/__grp__triton_red_fused__fused_rms_norm_view_0.json b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/__grp__triton_red_fused__fused_rms_norm_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..932ca86f4455410e14f569df43cc5bbd0861c738
--- /dev/null
+++ b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/__grp__triton_red_fused__fused_rms_norm_view_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused__fused_rms_norm_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.source", "triton_red_fused__fused_rms_norm_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.ttir", "triton_red_fused__fused_rms_norm_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.ttgir", "triton_red_fused__fused_rms_norm_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.llir", "triton_red_fused__fused_rms_norm_view_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.ptx", "triton_red_fused__fused_rms_norm_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.cubin", "triton_red_fused__fused_rms_norm_view_0.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.json"}}
\ No newline at end of file
diff --git a/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.cubin b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..0802a2b59468ab17484cdbfbc8a186ebc2bb489c
Binary files /dev/null and b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.cubin differ
diff --git a/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.json b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..01f2a19d02cbe3d66a037b1facd055735996ac17
--- /dev/null
+++ b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.json
@@ -0,0 +1 @@
+{"hash": "422567694a01727b4809fa4d834df960bfcb1e09ff49176781ab3976a4849bfb", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 8, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm_view_0"}
\ No newline at end of file
diff --git a/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.llir b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..f1910083599ab90fb7aa12f23b7d3438c792e868
--- /dev/null
+++ b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.llir
@@ -0,0 +1,136 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused__fused_rms_norm_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 {
+  %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %8 = shl nuw i32 %7, 1, !dbg !8
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %10 = and i32 %9, 32, !dbg !9
+  %.lobit = lshr exact i32 %10, 5, !dbg !9
+  %11 = and i32 %9, 1, !dbg !9
+  %12 = or disjoint i32 %.lobit, %8, !dbg !10
+  %13 = or disjoint i32 %8, %11, !dbg !10
+  %14 = shl nuw nsw i32 %9, 2, !dbg !11
+  %15 = and i32 %14, 124, !dbg !11
+  %16 = sdiv i32 %12, 32, !dbg !12
+  %17 = mul i32 %16, 32, !dbg !13
+  %.decomposed = sub i32 %12, %17, !dbg !13
+  %18 = shl nsw i32 %.decomposed, 7, !dbg !14
+  %19 = or disjoint i32 %18, %15, !dbg !15
+  %20 = mul i32 %16, 12288, !dbg !16
+  %21 = add i32 %19, %20, !dbg !17
+  %22 = sext i32 %21 to i64, !dbg !18
+  %23 = getelementptr bfloat, ptr addrspace(1) %0, i64 %22, !dbg !18
+  %24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !19
+  %25 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %23, i64 %24, i1 true) #4, !dbg !19
+  %26 = extractvalue { i32, i32 } %25, 0, !dbg !19
+  %27 = bitcast i32 %26 to <2 x bfloat>, !dbg !19
+  %28 = extractvalue { i32, i32 } %25, 1, !dbg !19
+  %29 = bitcast i32 %28 to <2 x bfloat>, !dbg !19
+  %30 = extractelement <2 x bfloat> %27, i64 0, !dbg !19
+  %31 = extractelement <2 x bfloat> %27, i64 1, !dbg !19
+  %32 = extractelement <2 x bfloat> %29, i64 0, !dbg !19
+  %33 = extractelement <2 x bfloat> %29, i64 1, !dbg !19
+  %34 = fpext bfloat %30 to float, !dbg !20
+  %35 = fpext bfloat %31 to float, !dbg !20
+  %36 = fpext bfloat %32 to float, !dbg !20
+  %37 = fpext bfloat %33 to float, !dbg !20
+  %38 = fmul float %34, %34, !dbg !21
+  %39 = fmul float %35, %35, !dbg !21
+  %40 = fmul float %36, %36, !dbg !21
+  %41 = fmul float %37, %37, !dbg !21
+  %42 = fadd float %38, %39, !dbg !22
+  %43 = fadd float %40, %42, !dbg !22
+  %44 = fadd float %41, %43, !dbg !22
+  %45 = bitcast float %44 to i32, !dbg !25
+  %46 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %45, i32 16, i32 31), !dbg !25
+  %47 = bitcast i32 %46 to float, !dbg !25
+  %48 = fadd float %44, %47, !dbg !22
+  %49 = bitcast float %48 to i32, !dbg !25
+  %50 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %49, i32 8, i32 31), !dbg !25
+  %51 = bitcast i32 %50 to float, !dbg !25
+  %52 = fadd float %48, %51, !dbg !22
+  %53 = bitcast float %52 to i32, !dbg !25
+  %54 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %53, i32 4, i32 31), !dbg !25
+  %55 = bitcast i32 %54 to float, !dbg !25
+  %56 = fadd float %52, %55, !dbg !22
+  %57 = bitcast float %56 to i32, !dbg !25
+  %58 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %57, i32 2, i32 31), !dbg !25
+  %59 = bitcast i32 %58 to float, !dbg !25
+  %60 = fadd float %56, %59, !dbg !22
+  %61 = bitcast float %60 to i32, !dbg !25
+  %62 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %61, i32 1, i32 31), !dbg !25
+  %63 = bitcast i32 %62 to float, !dbg !25
+  %64 = fadd float %60, %63, !dbg !22
+  %65 = lshr exact i32 %10, 3, !dbg !28
+  %66 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %65, !dbg !28
+  store float %64, ptr addrspace(3) %66, align 4, !dbg !28
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28
+  %67 = shl nuw nsw i32 %11, 2, !dbg !28
+  %68 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %67, !dbg !28
+  %69 = load i32, ptr addrspace(3) %68, align 4, !dbg !28
+  %70 = sext i32 %13 to i64, !dbg !29
+  %71 = getelementptr float, ptr addrspace(1) %1, i64 %70, !dbg !29
+  %72 = and i32 %9, 62, !dbg !30
+  %73 = icmp eq i32 %72, 0, !dbg !30
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %69, ptr addrspace(1) %71, i1 %73) #4, !dbg !30
+  ret void, !dbg !31
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
+
+attributes #0 = { nounwind "nvvm.reqntid"="64" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { convergent nocallback nounwind }
+attributes #4 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm_view_0", linkageName: "triton_red_fused__fused_rms_norm_view_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 23, column: 28, scope: !4)
+!8 = !DILocation(line: 23, column: 33, scope: !4)
+!9 = !DILocation(line: 24, column: 44, scope: !4)
+!10 = !DILocation(line: 24, column: 23, scope: !4)
+!11 = !DILocation(line: 26, column: 37, scope: !4)
+!12 = !DILocation(line: 29, column: 19, scope: !4)
+!13 = !DILocation(line: 28, column: 19, scope: !4)
+!14 = !DILocation(line: 38, column: 45, scope: !4)
+!15 = !DILocation(line: 38, column: 41, scope: !4)
+!16 = !DILocation(line: 38, column: 56, scope: !4)
+!17 = !DILocation(line: 38, column: 50, scope: !4)
+!18 = !DILocation(line: 38, column: 34, scope: !4)
+!19 = !DILocation(line: 38, column: 61, scope: !4)
+!20 = !DILocation(line: 38, column: 115, scope: !4)
+!21 = !DILocation(line: 40, column: 22, scope: !4)
+!22 = !DILocation(line: 263, column: 15, scope: !23, inlinedAt: !25)
+!23 = distinct !DILexicalBlockFile(scope: !4, file: !24, discriminator: 0)
+!24 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!25 = !DILocation(line: 293, column: 36, scope: !23, inlinedAt: !26)
+!26 = !DILocation(line: 44, column: 25, scope: !27)
+!27 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0)
+!28 = !DILocation(line: 44, column: 28, scope: !4)
+!29 = !DILocation(line: 45, column: 25, scope: !4)
+!30 = !DILocation(line: 45, column: 36, scope: !4)
+!31 = !DILocation(line: 45, column: 4, scope: !4)
diff --git a/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.ptx b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..f0f59859a87382a24f528ca0c118a3d521f76b7c
--- /dev/null
+++ b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.ptx
@@ -0,0 +1,506 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused__fused_rms_norm_view_0 // -- Begin function triton_red_fused__fused_rms_norm_view_0
+.extern .shared .align 16 .b8 global_smem[];
+                                        // @triton_red_fused__fused_rms_norm_view_0
+.visible .entry triton_red_fused__fused_rms_norm_view_0(
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_1,
+	.param .u32 triton_red_fused__fused_rms_norm_view_0_param_2,
+	.param .u32 triton_red_fused__fused_rms_norm_view_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_4,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_0_param_5
+)
+.reqntid 64
+{
+	.reg .pred 	%p<3>;
+	.reg .b16 	%rs<5>;
+	.reg .b32 	%r<48>;
+	.reg .b64 	%rd<6>;
+	.loc	1 18 0                          // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd4, [triton_red_fused__fused_rms_norm_view_0_param_0];
+	ld.param.b64 	%rd5, [triton_red_fused__fused_rms_norm_view_0_param_1];
+$L__tmp0:
+	.loc	1 23 28                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:23:28
+	mov.u32 	%r5, %ctaid.x;
+	.loc	1 23 33                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:23:33
+	shl.b32 	%r6, %r5, 1;
+	.loc	1 24 44                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:44
+	mov.u32 	%r7, %tid.x;
+	and.b32 	%r8, %r7, 32;
+	bfe.u32 	%r9, %r7, 5, 1;
+	and.b32 	%r10, %r7, 1;
+	.loc	1 24 23                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:24:23
+	or.b32 	%r11, %r9, %r6;
+	or.b32 	%r12, %r6, %r10;
+	.loc	1 26 37                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:26:37
+	shl.b32 	%r13, %r7, 2;
+	and.b32 	%r14, %r13, 124;
+	.loc	1 29 19                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:29:19
+	bfe.s32 	%r15, %r5, 30, 1;
+	shr.u32 	%r16, %r15, 27;
+	add.s32 	%r17, %r11, %r16;
+	shr.u32 	%r18, %r17, 5;
+	.loc	1 28 19                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:28:19
+	and.b32 	%r19, %r17, 33554400;
+	sub.s32 	%r20, %r11, %r19;
+	.loc	1 38 45                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:45
+	shl.b32 	%r21, %r20, 7;
+	.loc	1 38 41                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:41
+	or.b32 	%r22, %r21, %r14;
+	.loc	1 38 50                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:50
+	mad.lo.s32 	%r23, %r18, 12288, %r22;
+	.loc	1 38 34                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:34
+	mad.wide.s32 	%rd1, %r23, 2, %rd4;
+	.loc	1 38 61                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:61
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r3, 0;
+	mov.pred 	%p1, -1;
+	// begin inline asm
+	mov.u32 %r1, %r3;
+	mov.u32 %r2, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	mov.b32 	{%rs1, %rs2}, %r1;
+	mov.b32 	{%rs3, %rs4}, %r2;
+	.loc	1 38 115                        // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:38:115
+	cvt.f32.bf16 	%r24, %rs1;
+	cvt.f32.bf16 	%r25, %rs2;
+	cvt.f32.bf16 	%r26, %rs3;
+	cvt.f32.bf16 	%r27, %rs4;
+	.loc	1 40 22                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:40:22
+	mul.f32 	%r28, %r25, %r25;
+$L__tmp1:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	fma.rn.f32 	%r29, %r24, %r24, %r28;
+	fma.rn.f32 	%r30, %r26, %r26, %r29;
+	fma.rn.f32 	%r31, %r27, %r27, %r30;
+$L__tmp2:
+	.loc	2 293 36                        // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ]
+	shfl.sync.bfly.b32 	%r32, %r31, 16, 31, -1;
+$L__tmp3:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	add.f32 	%r33, %r31, %r32;
+$L__tmp4:
+	.loc	2 293 36                        // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ]
+	shfl.sync.bfly.b32 	%r34, %r33, 8, 31, -1;
+$L__tmp5:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	add.f32 	%r35, %r33, %r34;
+$L__tmp6:
+	.loc	2 293 36                        // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ]
+	shfl.sync.bfly.b32 	%r36, %r35, 4, 31, -1;
+$L__tmp7:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	add.f32 	%r37, %r35, %r36;
+$L__tmp8:
+	.loc	2 293 36                        // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ]
+	shfl.sync.bfly.b32 	%r38, %r37, 2, 31, -1;
+$L__tmp9:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	add.f32 	%r39, %r37, %r38;
+$L__tmp10:
+	.loc	2 293 36                        // standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ]
+	shfl.sync.bfly.b32 	%r40, %r39, 1, 31, -1;
+$L__tmp11:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:25 ] ]
+	add.f32 	%r41, %r39, %r40;
+$L__tmp12:
+	.loc	1 44 28                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:44:28
+	shr.u32 	%r42, %r8, 3;
+	mov.b32 	%r43, global_smem;
+	add.s32 	%r44, %r43, %r42;
+	st.shared.b32 	[%r44], %r41;
+	bar.sync 	0;
+	shl.b32 	%r45, %r10, 2;
+	add.s32 	%r46, %r43, %r45;
+	ld.shared.b32 	%r4, [%r46];
+	.loc	1 45 25                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:25
+	mad.wide.s32 	%rd3, %r12, 4, %rd5;
+	.loc	1 45 36                         // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:36
+	and.b32 	%r47, %r7, 62;
+	setp.eq.b32 	%p2, %r47, 0;
+	// begin inline asm
+	@%p2 st.global.b32 [ %rd3 + 0 ], { %r4 };
+	// end inline asm
+	.loc	1 45 4                          // cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py:45:4
+	ret;
+$L__tmp13:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 5                                   // DW_FORM_data2
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 339                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x14c DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 119
+.b8 118
+.b8 121
+.b8 116
+.b8 52
+.b8 50
+.b8 55
+.b8 51
+.b8 105
+.b8 117
+.b8 51
+.b8 51
+.b8 109
+.b8 112
+.b8 101
+.b8 101
+.b8 55
+.b8 104
+.b8 98
+.b8 101
+.b8 116
+.b8 53
+.b8 106
+.b8 53
+.b8 101
+.b8 113
+.b8 52
+.b8 52
+.b8 100
+.b8 54
+.b8 102
+.b8 115
+.b8 104
+.b8 103
+.b8 119
+.b8 107
+.b8 121
+.b8 120
+.b8 107
+.b8 110
+.b8 53
+.b8 50
+.b8 103
+.b8 103
+.b8 103
+.b8 107
+.b8 105
+.b8 113
+.b8 104
+.b8 106
+.b8 53
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 119
+.b8 118
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x2a DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 114
+.b8 109
+.b8 115
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 118
+.b8 105
+.b8 101
+.b8 119
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x10e:0x48 DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x123:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp12                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 44                                  // DW_AT_call_line
+.b8 25                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x13b:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp12                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.source b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..74292b61b4e3768b12b3e3f2202229378c7a7bde
--- /dev/null
+++ b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.source
@@ -0,0 +1,167 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0)
+#loc35 = loc(unknown)
+#loc38 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0)
+#loc42 = loc("in_ptr0"(#loc))
+#loc43 = loc("out_ptr0"(#loc))
+#loc44 = loc("xnumel"(#loc))
+#loc45 = loc("r0_numel"(#loc))
+#loc74 = loc("input"(#loc33))
+#loc75 = loc("a"(#loc38))
+#loc76 = loc("b"(#loc38))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 8192 : i32 loc(#loc46)
+    %r0_numel_1 = arith.constant 128 : i32 loc(#loc47)
+    %xoffset = tt.get_program_id x : i32 loc(#loc48)
+    %xoffset_2 = arith.constant 2 : i32 loc(#loc49)
+    %xoffset_3 = arith.constant 2 : i32 loc(#loc49)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc49)
+    %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc50)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32> -> tensor<2x1xi32> loc(#loc51)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<2x1xi32> loc(#loc52)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<2x1xi32> loc(#loc52)
+    %xmask = arith.constant true loc(#loc53)
+    %xmask_8 = arith.constant dense<true> : tensor<2x128xi1> loc(#loc53)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc54)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc55)
+    %x0 = arith.constant 32 : i32 loc(#loc56)
+    %x0_10 = arith.constant 32 : i32 loc(#loc56)
+    %x0_11 = arith.constant dense<32> : tensor<2x1xi32> loc(#loc56)
+    %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<2x1xi32> loc(#loc56)
+    %x1 = arith.constant 32 : i32 loc(#loc57)
+    %x1_13 = arith.constant 32 : i32 loc(#loc57)
+    %x1_14 = arith.constant dense<32> : tensor<2x1xi32> loc(#loc57)
+    %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<2x1xi32> loc(#loc57)
+    %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc58)
+    %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc58)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc14)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc14)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14)
+    %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc14)
+    %3 = ub.poison : i32 loc(#loc14)
+    %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_19 = %_tmp4_16) -> (tensor<2x128xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc60)
+      %r0_index_20 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc60)
+      %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc61)
+      %r0_mask_21 = arith.cmpi slt, %r0_index_20, %r0_mask : tensor<1x128xi32> loc(#loc61)
+      %tmp0 = arith.constant 128 : i32 loc(#loc62)
+      %tmp0_22 = arith.constant 128 : i32 loc(#loc62)
+      %tmp0_23 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc62)
+      %tmp0_24 = arith.muli %tmp0_23, %x0_12 : tensor<2x1xi32> loc(#loc62)
+      %tmp0_25 = tt.broadcast %r0_index_20 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc63)
+      %tmp0_26 = tt.broadcast %tmp0_24 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc63)
+      %tmp0_27 = arith.addi %tmp0_25, %tmp0_26 : tensor<2x128xi32> loc(#loc63)
+      %tmp0_28 = arith.constant 12288 : i32 loc(#loc64)
+      %tmp0_29 = arith.constant 12288 : i32 loc(#loc64)
+      %tmp0_30 = arith.constant dense<12288> : tensor<2x1xi32> loc(#loc64)
+      %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<2x1xi32> loc(#loc64)
+      %tmp0_32 = tt.broadcast %tmp0_31 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc65)
+      %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<2x128xi32> loc(#loc65)
+      %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>> loc(#loc66)
+      %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc66)
+      %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc67)
+      %tmp0_37 = tt.broadcast %r0_mask_21 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc67)
+      %tmp0_38 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc67)
+      %tmp0_39 = arith.truncf %tmp0_38 : tensor<2x128xf32> to tensor<2x128xbf16> loc(#loc67)
+      %tmp0_40 = tt.load %tmp0_35, %tmp0_37, %tmp0_39 evictionPolicy = evict_first : tensor<2x128x!tt.ptr<bf16>> loc(#loc67)
+      %tmp0_41 = arith.extf %tmp0_40 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc68)
+      %tmp2 = arith.mulf %tmp0_41, %tmp0_41 : tensor<2x128xf32> loc(#loc69)
+      %tmp5 = arith.addf %_tmp4_19, %tmp2 : tensor<2x128xf32> loc(#loc70)
+      %_tmp4_42 = tt.broadcast %r0_mask_21 : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc71)
+      %_tmp4_43 = arith.select %_tmp4_42, %tmp5, %_tmp4_19 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc71)
+      scf.yield %_tmp4_43 : tensor<2x128xf32> loc(#loc27)
+    } loc(#loc59)
+    %tmp4 = tt.call @"triton.language.standard.sum__fp32S2_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<2x128xf32>) -> tensor<2xf32> loc(#loc72)
+    %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<2xf32> -> tensor<2x1xf32> loc(#loc73)
+    %4 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<2x1x!tt.ptr<f32>> loc(#loc30)
+    %5 = tt.addptr %4, %xindex_7 : tensor<2x1x!tt.ptr<f32>>, tensor<2x1xi32> loc(#loc30)
+    tt.store %5, %tmp4_18 : tensor<2x1x!tt.ptr<f32>> loc(#loc31)
+    tt.return loc(#loc32)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.sum__fp32S2_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<2x128xf32> loc("input"(#loc33))) -> tensor<2xf32> attributes {noinline = false} {
+    %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc34)
+      tt.reduce.return %2 : f32 loc(#loc34)
+    }) : (tensor<2x128xf32>) -> tensor<2xf32> loc(#loc34)
+    tt.return %0 : tensor<2xf32> loc(#loc36)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<2xf32> loc(#loc37)
+    tt.return %1 : tensor<2xf32> loc(#loc37)
+  } loc(#loc33)
+  tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc38)), %b: f32 loc("b"(#loc38))) -> f32 attributes {noinline = false} {
+    %0 = arith.addf %a, %b : f32 loc(#loc39)
+    tt.return %0 : f32 loc(#loc40)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : f32 loc(#loc41)
+    tt.return %1 : f32 loc(#loc41)
+  } loc(#loc38)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":25:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":30:43)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:8)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11)
+#loc37 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4)
+#loc39 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc40 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11)
+#loc41 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4)
+#loc46 = loc("xnumel"(#loc1))
+#loc47 = loc("r0_numel"(#loc2))
+#loc48 = loc("xoffset"(#loc3))
+#loc49 = loc("xoffset"(#loc4))
+#loc50 = loc("xindex"(#loc5))
+#loc51 = loc("xindex"(#loc6))
+#loc52 = loc("xindex"(#loc7))
+#loc53 = loc("xmask"(#loc8))
+#loc54 = loc("r0_base"(#loc9))
+#loc55 = loc("r0_base"(#loc10))
+#loc56 = loc("x0"(#loc11))
+#loc57 = loc("x1"(#loc12))
+#loc58 = loc("_tmp4"(#loc13))
+#loc59 = loc("_tmp4"(#loc14))
+#loc60 = loc("r0_index"(#loc15))
+#loc61 = loc("r0_mask"(#loc16))
+#loc62 = loc("tmp0"(#loc17))
+#loc63 = loc("tmp0"(#loc18))
+#loc64 = loc("tmp0"(#loc19))
+#loc65 = loc("tmp0"(#loc20))
+#loc66 = loc("tmp0"(#loc21))
+#loc67 = loc("tmp0"(#loc22))
+#loc68 = loc("tmp0"(#loc23))
+#loc69 = loc("tmp2"(#loc24))
+#loc70 = loc("tmp5"(#loc25))
+#loc71 = loc("_tmp4"(#loc26))
+#loc72 = loc("tmp4"(#loc28))
+#loc73 = loc("tmp4"(#loc29))
diff --git a/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.ttgir b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..96d664b0b0bc11ab1284ba1b2348b8a772f48057
--- /dev/null
+++ b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.ttgir
@@ -0,0 +1,108 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [1, 2], order = [0, 1]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0)
+#loc1 = loc(unknown)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25)
+#loc27 = loc("in_ptr0"(#loc))
+#loc28 = loc("out_ptr0"(#loc))
+#loc29 = loc("xnumel"(#loc))
+#loc30 = loc("r0_numel"(#loc))
+#loc49 = loc("tmp4"(#loc21))
+#loc52 = loc(callsite(#loc1 at #loc49))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<128> : tensor<2x1xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<12288> : tensor<2x1xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<32> : tensor<2x1xi32, #blocked> loc(#loc1)
+    %c2_i32 = arith.constant 2 : i32 loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<2x128xbf16, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<2x128xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc31)
+    %xoffset_5 = arith.muli %xoffset, %c2_i32 : i32 loc(#loc32)
+    %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc33)
+    %xindex_6 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc33)
+    %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<2x1xi32, #blocked> loc(#loc33)
+    %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<2x1xi32, #blocked1> loc(#loc33)
+    %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<2x1xi32, #blocked> loc(#loc34)
+    %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<2x1xi32, #blocked1> loc(#loc34)
+    %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<2x1xi32, #blocked> loc(#loc34)
+    %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<2x1xi32, #blocked1> loc(#loc34)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc35)
+    %r0_base_13 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc35)
+    %x0 = arith.remsi %xindex_11, %cst_2 : tensor<2x1xi32, #blocked> loc(#loc36)
+    %x1 = arith.divsi %xindex_11, %cst_2 : tensor<2x1xi32, #blocked> loc(#loc37)
+    %r0_mask = arith.cmpi slt, %r0_base_13, %cst : tensor<1x128xi32, #blocked> loc(#loc38)
+    %tmp0 = arith.muli %x0, %cst_0 : tensor<2x1xi32, #blocked> loc(#loc39)
+    %tmp0_14 = tt.broadcast %r0_base_13 : tensor<1x128xi32, #blocked> -> tensor<2x128xi32, #blocked> loc(#loc40)
+    %tmp0_15 = tt.broadcast %tmp0 : tensor<2x1xi32, #blocked> -> tensor<2x128xi32, #blocked> loc(#loc40)
+    %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<2x128xi32, #blocked> loc(#loc40)
+    %tmp0_17 = arith.muli %x1, %cst_1 : tensor<2x1xi32, #blocked> loc(#loc41)
+    %tmp0_18 = tt.broadcast %tmp0_17 : tensor<2x1xi32, #blocked> -> tensor<2x128xi32, #blocked> loc(#loc42)
+    %tmp0_19 = arith.addi %tmp0_16, %tmp0_18 : tensor<2x128xi32, #blocked> loc(#loc42)
+    %tmp0_20 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>, #blocked> loc(#loc43)
+    %tmp0_21 = tt.addptr %tmp0_20, %tmp0_19 : tensor<2x128x!tt.ptr<bf16>, #blocked>, tensor<2x128xi32, #blocked> loc(#loc43)
+    %tmp0_22 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked> -> tensor<2x128xi1, #blocked> loc(#loc44)
+    %tmp0_23 = tt.load %tmp0_21, %tmp0_22, %cst_3 evictionPolicy = evict_first : tensor<2x128x!tt.ptr<bf16>, #blocked> loc(#loc44)
+    %tmp0_24 = arith.extf %tmp0_23 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked> loc(#loc45)
+    %tmp2 = arith.mulf %tmp0_24, %tmp0_24 : tensor<2x128xf32, #blocked> loc(#loc46)
+    %tmp5 = arith.addf %tmp2, %cst_4 : tensor<2x128xf32, #blocked> loc(#loc47)
+    %_tmp4 = arith.select %tmp0_22, %tmp5, %cst_4 : tensor<2x128xi1, #blocked>, tensor<2x128xf32, #blocked> loc(#loc48)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_27: f32 loc(callsite(#loc1 at #loc49)), %tmp4_28: f32 loc(callsite(#loc1 at #loc49))):
+      %tmp4_29 = arith.addf %tmp4_27, %tmp4_28 : f32 loc(#loc53)
+      tt.reduce.return %tmp4_29 : f32 loc(#loc51)
+    }) : (tensor<2x128xf32, #blocked>) -> tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc51)
+    %tmp4_25 = ttg.convert_layout %tmp4 : tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc50)
+    %tmp4_26 = tt.expand_dims %tmp4_25 {axis = 1 : i32} : tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<2x1xf32, #blocked1> loc(#loc50)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<2x1x!tt.ptr<f32>, #blocked1> loc(#loc24)
+    %1 = tt.addptr %0, %xindex_12 : tensor<2x1x!tt.ptr<f32>, #blocked1>, tensor<2x1xi32, #blocked1> loc(#loc24)
+    tt.store %1, %tmp4_26 : tensor<2x1x!tt.ptr<f32>, #blocked1> loc(#loc25)
+    tt.return loc(#loc26)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4)
+#loc31 = loc("xoffset"(#loc2))
+#loc32 = loc("xoffset"(#loc3))
+#loc33 = loc("xindex"(#loc4))
+#loc34 = loc("xindex"(#loc5))
+#loc35 = loc("r0_base"(#loc6))
+#loc36 = loc("x0"(#loc7))
+#loc37 = loc("x1"(#loc8))
+#loc38 = loc("r0_mask"(#loc9))
+#loc39 = loc("tmp0"(#loc10))
+#loc40 = loc("tmp0"(#loc11))
+#loc41 = loc("tmp0"(#loc12))
+#loc42 = loc("tmp0"(#loc13))
+#loc43 = loc("tmp0"(#loc14))
+#loc44 = loc("tmp0"(#loc15))
+#loc45 = loc("tmp0"(#loc16))
+#loc46 = loc("tmp2"(#loc17))
+#loc47 = loc("tmp5"(#loc18))
+#loc48 = loc("_tmp4"(#loc19))
+#loc50 = loc("tmp4"(#loc23))
+#loc51 = loc(callsite(#loc20 at #loc49))
+#loc53 = loc(callsite(#loc22 at #loc51))
diff --git a/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.ttir b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..df05c30e88f2bca31a6d1a5cd7292038c85559b0
--- /dev/null
+++ b/triton/IISWO2KKAFZHWSAJ7JGYGTPZMC74WHQJ75EROZ4BVM4XNJEETP5Q/triton_red_fused__fused_rms_norm_view_0.ttir
@@ -0,0 +1,105 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":18:0)
+#loc2 = loc(unknown)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:25)
+#loc29 = loc("in_ptr0"(#loc))
+#loc30 = loc("out_ptr0"(#loc))
+#loc31 = loc("xnumel"(#loc))
+#loc32 = loc("r0_numel"(#loc))
+#loc53 = loc("tmp4"(#loc23))
+#loc56 = loc(callsite(#loc2 at #loc53))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm_view_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %tmp0 = arith.constant dense<0.000000e+00> : tensor<2x128xbf16> loc(#loc33)
+    %cst = arith.constant dense<12288> : tensor<2x1xi32> loc(#loc2)
+    %cst_0 = arith.constant dense<128> : tensor<2x1xi32> loc(#loc2)
+    %cst_1 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc2)
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<2x128xf32> loc(#loc2)
+    %cst_3 = arith.constant dense<32> : tensor<2x1xi32> loc(#loc2)
+    %c2_i32 = arith.constant 2 : i32 loc(#loc2)
+    %xoffset = tt.get_program_id x : i32 loc(#loc34)
+    %xoffset_4 = arith.muli %xoffset, %c2_i32 : i32 loc(#loc35)
+    %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc36)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32> -> tensor<2x1xi32> loc(#loc37)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<2x1xi32> loc(#loc38)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<2x1xi32> loc(#loc38)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc39)
+    %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc40)
+    %x0 = arith.remsi %xindex_7, %cst_3 : tensor<2x1xi32> loc(#loc41)
+    %x1 = arith.divsi %xindex_7, %cst_3 : tensor<2x1xi32> loc(#loc42)
+    %r0_mask = arith.cmpi slt, %r0_base_8, %cst_1 : tensor<1x128xi32> loc(#loc43)
+    %tmp0_9 = arith.muli %x0, %cst_0 : tensor<2x1xi32> loc(#loc44)
+    %tmp0_10 = tt.broadcast %r0_base_8 : tensor<1x128xi32> -> tensor<2x128xi32> loc(#loc45)
+    %tmp0_11 = tt.broadcast %tmp0_9 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc45)
+    %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<2x128xi32> loc(#loc45)
+    %tmp0_13 = arith.muli %x1, %cst : tensor<2x1xi32> loc(#loc46)
+    %tmp0_14 = tt.broadcast %tmp0_13 : tensor<2x1xi32> -> tensor<2x128xi32> loc(#loc47)
+    %tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<2x128xi32> loc(#loc47)
+    %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<2x128x!tt.ptr<bf16>> loc(#loc48)
+    %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<2x128x!tt.ptr<bf16>>, tensor<2x128xi32> loc(#loc48)
+    %tmp0_18 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<2x128xi1> loc(#loc33)
+    %tmp0_19 = tt.load %tmp0_17, %tmp0_18, %tmp0 evictionPolicy = evict_first : tensor<2x128x!tt.ptr<bf16>> loc(#loc33)
+    %tmp0_20 = arith.extf %tmp0_19 : tensor<2x128xbf16> to tensor<2x128xf32> loc(#loc49)
+    %tmp2 = arith.mulf %tmp0_20, %tmp0_20 : tensor<2x128xf32> loc(#loc50)
+    %tmp5 = arith.addf %tmp2, %cst_2 : tensor<2x128xf32> loc(#loc51)
+    %_tmp4 = arith.select %tmp0_18, %tmp5, %cst_2 : tensor<2x128xi1>, tensor<2x128xf32> loc(#loc52)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_22: f32 loc(callsite(#loc2 at #loc53)), %tmp4_23: f32 loc(callsite(#loc2 at #loc53))):
+      %tmp4_24 = arith.addf %tmp4_22, %tmp4_23 : f32 loc(#loc57)
+      tt.reduce.return %tmp4_24 : f32 loc(#loc55)
+    }) : (tensor<2x128xf32>) -> tensor<2xf32> loc(#loc55)
+    %tmp4_21 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<2xf32> -> tensor<2x1xf32> loc(#loc54)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<2x1x!tt.ptr<f32>> loc(#loc26)
+    %1 = tt.addptr %0, %xindex_7 : tensor<2x1x!tt.ptr<f32>>, tensor<2x1xi32> loc(#loc26)
+    tt.store %1, %tmp4_21 : tensor<2x1x!tt.ptr<f32>> loc(#loc27)
+    tt.return loc(#loc28)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:61)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:27)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":26:37)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":28:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":29:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":34:29)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:45)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:41)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:56)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:50)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:34)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":38:115)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":40:22)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":42:23)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":43:40)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":44:28)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:25)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:36)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/wv/cwvyt4273iu33mpee7hbet5j5eq44d6fshgwkyxkn52gggkiqhj5.py":45:4)
+#loc33 = loc("tmp0"(#loc1))
+#loc34 = loc("xoffset"(#loc3))
+#loc35 = loc("xoffset"(#loc4))
+#loc36 = loc("xindex"(#loc5))
+#loc37 = loc("xindex"(#loc6))
+#loc38 = loc("xindex"(#loc7))
+#loc39 = loc("r0_base"(#loc8))
+#loc40 = loc("r0_base"(#loc9))
+#loc41 = loc("x0"(#loc10))
+#loc42 = loc("x1"(#loc11))
+#loc43 = loc("r0_mask"(#loc12))
+#loc44 = loc("tmp0"(#loc13))
+#loc45 = loc("tmp0"(#loc14))
+#loc46 = loc("tmp0"(#loc15))
+#loc47 = loc("tmp0"(#loc16))
+#loc48 = loc("tmp0"(#loc17))
+#loc49 = loc("tmp0"(#loc18))
+#loc50 = loc("tmp2"(#loc19))
+#loc51 = loc("tmp5"(#loc20))
+#loc52 = loc("_tmp4"(#loc21))
+#loc54 = loc("tmp4"(#loc25))
+#loc55 = loc(callsite(#loc22 at #loc53))
+#loc57 = loc(callsite(#loc24 at #loc55))
diff --git a/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/__grp__triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/__grp__triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..ced2e448a5053bde918d3e1dc9318c6d4465f51d
--- /dev/null
+++ b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/__grp__triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.source", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttir", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttgir", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.llir", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ptx", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.cubin": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.cubin", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json"}}
\ No newline at end of file
diff --git a/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.cubin b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..a88c26affae58e1efc7c2aeb3f00ac3c39ea3335
Binary files /dev/null and b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.cubin differ
diff --git a/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..b6c3904719b57b210bf4fca250b0fbe6f4b17003
--- /dev/null
+++ b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json
@@ -0,0 +1 @@
+{"hash": "4e0d42a901fc486d8a262b969a5ef7bb72a126c858c100717d2f698f70b5be1e", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1"}
\ No newline at end of file
diff --git a/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.llir b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.llir
new file mode 100644
index 0000000000000000000000000000000000000000..4e4ada2b9fffdf4df1a5d349c9dde847a043dce3
--- /dev/null
+++ b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.llir
@@ -0,0 +1,78 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 {
+  %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %8 = shl i32 %7, 9, !dbg !8
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %10 = shl nuw nsw i32 %9, 1, !dbg !9
+  %11 = and i32 %10, 510, !dbg !9
+  %12 = or disjoint i32 %11, %8, !dbg !10
+  %13 = or i32 %10, %8, !dbg !10
+  %14 = or disjoint i32 %13, 1, !dbg !10
+  %15 = sdiv i32 %12, 128, !dbg !11
+  %16 = mul i32 %15, 128, !dbg !12
+  %.decomposed = sub i32 %12, %16, !dbg !12
+  %17 = srem i32 %14, 128, !dbg !12
+  %18 = srem i32 %15, 2304, !dbg !13
+  %19 = sdiv i32 %12, 294912, !dbg !14
+  %20 = shl nsw i32 %19, 7, !dbg !15
+  %21 = add nsw i32 %20, %.decomposed, !dbg !16
+  %22 = add nsw i32 %20, %17, !dbg !16
+  %23 = sext i32 %18 to i64, !dbg !17
+  %24 = mul i64 %2, %23, !dbg !17
+  %25 = sext i32 %21 to i64, !dbg !18
+  %26 = sext i32 %22 to i64, !dbg !18
+  %27 = getelementptr bfloat, ptr addrspace(1) %0, i64 %24, !dbg !19
+  %28 = getelementptr bfloat, ptr addrspace(1) %27, i64 %25, !dbg !19
+  %29 = getelementptr bfloat, ptr addrspace(1) %27, i64 %26, !dbg !19
+  %30 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09ld.global.b16 { $0 }, [ $1 + 0 ];", "=c,l"(ptr addrspace(1) %28) #2, !dbg !20
+  %31 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09ld.global.b16 { $0 }, [ $1 + 0 ];", "=c,l"(ptr addrspace(1) %29) #2, !dbg !20
+  %32 = sext i32 %12 to i64, !dbg !21
+  %33 = getelementptr bfloat, ptr addrspace(1) %1, i64 %32, !dbg !21
+  %34 = insertelement <2 x i16> poison, i16 %30, i64 0, !dbg !22
+  %35 = insertelement <2 x i16> %34, i16 %31, i64 1, !dbg !22
+  %36 = bitcast <2 x i16> %35 to i32, !dbg !22
+  tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %36, ptr addrspace(1) %33) #2, !dbg !22
+  ret void, !dbg !23
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+attributes #0 = { nounwind "nvvm.reqntid"="256" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1", linkageName: "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 20, column: 28, scope: !4)
+!8 = !DILocation(line: 20, column: 33, scope: !4)
+!9 = !DILocation(line: 21, column: 36, scope: !4)
+!10 = !DILocation(line: 21, column: 23, scope: !4)
+!11 = !DILocation(line: 24, column: 21, scope: !4)
+!12 = !DILocation(line: 23, column: 19, scope: !4)
+!13 = !DILocation(line: 24, column: 28, scope: !4)
+!14 = !DILocation(line: 25, column: 19, scope: !4)
+!15 = !DILocation(line: 27, column: 39, scope: !4)
+!16 = !DILocation(line: 27, column: 35, scope: !4)
+!17 = !DILocation(line: 27, column: 48, scope: !4)
+!18 = !DILocation(line: 27, column: 44, scope: !4)
+!19 = !DILocation(line: 27, column: 30, scope: !4)
+!20 = !DILocation(line: 27, column: 53, scope: !4)
+!21 = !DILocation(line: 28, column: 25, scope: !4)
+!22 = !DILocation(line: 28, column: 36, scope: !4)
+!23 = !DILocation(line: 28, column: 4, scope: !4)
diff --git a/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ptx b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..91999aebea37c8e6cc2bbfa06a376383ef0032bd
--- /dev/null
+++ b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ptx
@@ -0,0 +1,347 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1 // -- Begin function triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1
+                                        // @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1
+.visible .entry triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1(
+	.param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_1,
+	.param .u64 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_2,
+	.param .u32 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_4,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_5
+)
+.reqntid 256
+{
+	.reg .b16 	%rs<3>;
+	.reg .b32 	%r<32>;
+	.reg .b64 	%rd<11>;
+	.loc	1 18 0                          // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd4, [triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_0];
+	ld.param.b64 	%rd5, [triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_1];
+$L__tmp0:
+	.loc	1 20 28                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:20:28
+	mov.u32 	%r2, %ctaid.x;
+	.loc	1 20 33                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:20:33
+	shl.b32 	%r3, %r2, 9;
+	ld.param.b64 	%rd6, [triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_2];
+	.loc	1 21 36                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:21:36
+	mov.u32 	%r4, %tid.x;
+	shl.b32 	%r5, %r4, 1;
+	and.b32 	%r6, %r5, 510;
+	.loc	1 21 23                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:21:23
+	or.b32 	%r7, %r6, %r3;
+	or.b32 	%r8, %r5, %r3;
+	or.b32 	%r9, %r8, 1;
+	.loc	1 24 21                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:24:21
+	bfe.s32 	%r10, %r2, 22, 1;
+	.loc	1 23 19                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:23:19
+	shr.u32 	%r11, %r10, 25;
+	.loc	1 24 21                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:24:21
+	add.s32 	%r12, %r7, %r11;
+	shr.s32 	%r13, %r12, 7;
+	.loc	1 23 19                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:23:19
+	and.b32 	%r14, %r12, -128;
+	sub.s32 	%r15, %r7, %r14;
+	add.s32 	%r16, %r9, %r11;
+	and.b32 	%r17, %r16, -128;
+	sub.s32 	%r18, %r9, %r17;
+	.loc	1 24 28                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:24:28
+	mul.hi.s32 	%r19, %r13, 954437177;
+	shr.u32 	%r20, %r19, 31;
+	shr.s32 	%r21, %r19, 9;
+	add.s32 	%r22, %r21, %r20;
+	mul.lo.s32 	%r23, %r22, 2304;
+	sub.s32 	%r24, %r13, %r23;
+	.loc	1 25 19                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:25:19
+	mul.hi.s32 	%r25, %r7, 954437177;
+	shr.u32 	%r26, %r25, 31;
+	shr.s32 	%r27, %r25, 16;
+	add.s32 	%r28, %r27, %r26;
+	.loc	1 27 39                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:27:39
+	shl.b32 	%r29, %r28, 7;
+	.loc	1 27 35                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:27:35
+	add.s32 	%r30, %r29, %r15;
+	add.s32 	%r31, %r29, %r18;
+	.loc	1 27 48                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:27:48
+	cvt.s64.s32 	%rd7, %r24;
+	mul.lo.s64 	%rd8, %rd6, %rd7;
+	.loc	1 27 30                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:27:30
+	shl.b64 	%rd9, %rd8, 1;
+	add.s64 	%rd10, %rd4, %rd9;
+	mad.wide.s32 	%rd1, %r30, 2, %rd10;
+	mad.wide.s32 	%rd2, %r31, 2, %rd10;
+	.loc	1 27 53                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:27:53
+	// begin inline asm
+	mov.u16 %rs1, 0x0;
+	ld.global.b16 { %rs1 }, [ %rd1 + 0 ];
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs2, 0x0;
+	ld.global.b16 { %rs2 }, [ %rd2 + 0 ];
+	// end inline asm
+	.loc	1 28 25                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:28:25
+	mad.wide.s32 	%rd3, %r7, 2, %rd5;
+	.loc	1 28 36                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:28:36
+	mov.b32 	%r1, {%rs1, %rs2};
+	// begin inline asm
+	st.global.b32 [ %rd3 + 0 ], { %r1 };
+	// end inline asm
+	.loc	1 28 4                          // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:28:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 51
+.b8 118
+.b8 106
+.b8 105
+.b8 108
+.b8 118
+.b8 99
+.b8 121
+.b8 55
+.b8 115
+.b8 100
+.b8 113
+.b8 99
+.b8 97
+.b8 120
+.b8 102
+.b8 115
+.b8 112
+.b8 102
+.b8 102
+.b8 97
+.b8 100
+.b8 98
+.b8 115
+.b8 114
+.b8 121
+.b8 51
+.b8 115
+.b8 113
+.b8 109
+.b8 52
+.b8 106
+.b8 55
+.b8 113
+.b8 112
+.b8 54
+.b8 117
+.b8 51
+.b8 116
+.b8 117
+.b8 115
+.b8 114
+.b8 54
+.b8 112
+.b8 51
+.b8 52
+.b8 115
+.b8 98
+.b8 105
+.b8 97
+.b8 114
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 51
+.b8 118
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.source b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.source
new file mode 100644
index 0000000000000000000000000000000000000000..ee6869b52e7acd12652977b4b7c9ec9bbff8e6ee
--- /dev/null
+++ b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.source
@@ -0,0 +1,91 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":18:0)
+#loc21 = loc("in_ptr0"(#loc))
+#loc22 = loc("out_ptr0"(#loc))
+#loc23 = loc("ks0"(#loc))
+#loc24 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 9437184 : i32 loc(#loc25)
+    %xoffset = tt.get_program_id x : i32 loc(#loc26)
+    %xoffset_1 = arith.constant 512 : i32 loc(#loc27)
+    %xoffset_2 = arith.constant 512 : i32 loc(#loc27)
+    %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc27)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc28)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc29)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc29)
+    %xmask = arith.constant true loc(#loc30)
+    %xmask_6 = arith.constant dense<true> : tensor<512xi1> loc(#loc30)
+    %x0 = arith.constant 128 : i32 loc(#loc31)
+    %x0_7 = arith.constant 128 : i32 loc(#loc31)
+    %x0_8 = arith.constant dense<128> : tensor<512xi32> loc(#loc31)
+    %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<512xi32> loc(#loc31)
+    %x1 = arith.constant 128 : i32 loc(#loc32)
+    %x1_10 = arith.constant 128 : i32 loc(#loc32)
+    %x1_11 = arith.constant dense<128> : tensor<512xi32> loc(#loc32)
+    %x1_12 = arith.divsi %xindex_5, %x1_11 : tensor<512xi32> loc(#loc32)
+    %x1_13 = arith.constant 2304 : i32 loc(#loc33)
+    %x1_14 = arith.constant 2304 : i32 loc(#loc33)
+    %x1_15 = arith.constant dense<2304> : tensor<512xi32> loc(#loc33)
+    %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<512xi32> loc(#loc33)
+    %x2 = arith.constant 294912 : i32 loc(#loc34)
+    %x2_17 = arith.constant 294912 : i32 loc(#loc34)
+    %x2_18 = arith.constant dense<294912> : tensor<512xi32> loc(#loc34)
+    %x2_19 = arith.divsi %xindex_5, %x2_18 : tensor<512xi32> loc(#loc34)
+    %tmp0 = arith.constant 128 : i32 loc(#loc35)
+    %tmp0_20 = arith.constant 128 : i32 loc(#loc35)
+    %tmp0_21 = arith.constant dense<128> : tensor<512xi32> loc(#loc35)
+    %tmp0_22 = arith.muli %tmp0_21, %x2_19 : tensor<512xi32> loc(#loc35)
+    %tmp0_23 = arith.addi %x0_9, %tmp0_22 : tensor<512xi32> loc(#loc36)
+    %tmp0_24 = arith.extsi %x1_16 : tensor<512xi32> to tensor<512xi64> loc(#loc37)
+    %tmp0_25 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc37)
+    %tmp0_26 = arith.muli %tmp0_25, %tmp0_24 : tensor<512xi64> loc(#loc37)
+    %tmp0_27 = arith.extsi %tmp0_23 : tensor<512xi32> to tensor<512xi64> loc(#loc38)
+    %tmp0_28 = arith.addi %tmp0_27, %tmp0_26 : tensor<512xi64> loc(#loc38)
+    %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc39)
+    %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<512x!tt.ptr<bf16>>, tensor<512xi64> loc(#loc39)
+    %tmp0_31 = tt.load %tmp0_30 : tensor<512x!tt.ptr<bf16>> loc(#loc40)
+    %tmp0_32 = arith.extf %tmp0_31 : tensor<512xbf16> to tensor<512xf32> loc(#loc41)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc18)
+    %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc18)
+    %2 = arith.truncf %tmp0_32 : tensor<512xf32> to tensor<512xbf16> loc(#loc19)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>> loc(#loc19)
+    tt.return loc(#loc20)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":22:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":23:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":24:21)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":24:28)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":25:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:39)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:35)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:48)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:44)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:30)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:53)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:62)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:25)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:36)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:4)
+#loc25 = loc("xnumel"(#loc1))
+#loc26 = loc("xoffset"(#loc2))
+#loc27 = loc("xoffset"(#loc3))
+#loc28 = loc("xindex"(#loc4))
+#loc29 = loc("xindex"(#loc5))
+#loc30 = loc("xmask"(#loc6))
+#loc31 = loc("x0"(#loc7))
+#loc32 = loc("x1"(#loc8))
+#loc33 = loc("x1"(#loc9))
+#loc34 = loc("x2"(#loc10))
+#loc35 = loc("tmp0"(#loc11))
+#loc36 = loc("tmp0"(#loc12))
+#loc37 = loc("tmp0"(#loc13))
+#loc38 = loc("tmp0"(#loc14))
+#loc39 = loc("tmp0"(#loc15))
+#loc40 = loc("tmp0"(#loc16))
+#loc41 = loc("tmp0"(#loc17))
diff --git a/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttgir b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..f1ea43d06080fe21d8e550b12521c39e21772378
--- /dev/null
+++ b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttgir
@@ -0,0 +1,69 @@
+#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":18:0)
+#loc19 = loc("in_ptr0"(#loc))
+#loc20 = loc("out_ptr0"(#loc))
+#loc21 = loc("ks0"(#loc))
+#loc22 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<128> : tensor<512xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<2304> : tensor<512xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<294912> : tensor<512xi32, #blocked> loc(#loc1)
+    %c512_i32 = arith.constant 512 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc23)
+    %xoffset_2 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc24)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc25)
+    %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<512xi32, #blocked> loc(#loc26)
+    %xindex_4 = arith.addi %xindex_3, %xindex : tensor<512xi32, #blocked> loc(#loc26)
+    %x0 = arith.remsi %xindex_4, %cst : tensor<512xi32, #blocked> loc(#loc27)
+    %x1 = arith.divsi %xindex_4, %cst : tensor<512xi32, #blocked> loc(#loc28)
+    %x1_5 = arith.remsi %x1, %cst_0 : tensor<512xi32, #blocked> loc(#loc29)
+    %x2 = arith.divsi %xindex_4, %cst_1 : tensor<512xi32, #blocked> loc(#loc30)
+    %tmp0 = arith.muli %x2, %cst : tensor<512xi32, #blocked> loc(#loc31)
+    %tmp0_6 = arith.addi %x0, %tmp0 : tensor<512xi32, #blocked> loc(#loc32)
+    %tmp0_7 = arith.extsi %x1_5 : tensor<512xi32, #blocked> to tensor<512xi64, #blocked> loc(#loc33)
+    %tmp0_8 = tt.splat %ks0 : i64 -> tensor<512xi64, #blocked> loc(#loc33)
+    %tmp0_9 = arith.muli %tmp0_8, %tmp0_7 : tensor<512xi64, #blocked> loc(#loc33)
+    %tmp0_10 = arith.extsi %tmp0_6 : tensor<512xi32, #blocked> to tensor<512xi64, #blocked> loc(#loc34)
+    %tmp0_11 = arith.addi %tmp0_10, %tmp0_9 : tensor<512xi64, #blocked> loc(#loc34)
+    %tmp0_12 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc35)
+    %tmp0_13 = tt.addptr %tmp0_12, %tmp0_11 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi64, #blocked> loc(#loc35)
+    %tmp0_14 = tt.load %tmp0_13 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc36)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc16)
+    %1 = tt.addptr %0, %xindex_4 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc16)
+    tt.store %1, %tmp0_14 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc17)
+    tt.return loc(#loc18)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":23:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":24:21)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":24:28)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":25:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:39)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:48)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:44)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:53)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:25)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:36)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:4)
+#loc23 = loc("xoffset"(#loc2))
+#loc24 = loc("xoffset"(#loc3))
+#loc25 = loc("xindex"(#loc4))
+#loc26 = loc("xindex"(#loc5))
+#loc27 = loc("x0"(#loc6))
+#loc28 = loc("x1"(#loc7))
+#loc29 = loc("x1"(#loc8))
+#loc30 = loc("x2"(#loc9))
+#loc31 = loc("tmp0"(#loc10))
+#loc32 = loc("tmp0"(#loc11))
+#loc33 = loc("tmp0"(#loc12))
+#loc34 = loc("tmp0"(#loc13))
+#loc35 = loc("tmp0"(#loc14))
+#loc36 = loc("tmp0"(#loc15))
diff --git a/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttir b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..5d28d519f32fcd08b0182402e63035db0aec1a21
--- /dev/null
+++ b/triton/JYGUFKIB7REG3CRGFOLJUXXXXNZKCJWILDAQA4L5F5UY64FVXYPA/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttir
@@ -0,0 +1,68 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":18:0)
+#loc19 = loc("in_ptr0"(#loc))
+#loc20 = loc("out_ptr0"(#loc))
+#loc21 = loc("ks0"(#loc))
+#loc22 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %x2 = arith.constant dense<294912> : tensor<512xi32> loc(#loc23)
+    %x1 = arith.constant dense<2304> : tensor<512xi32> loc(#loc24)
+    %cst = arith.constant dense<128> : tensor<512xi32> loc(#loc3)
+    %c512_i32 = arith.constant 512 : i32 loc(#loc3)
+    %xoffset = tt.get_program_id x : i32 loc(#loc25)
+    %xoffset_0 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc26)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc27)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<512xi32> loc(#loc28)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<512xi32> loc(#loc28)
+    %x0 = arith.remsi %xindex_2, %cst : tensor<512xi32> loc(#loc29)
+    %x1_3 = arith.divsi %xindex_2, %cst : tensor<512xi32> loc(#loc30)
+    %x1_4 = arith.remsi %x1_3, %x1 : tensor<512xi32> loc(#loc24)
+    %x2_5 = arith.divsi %xindex_2, %x2 : tensor<512xi32> loc(#loc23)
+    %tmp0 = arith.muli %x2_5, %cst : tensor<512xi32> loc(#loc31)
+    %tmp0_6 = arith.addi %x0, %tmp0 : tensor<512xi32> loc(#loc32)
+    %tmp0_7 = arith.extsi %x1_4 : tensor<512xi32> to tensor<512xi64> loc(#loc33)
+    %tmp0_8 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc33)
+    %tmp0_9 = arith.muli %tmp0_8, %tmp0_7 : tensor<512xi64> loc(#loc33)
+    %tmp0_10 = arith.extsi %tmp0_6 : tensor<512xi32> to tensor<512xi64> loc(#loc34)
+    %tmp0_11 = arith.addi %tmp0_10, %tmp0_9 : tensor<512xi64> loc(#loc34)
+    %tmp0_12 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc35)
+    %tmp0_13 = tt.addptr %tmp0_12, %tmp0_11 : tensor<512x!tt.ptr<bf16>>, tensor<512xi64> loc(#loc35)
+    %tmp0_14 = tt.load %tmp0_13 : tensor<512x!tt.ptr<bf16>> loc(#loc36)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc16)
+    %1 = tt.addptr %0, %xindex_2 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc16)
+    tt.store %1, %tmp0_14 : tensor<512x!tt.ptr<bf16>> loc(#loc17)
+    tt.return loc(#loc18)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":25:19)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":24:28)
+#loc3 = loc(unknown)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":20:28)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":20:33)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":21:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":21:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":23:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":24:21)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:39)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:48)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:44)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:53)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:25)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:36)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:4)
+#loc23 = loc("x2"(#loc1))
+#loc24 = loc("x1"(#loc2))
+#loc25 = loc("xoffset"(#loc4))
+#loc26 = loc("xoffset"(#loc5))
+#loc27 = loc("xindex"(#loc6))
+#loc28 = loc("xindex"(#loc7))
+#loc29 = loc("x0"(#loc8))
+#loc30 = loc("x1"(#loc9))
+#loc31 = loc("tmp0"(#loc10))
+#loc32 = loc("tmp0"(#loc11))
+#loc33 = loc("tmp0"(#loc12))
+#loc34 = loc("tmp0"(#loc13))
+#loc35 = loc("tmp0"(#loc14))
+#loc36 = loc("tmp0"(#loc15))
diff --git a/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..7a504b8d523622799a4e500e32ee3409f7634151
--- /dev/null
+++ b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused__fused_rms_norm_cat_view_2.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.source", "triton_poi_fused__fused_rms_norm_cat_view_2.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttir", "triton_poi_fused__fused_rms_norm_cat_view_2.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir", "triton_poi_fused__fused_rms_norm_cat_view_2.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.llir", "triton_poi_fused__fused_rms_norm_cat_view_2.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ptx", "triton_poi_fused__fused_rms_norm_cat_view_2.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.cubin", "triton_poi_fused__fused_rms_norm_cat_view_2.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.json"}}
\ No newline at end of file
diff --git a/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.cubin b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..4bbfb628bbda9fc4f510def2e7c473e8ff845e28
Binary files /dev/null and b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.cubin differ
diff --git a/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.json b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..4d400ea1b75f80f42a8d60372e019c868d172932
--- /dev/null
+++ b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.json
@@ -0,0 +1 @@
+{"hash": "57bf851d5f90e2c4d3f1b9eb74cf4a753207711de6b724a474200b48c8f12649", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 4096, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__fused_rms_norm_cat_view_2"}
\ No newline at end of file
diff --git a/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.llir b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.llir
new file mode 100644
index 0000000000000000000000000000000000000000..485e85db49eb1197102cae7b9b2285d2d718c6ba
--- /dev/null
+++ b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.llir
@@ -0,0 +1,770 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused__fused_rms_norm_cat_view_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 {
+  %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !dbg !8
+  %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z(), !dbg !9
+  %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y(), !dbg !10
+  %15 = mul nuw i32 %13, %14, !dbg !11
+  %16 = add nuw i32 %15, %12, !dbg !12
+  %17 = shl i32 %16, 5, !dbg !13
+  %18 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !14
+  %19 = lshr i32 %18, 2, !dbg !14
+  %20 = and i32 %19, 31, !dbg !14
+  %21 = and i32 %18, 7, !dbg !14
+  %22 = shl nuw nsw i32 %21, 2, !dbg !14
+  %23 = or disjoint i32 %17, %20, !dbg !15
+  %24 = or disjoint i32 %17, %22, !dbg !15
+  %25 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !16
+  %26 = shl i32 %25, 5, !dbg !17
+  %27 = shl nuw nsw i32 %18, 3, !dbg !18
+  %28 = and i32 %27, 24, !dbg !18
+  %29 = lshr i32 %18, 3, !dbg !18
+  %30 = and i32 %29, 15, !dbg !18
+  %31 = or disjoint i32 %28, %26, !dbg !19
+  %32 = or disjoint i32 %30, %26, !dbg !19
+  %33 = icmp slt i32 %31, 128, !dbg !20
+  %34 = icmp slt i32 %32, 128, !dbg !20
+  %35 = sdiv i32 %23, 32, !dbg !21
+  %36 = sdiv i32 %24, 32, !dbg !21
+  %37 = mul i32 %35, 32, !dbg !22
+  %.decomposed = sub i32 %23, %37, !dbg !22
+  %38 = mul i32 %36, 32, !dbg !22
+  %.decomposed53 = sub i32 %24, %38, !dbg !22
+  %39 = icmp slt i32 %23, 8192, !dbg !23
+  %40 = icmp slt i32 %24, 8192, !dbg !23
+  %41 = shl nsw i32 %.decomposed, 7, !dbg !24
+  %42 = add i32 %41, %31, !dbg !25
+  %43 = mul i32 %35, 12288, !dbg !26
+  %44 = add i32 %42, %43, !dbg !27
+  %45 = sext i32 %44 to i64, !dbg !28
+  %46 = getelementptr bfloat, ptr addrspace(1) %0, i64 %45, !dbg !28
+  %47 = and i1 %33, %39, !dbg !29
+  %48 = and i1 %34, %40, !dbg !29
+  %49 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !30
+  %50 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %46, i64 %49, i1 %47) #5, !dbg !30
+  %51 = extractvalue { i32, i32, i32, i32 } %50, 0, !dbg !30
+  %52 = extractvalue { i32, i32, i32, i32 } %50, 1, !dbg !30
+  %53 = extractvalue { i32, i32, i32, i32 } %50, 2, !dbg !30
+  %54 = extractvalue { i32, i32, i32, i32 } %50, 3, !dbg !30
+  %55 = insertelement <2 x i32> poison, i32 %51, i64 0, !dbg !30
+  %56 = insertelement <2 x i32> %55, i32 %53, i64 1, !dbg !30
+  %57 = lshr <2 x i32> %56, splat (i32 16), !dbg !30
+  %58 = trunc nuw <2 x i32> %57 to <2 x i16>, !dbg !30
+  %59 = insertelement <2 x i32> poison, i32 %52, i64 0, !dbg !30
+  %60 = insertelement <2 x i32> %59, i32 %54, i64 1, !dbg !30
+  %61 = lshr <2 x i32> %60, splat (i32 16), !dbg !30
+  %62 = trunc nuw <2 x i32> %61 to <2 x i16>, !dbg !30
+  %63 = shl nuw nsw i32 %18, 5, !dbg !31
+  %64 = and i32 %63, 480, !dbg !31
+  %65 = and i32 %18, 12, !dbg !31
+  %66 = shl nuw nsw i32 %65, 1, !dbg !31
+  %67 = and i32 %18, 112, !dbg !31
+  %68 = lshr exact i32 %67, 2, !dbg !31
+  %69 = or disjoint i32 %64, %66, !dbg !31
+  %70 = xor i32 %69, %68, !dbg !31
+  %71 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %70, !dbg !31
+  %72 = trunc i32 %51 to i16, !dbg !31
+  %73 = trunc i32 %53 to i16, !dbg !31
+  %74 = insertelement <2 x i16> poison, i16 %72, i64 0, !dbg !31
+  %75 = insertelement <2 x i16> %74, i16 %73, i64 1, !dbg !31
+  store <2 x i16> %75, ptr addrspace(3) %71, align 4, !dbg !31
+  %76 = xor i32 %70, 544, !dbg !31
+  %77 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %76, !dbg !31
+  store <2 x i16> %58, ptr addrspace(3) %77, align 4, !dbg !31
+  %78 = xor i32 %70, 1088, !dbg !31
+  %79 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %78, !dbg !31
+  %80 = trunc i32 %52 to i16, !dbg !31
+  %81 = trunc i32 %54 to i16, !dbg !31
+  %82 = insertelement <2 x i16> poison, i16 %80, i64 0, !dbg !31
+  %83 = insertelement <2 x i16> %82, i16 %81, i64 1, !dbg !31
+  store <2 x i16> %83, ptr addrspace(3) %79, align 4, !dbg !31
+  %84 = xor i32 %70, 1632, !dbg !31
+  %85 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %84, !dbg !31
+  store <2 x i16> %62, ptr addrspace(3) %85, align 4, !dbg !31
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !31
+  %86 = shl nuw nsw i32 %18, 6, !dbg !31
+  %87 = and i32 %86, 1536, !dbg !31
+  %88 = shl nuw nsw i32 %18, 2, !dbg !31
+  %89 = and i32 %88, 124, !dbg !31
+  %90 = and i32 %18, 32, !dbg !31
+  %91 = lshr exact i32 %90, 4, !dbg !31
+  %92 = and i32 %18, 64, !dbg !31
+  %93 = lshr exact i32 %92, 1, !dbg !31
+  %94 = or disjoint i32 %87, %89, !dbg !31
+  %95 = xor i32 %94, %93, !dbg !31
+  %96 = or disjoint i32 %95, %91, !dbg !31
+  %97 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %96, !dbg !31
+  %98 = load bfloat, ptr addrspace(3) %97, align 2, !dbg !31
+  %99 = xor i32 %96, 136, !dbg !31
+  %100 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %99, !dbg !31
+  %101 = load bfloat, ptr addrspace(3) %100, align 2, !dbg !31
+  %102 = xor i32 %96, 272, !dbg !31
+  %103 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %102, !dbg !31
+  %104 = load bfloat, ptr addrspace(3) %103, align 2, !dbg !31
+  %105 = xor i32 %96, 408, !dbg !31
+  %106 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %105, !dbg !31
+  %107 = load bfloat, ptr addrspace(3) %106, align 2, !dbg !31
+  %108 = xor i32 %96, 64, !dbg !31
+  %109 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %108, !dbg !31
+  %110 = load bfloat, ptr addrspace(3) %109, align 2, !dbg !31
+  %111 = xor i32 %96, 200, !dbg !31
+  %112 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %111, !dbg !31
+  %113 = load bfloat, ptr addrspace(3) %112, align 2, !dbg !31
+  %114 = xor i32 %96, 336, !dbg !31
+  %115 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %114, !dbg !31
+  %116 = load bfloat, ptr addrspace(3) %115, align 2, !dbg !31
+  %117 = xor i32 %96, 472, !dbg !31
+  %118 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %117, !dbg !31
+  %119 = load bfloat, ptr addrspace(3) %118, align 2, !dbg !31
+  %120 = fpext bfloat %98 to float, !dbg !31
+  %121 = fpext bfloat %101 to float, !dbg !31
+  %122 = fpext bfloat %104 to float, !dbg !31
+  %123 = fpext bfloat %107 to float, !dbg !31
+  %124 = fpext bfloat %110 to float, !dbg !31
+  %125 = fpext bfloat %113 to float, !dbg !31
+  %126 = fpext bfloat %116 to float, !dbg !31
+  %127 = fpext bfloat %119 to float, !dbg !31
+  %128 = sext i32 %24 to i64, !dbg !32
+  %129 = getelementptr float, ptr addrspace(1) %1, i64 %128, !dbg !32
+  %130 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !33
+  %131 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %129, i64 %130, i1 %48) #5, !dbg !33
+  %132 = extractvalue { i32, i32, i32, i32 } %131, 0, !dbg !33
+  %133 = extractvalue { i32, i32, i32, i32 } %131, 1, !dbg !33
+  %134 = extractvalue { i32, i32, i32, i32 } %131, 2, !dbg !33
+  %135 = extractvalue { i32, i32, i32, i32 } %131, 3, !dbg !33
+  %136 = bitcast i32 %132 to float, !dbg !33
+  %137 = bitcast i32 %133 to float, !dbg !33
+  %138 = bitcast i32 %134 to float, !dbg !33
+  %139 = bitcast i32 %135 to float, !dbg !33
+  %140 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !33
+  %141 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %129, i64 %140, i1 %48) #5, !dbg !33
+  %142 = extractvalue { i32, i32, i32, i32 } %141, 0, !dbg !33
+  %143 = extractvalue { i32, i32, i32, i32 } %141, 1, !dbg !33
+  %144 = extractvalue { i32, i32, i32, i32 } %141, 2, !dbg !33
+  %145 = extractvalue { i32, i32, i32, i32 } %141, 3, !dbg !33
+  %146 = bitcast i32 %142 to float, !dbg !33
+  %147 = bitcast i32 %143 to float, !dbg !33
+  %148 = bitcast i32 %144 to float, !dbg !33
+  %149 = bitcast i32 %145 to float, !dbg !33
+  %150 = tail call float @llvm.nvvm.div.full(float %136, float 1.280000e+02), !dbg !34
+  %151 = tail call float @llvm.nvvm.div.full(float %137, float 1.280000e+02), !dbg !34
+  %152 = tail call float @llvm.nvvm.div.full(float %138, float 1.280000e+02), !dbg !34
+  %153 = tail call float @llvm.nvvm.div.full(float %139, float 1.280000e+02), !dbg !34
+  %154 = tail call float @llvm.nvvm.div.full(float %146, float 1.280000e+02), !dbg !34
+  %155 = tail call float @llvm.nvvm.div.full(float %147, float 1.280000e+02), !dbg !34
+  %156 = tail call float @llvm.nvvm.div.full(float %148, float 1.280000e+02), !dbg !34
+  %157 = tail call float @llvm.nvvm.div.full(float %149, float 1.280000e+02), !dbg !34
+  %158 = fadd float %150, 0x3EB0C6F7A0000000, !dbg !35
+  %159 = fadd float %151, 0x3EB0C6F7A0000000, !dbg !35
+  %160 = fadd float %152, 0x3EB0C6F7A0000000, !dbg !35
+  %161 = fadd float %153, 0x3EB0C6F7A0000000, !dbg !35
+  %162 = fadd float %154, 0x3EB0C6F7A0000000, !dbg !35
+  %163 = fadd float %155, 0x3EB0C6F7A0000000, !dbg !35
+  %164 = fadd float %156, 0x3EB0C6F7A0000000, !dbg !35
+  %165 = fadd float %157, 0x3EB0C6F7A0000000, !dbg !35
+  %166 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36
+  %.not.i = icmp eq i32 %166, 0, !dbg !36
+  br i1 %.not.i, label %169, label %167, !dbg !36
+
+167:                                              ; preds = %11
+  %168 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %158), !dbg !36
+  br label %__nv_rsqrtf.exit, !dbg !36
+
+169:                                              ; preds = %11
+  %170 = tail call float @llvm.nvvm.rsqrt.approx.f(float %158), !dbg !36
+  br label %__nv_rsqrtf.exit, !dbg !36
+
+__nv_rsqrtf.exit:                                 ; preds = %167, %169
+  %.0.i = phi float [ %168, %167 ], [ %170, %169 ], !dbg !36
+  %171 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36
+  %.not.i8 = icmp eq i32 %171, 0, !dbg !36
+  br i1 %.not.i8, label %174, label %172, !dbg !36
+
+172:                                              ; preds = %__nv_rsqrtf.exit
+  %173 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %159), !dbg !36
+  br label %__nv_rsqrtf.exit10, !dbg !36
+
+174:                                              ; preds = %__nv_rsqrtf.exit
+  %175 = tail call float @llvm.nvvm.rsqrt.approx.f(float %159), !dbg !36
+  br label %__nv_rsqrtf.exit10, !dbg !36
+
+__nv_rsqrtf.exit10:                               ; preds = %172, %174
+  %.0.i9 = phi float [ %173, %172 ], [ %175, %174 ], !dbg !36
+  %176 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36
+  %.not.i11 = icmp eq i32 %176, 0, !dbg !36
+  br i1 %.not.i11, label %179, label %177, !dbg !36
+
+177:                                              ; preds = %__nv_rsqrtf.exit10
+  %178 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %160), !dbg !36
+  br label %__nv_rsqrtf.exit13, !dbg !36
+
+179:                                              ; preds = %__nv_rsqrtf.exit10
+  %180 = tail call float @llvm.nvvm.rsqrt.approx.f(float %160), !dbg !36
+  br label %__nv_rsqrtf.exit13, !dbg !36
+
+__nv_rsqrtf.exit13:                               ; preds = %177, %179
+  %.0.i12 = phi float [ %178, %177 ], [ %180, %179 ], !dbg !36
+  %181 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36
+  %.not.i14 = icmp eq i32 %181, 0, !dbg !36
+  br i1 %.not.i14, label %184, label %182, !dbg !36
+
+182:                                              ; preds = %__nv_rsqrtf.exit13
+  %183 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %161), !dbg !36
+  br label %__nv_rsqrtf.exit16, !dbg !36
+
+184:                                              ; preds = %__nv_rsqrtf.exit13
+  %185 = tail call float @llvm.nvvm.rsqrt.approx.f(float %161), !dbg !36
+  br label %__nv_rsqrtf.exit16, !dbg !36
+
+__nv_rsqrtf.exit16:                               ; preds = %182, %184
+  %.0.i15 = phi float [ %183, %182 ], [ %185, %184 ], !dbg !36
+  %186 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36
+  %.not.i17 = icmp eq i32 %186, 0, !dbg !36
+  br i1 %.not.i17, label %189, label %187, !dbg !36
+
+187:                                              ; preds = %__nv_rsqrtf.exit16
+  %188 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %162), !dbg !36
+  br label %__nv_rsqrtf.exit19, !dbg !36
+
+189:                                              ; preds = %__nv_rsqrtf.exit16
+  %190 = tail call float @llvm.nvvm.rsqrt.approx.f(float %162), !dbg !36
+  br label %__nv_rsqrtf.exit19, !dbg !36
+
+__nv_rsqrtf.exit19:                               ; preds = %187, %189
+  %.0.i18 = phi float [ %188, %187 ], [ %190, %189 ], !dbg !36
+  %191 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36
+  %.not.i20 = icmp eq i32 %191, 0, !dbg !36
+  br i1 %.not.i20, label %194, label %192, !dbg !36
+
+192:                                              ; preds = %__nv_rsqrtf.exit19
+  %193 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %163), !dbg !36
+  br label %__nv_rsqrtf.exit22, !dbg !36
+
+194:                                              ; preds = %__nv_rsqrtf.exit19
+  %195 = tail call float @llvm.nvvm.rsqrt.approx.f(float %163), !dbg !36
+  br label %__nv_rsqrtf.exit22, !dbg !36
+
+__nv_rsqrtf.exit22:                               ; preds = %192, %194
+  %.0.i21 = phi float [ %193, %192 ], [ %195, %194 ], !dbg !36
+  %196 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36
+  %.not.i23 = icmp eq i32 %196, 0, !dbg !36
+  br i1 %.not.i23, label %199, label %197, !dbg !36
+
+197:                                              ; preds = %__nv_rsqrtf.exit22
+  %198 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %164), !dbg !36
+  br label %__nv_rsqrtf.exit25, !dbg !36
+
+199:                                              ; preds = %__nv_rsqrtf.exit22
+  %200 = tail call float @llvm.nvvm.rsqrt.approx.f(float %164), !dbg !36
+  br label %__nv_rsqrtf.exit25, !dbg !36
+
+__nv_rsqrtf.exit25:                               ; preds = %197, %199
+  %.0.i24 = phi float [ %198, %197 ], [ %200, %199 ], !dbg !36
+  %201 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !36
+  %.not.i26 = icmp eq i32 %201, 0, !dbg !36
+  br i1 %.not.i26, label %204, label %202, !dbg !36
+
+202:                                              ; preds = %__nv_rsqrtf.exit25
+  %203 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %165), !dbg !36
+  br label %__nv_rsqrtf.exit28, !dbg !36
+
+204:                                              ; preds = %__nv_rsqrtf.exit25
+  %205 = tail call float @llvm.nvvm.rsqrt.approx.f(float %165), !dbg !36
+  br label %__nv_rsqrtf.exit28, !dbg !36
+
+__nv_rsqrtf.exit28:                               ; preds = %202, %204
+  %.0.i27 = phi float [ %203, %202 ], [ %205, %204 ], !dbg !36
+  %206 = fmul float %.0.i, %120, !dbg !37
+  %207 = fmul float %.0.i9, %121, !dbg !37
+  %208 = fmul float %.0.i12, %122, !dbg !37
+  %209 = fmul float %.0.i15, %123, !dbg !37
+  %210 = fmul float %.0.i18, %124, !dbg !37
+  %211 = fmul float %.0.i21, %125, !dbg !37
+  %212 = fmul float %.0.i24, %126, !dbg !37
+  %213 = fmul float %.0.i27, %127, !dbg !37
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37
+  %214 = shl nuw nsw i32 %21, 3, !dbg !37
+  %215 = and i32 %18, 8, !dbg !37
+  %216 = icmp eq i32 %215, 0, !dbg !37
+  %217 = select i1 %216, i32 0, i32 1088, !dbg !37
+  %218 = and i32 %18, 16, !dbg !37
+  %219 = icmp eq i32 %218, 0, !dbg !37
+  %220 = select i1 %219, i32 0, i32 2052, !dbg !37
+  %221 = shl nuw nsw i32 %90, 2, !dbg !37
+  %222 = or disjoint i32 %220, %221, !dbg !37
+  %223 = or disjoint i32 %217, %214, !dbg !37
+  %224 = xor i32 %223, %92, !dbg !37
+  %225 = or disjoint i32 %224, %222, !dbg !37
+  %226 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %225, !dbg !37
+  store float %206, ptr addrspace(3) %226, align 4, !dbg !37
+  %227 = xor i32 %225, 272, !dbg !37
+  %228 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %227, !dbg !37
+  store float %207, ptr addrspace(3) %228, align 4, !dbg !37
+  %229 = xor i32 %225, 544, !dbg !37
+  %230 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %229, !dbg !37
+  store float %208, ptr addrspace(3) %230, align 4, !dbg !37
+  %231 = xor i32 %225, 816, !dbg !37
+  %232 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %231, !dbg !37
+  store float %209, ptr addrspace(3) %232, align 4, !dbg !37
+  %233 = xor i32 %225, 4, !dbg !37
+  %234 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %233, !dbg !37
+  store float %210, ptr addrspace(3) %234, align 4, !dbg !37
+  %235 = xor i32 %225, 276, !dbg !37
+  %236 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %235, !dbg !37
+  store float %211, ptr addrspace(3) %236, align 4, !dbg !37
+  %237 = xor i32 %225, 548, !dbg !37
+  %238 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %237, !dbg !37
+  store float %212, ptr addrspace(3) %238, align 4, !dbg !37
+  %239 = xor i32 %225, 820, !dbg !37
+  %240 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %239, !dbg !37
+  store float %213, ptr addrspace(3) %240, align 4, !dbg !37
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37
+  %241 = and i32 %86, 832, !dbg !37
+  %242 = shl nuw nsw i32 %65, 2, !dbg !37
+  %243 = lshr exact i32 %67, 1, !dbg !37
+  %244 = shl nuw nsw i32 %18, 1, !dbg !37
+  %245 = and i32 %244, 4, !dbg !37
+  %246 = or disjoint i32 %241, %242, !dbg !37
+  %247 = xor i32 %246, %243, !dbg !37
+  %248 = or disjoint i32 %247, %245, !dbg !37
+  %249 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %248, !dbg !37
+  %250 = load float, ptr addrspace(3) %249, align 4, !dbg !37
+  %251 = getelementptr inbounds nuw i8, ptr addrspace(3) %249, i32 128, !dbg !37
+  %252 = load float, ptr addrspace(3) %251, align 4, !dbg !37
+  %253 = xor i32 %248, 1088, !dbg !37
+  %254 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %253, !dbg !37
+  %255 = load float, ptr addrspace(3) %254, align 4, !dbg !37
+  %256 = getelementptr inbounds nuw i8, ptr addrspace(3) %254, i32 128, !dbg !37
+  %257 = load float, ptr addrspace(3) %256, align 4, !dbg !37
+  %258 = xor i32 %248, 2052, !dbg !37
+  %259 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %258, !dbg !37
+  %260 = load float, ptr addrspace(3) %259, align 4, !dbg !37
+  %261 = getelementptr inbounds nuw i8, ptr addrspace(3) %259, i32 128, !dbg !37
+  %262 = load float, ptr addrspace(3) %261, align 4, !dbg !37
+  %263 = xor i32 %248, 3140, !dbg !37
+  %264 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %263, !dbg !37
+  %265 = load float, ptr addrspace(3) %264, align 4, !dbg !37
+  %266 = getelementptr inbounds nuw i8, ptr addrspace(3) %264, i32 128, !dbg !37
+  %267 = load float, ptr addrspace(3) %266, align 4, !dbg !37
+  %268 = sext i32 %31 to i64, !dbg !38
+  %269 = getelementptr bfloat, ptr addrspace(1) %2, i64 %268, !dbg !38
+  %270 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !39
+  %271 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %269, i64 %270, i1 %47) #5, !dbg !39
+  %272 = add i32 %44, -3145728, !dbg !40
+  %273 = sext i32 %272 to i64, !dbg !41
+  %274 = getelementptr bfloat, ptr addrspace(1) %3, i64 %273, !dbg !41
+  %275 = add i32 %17, -8192, !dbg !42
+  %276 = icmp ult i32 %275, 65536, !dbg !42
+  %277 = and i1 %33, %276, !dbg !42
+  %278 = and i1 %34, %276, !dbg !42
+  %279 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !43
+  %280 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %274, i64 %279, i1 %277) #5, !dbg !43
+  %281 = extractvalue { i32, i32, i32, i32 } %280, 0, !dbg !43
+  %282 = extractvalue { i32, i32, i32, i32 } %280, 1, !dbg !43
+  %283 = extractvalue { i32, i32, i32, i32 } %280, 2, !dbg !43
+  %284 = extractvalue { i32, i32, i32, i32 } %280, 3, !dbg !43
+  %285 = insertelement <2 x i32> poison, i32 %281, i64 0, !dbg !43
+  %286 = insertelement <2 x i32> %285, i32 %283, i64 1, !dbg !43
+  %287 = lshr <2 x i32> %286, splat (i32 16), !dbg !43
+  %288 = trunc nuw <2 x i32> %287 to <2 x i16>, !dbg !43
+  %289 = insertelement <2 x i32> poison, i32 %282, i64 0, !dbg !43
+  %290 = insertelement <2 x i32> %289, i32 %284, i64 1, !dbg !43
+  %291 = lshr <2 x i32> %290, splat (i32 16), !dbg !43
+  %292 = trunc nuw <2 x i32> %291 to <2 x i16>, !dbg !43
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !44
+  %293 = trunc i32 %281 to i16, !dbg !44
+  %294 = trunc i32 %283 to i16, !dbg !44
+  %295 = insertelement <2 x i16> poison, i16 %293, i64 0, !dbg !44
+  %296 = insertelement <2 x i16> %295, i16 %294, i64 1, !dbg !44
+  store <2 x i16> %296, ptr addrspace(3) %71, align 4, !dbg !44
+  store <2 x i16> %288, ptr addrspace(3) %77, align 4, !dbg !44
+  %297 = trunc i32 %282 to i16, !dbg !44
+  %298 = trunc i32 %284 to i16, !dbg !44
+  %299 = insertelement <2 x i16> poison, i16 %297, i64 0, !dbg !44
+  %300 = insertelement <2 x i16> %299, i16 %298, i64 1, !dbg !44
+  store <2 x i16> %300, ptr addrspace(3) %79, align 4, !dbg !44
+  store <2 x i16> %292, ptr addrspace(3) %85, align 4, !dbg !44
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !44
+  %301 = load bfloat, ptr addrspace(3) %97, align 2, !dbg !44
+  %302 = load bfloat, ptr addrspace(3) %100, align 2, !dbg !44
+  %303 = load bfloat, ptr addrspace(3) %103, align 2, !dbg !44
+  %304 = load bfloat, ptr addrspace(3) %106, align 2, !dbg !44
+  %305 = load bfloat, ptr addrspace(3) %109, align 2, !dbg !44
+  %306 = load bfloat, ptr addrspace(3) %112, align 2, !dbg !44
+  %307 = load bfloat, ptr addrspace(3) %115, align 2, !dbg !44
+  %308 = load bfloat, ptr addrspace(3) %118, align 2, !dbg !44
+  %309 = shl nsw i32 %36, 5, !dbg !45
+  %310 = add nsw i32 %.decomposed53, -8192, !dbg !45
+  %311 = add i32 %310, %309, !dbg !46
+  %312 = sext i32 %311 to i64, !dbg !47
+  %313 = getelementptr float, ptr addrspace(1) %4, i64 %312, !dbg !47
+  %314 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !48
+  %315 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %313, i64 %314, i1 %278) #5, !dbg !48
+  %316 = extractvalue { i32, i32, i32, i32 } %315, 0, !dbg !48
+  %317 = extractvalue { i32, i32, i32, i32 } %315, 1, !dbg !48
+  %318 = extractvalue { i32, i32, i32, i32 } %315, 2, !dbg !48
+  %319 = extractvalue { i32, i32, i32, i32 } %315, 3, !dbg !48
+  %320 = bitcast i32 %316 to float, !dbg !48
+  %321 = bitcast i32 %317 to float, !dbg !48
+  %322 = bitcast i32 %318 to float, !dbg !48
+  %323 = bitcast i32 %319 to float, !dbg !48
+  %324 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !48
+  %325 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %313, i64 %324, i1 %278) #5, !dbg !48
+  %326 = extractvalue { i32, i32, i32, i32 } %325, 0, !dbg !48
+  %327 = extractvalue { i32, i32, i32, i32 } %325, 1, !dbg !48
+  %328 = extractvalue { i32, i32, i32, i32 } %325, 2, !dbg !48
+  %329 = extractvalue { i32, i32, i32, i32 } %325, 3, !dbg !48
+  %330 = bitcast i32 %326 to float, !dbg !48
+  %331 = bitcast i32 %327 to float, !dbg !48
+  %332 = bitcast i32 %328 to float, !dbg !48
+  %333 = bitcast i32 %329 to float, !dbg !48
+  %334 = tail call float @llvm.nvvm.div.full(float %320, float 1.280000e+02), !dbg !49
+  %335 = tail call float @llvm.nvvm.div.full(float %321, float 1.280000e+02), !dbg !49
+  %336 = tail call float @llvm.nvvm.div.full(float %322, float 1.280000e+02), !dbg !49
+  %337 = tail call float @llvm.nvvm.div.full(float %323, float 1.280000e+02), !dbg !49
+  %338 = tail call float @llvm.nvvm.div.full(float %330, float 1.280000e+02), !dbg !49
+  %339 = tail call float @llvm.nvvm.div.full(float %331, float 1.280000e+02), !dbg !49
+  %340 = tail call float @llvm.nvvm.div.full(float %332, float 1.280000e+02), !dbg !49
+  %341 = tail call float @llvm.nvvm.div.full(float %333, float 1.280000e+02), !dbg !49
+  %342 = fadd float %334, 0x3EB0C6F7A0000000, !dbg !50
+  %343 = fadd float %335, 0x3EB0C6F7A0000000, !dbg !50
+  %344 = fadd float %336, 0x3EB0C6F7A0000000, !dbg !50
+  %345 = fadd float %337, 0x3EB0C6F7A0000000, !dbg !50
+  %346 = fadd float %338, 0x3EB0C6F7A0000000, !dbg !50
+  %347 = fadd float %339, 0x3EB0C6F7A0000000, !dbg !50
+  %348 = fadd float %340, 0x3EB0C6F7A0000000, !dbg !50
+  %349 = fadd float %341, 0x3EB0C6F7A0000000, !dbg !50
+  %350 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51
+  %.not.i29 = icmp eq i32 %350, 0, !dbg !51
+  br i1 %.not.i29, label %353, label %351, !dbg !51
+
+351:                                              ; preds = %__nv_rsqrtf.exit28
+  %352 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %342), !dbg !51
+  br label %__nv_rsqrtf.exit31, !dbg !51
+
+353:                                              ; preds = %__nv_rsqrtf.exit28
+  %354 = tail call float @llvm.nvvm.rsqrt.approx.f(float %342), !dbg !51
+  br label %__nv_rsqrtf.exit31, !dbg !51
+
+__nv_rsqrtf.exit31:                               ; preds = %351, %353
+  %.0.i30 = phi float [ %352, %351 ], [ %354, %353 ], !dbg !51
+  %355 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51
+  %.not.i32 = icmp eq i32 %355, 0, !dbg !51
+  br i1 %.not.i32, label %358, label %356, !dbg !51
+
+356:                                              ; preds = %__nv_rsqrtf.exit31
+  %357 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %343), !dbg !51
+  br label %__nv_rsqrtf.exit34, !dbg !51
+
+358:                                              ; preds = %__nv_rsqrtf.exit31
+  %359 = tail call float @llvm.nvvm.rsqrt.approx.f(float %343), !dbg !51
+  br label %__nv_rsqrtf.exit34, !dbg !51
+
+__nv_rsqrtf.exit34:                               ; preds = %356, %358
+  %.0.i33 = phi float [ %357, %356 ], [ %359, %358 ], !dbg !51
+  %360 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51
+  %.not.i35 = icmp eq i32 %360, 0, !dbg !51
+  br i1 %.not.i35, label %363, label %361, !dbg !51
+
+361:                                              ; preds = %__nv_rsqrtf.exit34
+  %362 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %344), !dbg !51
+  br label %__nv_rsqrtf.exit37, !dbg !51
+
+363:                                              ; preds = %__nv_rsqrtf.exit34
+  %364 = tail call float @llvm.nvvm.rsqrt.approx.f(float %344), !dbg !51
+  br label %__nv_rsqrtf.exit37, !dbg !51
+
+__nv_rsqrtf.exit37:                               ; preds = %361, %363
+  %.0.i36 = phi float [ %362, %361 ], [ %364, %363 ], !dbg !51
+  %365 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51
+  %.not.i38 = icmp eq i32 %365, 0, !dbg !51
+  br i1 %.not.i38, label %368, label %366, !dbg !51
+
+366:                                              ; preds = %__nv_rsqrtf.exit37
+  %367 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %345), !dbg !51
+  br label %__nv_rsqrtf.exit40, !dbg !51
+
+368:                                              ; preds = %__nv_rsqrtf.exit37
+  %369 = tail call float @llvm.nvvm.rsqrt.approx.f(float %345), !dbg !51
+  br label %__nv_rsqrtf.exit40, !dbg !51
+
+__nv_rsqrtf.exit40:                               ; preds = %366, %368
+  %.0.i39 = phi float [ %367, %366 ], [ %369, %368 ], !dbg !51
+  %370 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51
+  %.not.i41 = icmp eq i32 %370, 0, !dbg !51
+  br i1 %.not.i41, label %373, label %371, !dbg !51
+
+371:                                              ; preds = %__nv_rsqrtf.exit40
+  %372 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %346), !dbg !51
+  br label %__nv_rsqrtf.exit43, !dbg !51
+
+373:                                              ; preds = %__nv_rsqrtf.exit40
+  %374 = tail call float @llvm.nvvm.rsqrt.approx.f(float %346), !dbg !51
+  br label %__nv_rsqrtf.exit43, !dbg !51
+
+__nv_rsqrtf.exit43:                               ; preds = %371, %373
+  %.0.i42 = phi float [ %372, %371 ], [ %374, %373 ], !dbg !51
+  %375 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51
+  %.not.i44 = icmp eq i32 %375, 0, !dbg !51
+  br i1 %.not.i44, label %378, label %376, !dbg !51
+
+376:                                              ; preds = %__nv_rsqrtf.exit43
+  %377 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %347), !dbg !51
+  br label %__nv_rsqrtf.exit46, !dbg !51
+
+378:                                              ; preds = %__nv_rsqrtf.exit43
+  %379 = tail call float @llvm.nvvm.rsqrt.approx.f(float %347), !dbg !51
+  br label %__nv_rsqrtf.exit46, !dbg !51
+
+__nv_rsqrtf.exit46:                               ; preds = %376, %378
+  %.0.i45 = phi float [ %377, %376 ], [ %379, %378 ], !dbg !51
+  %380 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51
+  %.not.i47 = icmp eq i32 %380, 0, !dbg !51
+  br i1 %.not.i47, label %383, label %381, !dbg !51
+
+381:                                              ; preds = %__nv_rsqrtf.exit46
+  %382 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %348), !dbg !51
+  br label %__nv_rsqrtf.exit49, !dbg !51
+
+383:                                              ; preds = %__nv_rsqrtf.exit46
+  %384 = tail call float @llvm.nvvm.rsqrt.approx.f(float %348), !dbg !51
+  br label %__nv_rsqrtf.exit49, !dbg !51
+
+__nv_rsqrtf.exit49:                               ; preds = %381, %383
+  %.0.i48 = phi float [ %382, %381 ], [ %384, %383 ], !dbg !51
+  %385 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !51
+  %.not.i50 = icmp eq i32 %385, 0, !dbg !51
+  br i1 %.not.i50, label %388, label %386, !dbg !51
+
+386:                                              ; preds = %__nv_rsqrtf.exit49
+  %387 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %349), !dbg !51
+  br label %__nv_rsqrtf.exit52, !dbg !51
+
+388:                                              ; preds = %__nv_rsqrtf.exit49
+  %389 = tail call float @llvm.nvvm.rsqrt.approx.f(float %349), !dbg !51
+  br label %__nv_rsqrtf.exit52, !dbg !51
+
+__nv_rsqrtf.exit52:                               ; preds = %386, %388
+  %.0.i51 = phi float [ %387, %386 ], [ %389, %388 ], !dbg !51
+  %390 = fpext bfloat %308 to float, !dbg !44
+  %391 = fpext bfloat %307 to float, !dbg !44
+  %392 = fpext bfloat %306 to float, !dbg !44
+  %393 = fpext bfloat %305 to float, !dbg !44
+  %394 = fpext bfloat %304 to float, !dbg !44
+  %395 = fpext bfloat %303 to float, !dbg !44
+  %396 = fpext bfloat %302 to float, !dbg !44
+  %397 = fpext bfloat %301 to float, !dbg !44
+  %398 = extractvalue { i32, i32, i32, i32 } %271, 3, !dbg !39
+  %399 = bitcast i32 %398 to <2 x bfloat>, !dbg !39
+  %400 = extractvalue { i32, i32, i32, i32 } %271, 2, !dbg !39
+  %401 = bitcast i32 %400 to <2 x bfloat>, !dbg !39
+  %402 = extractvalue { i32, i32, i32, i32 } %271, 1, !dbg !39
+  %403 = bitcast i32 %402 to <2 x bfloat>, !dbg !39
+  %404 = extractvalue { i32, i32, i32, i32 } %271, 0, !dbg !39
+  %405 = bitcast i32 %404 to <2 x bfloat>, !dbg !39
+  %406 = icmp slt i32 %23, 73728, !dbg !52
+  %407 = fmul float %.0.i30, %397, !dbg !53
+  %408 = fmul float %.0.i33, %396, !dbg !53
+  %409 = fmul float %.0.i36, %395, !dbg !53
+  %410 = fmul float %.0.i39, %394, !dbg !53
+  %411 = fmul float %.0.i42, %393, !dbg !53
+  %412 = fmul float %.0.i45, %392, !dbg !53
+  %413 = fmul float %.0.i48, %391, !dbg !53
+  %414 = fmul float %.0.i51, %390, !dbg !53
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53
+  store float %407, ptr addrspace(3) %226, align 4, !dbg !53
+  store float %408, ptr addrspace(3) %228, align 4, !dbg !53
+  store float %409, ptr addrspace(3) %230, align 4, !dbg !53
+  store float %410, ptr addrspace(3) %232, align 4, !dbg !53
+  store float %411, ptr addrspace(3) %234, align 4, !dbg !53
+  store float %412, ptr addrspace(3) %236, align 4, !dbg !53
+  store float %413, ptr addrspace(3) %238, align 4, !dbg !53
+  store float %414, ptr addrspace(3) %240, align 4, !dbg !53
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53
+  %415 = load float, ptr addrspace(3) %249, align 4, !dbg !53
+  %416 = load float, ptr addrspace(3) %251, align 4, !dbg !53
+  %417 = load float, ptr addrspace(3) %254, align 4, !dbg !53
+  %418 = load float, ptr addrspace(3) %256, align 4, !dbg !53
+  %419 = load float, ptr addrspace(3) %259, align 4, !dbg !53
+  %420 = load float, ptr addrspace(3) %261, align 4, !dbg !53
+  %421 = load float, ptr addrspace(3) %264, align 4, !dbg !53
+  %422 = load float, ptr addrspace(3) %266, align 4, !dbg !53
+  %423 = getelementptr bfloat, ptr addrspace(1) %5, i64 %268, !dbg !54
+  %424 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !55
+  %425 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %423, i64 %424, i1 %277) #5, !dbg !55
+  %426 = extractvalue { i32, i32, i32, i32 } %425, 0, !dbg !55
+  %427 = bitcast i32 %426 to <2 x bfloat>, !dbg !55
+  %428 = extractvalue { i32, i32, i32, i32 } %425, 1, !dbg !55
+  %429 = bitcast i32 %428 to <2 x bfloat>, !dbg !55
+  %430 = extractvalue { i32, i32, i32, i32 } %425, 2, !dbg !55
+  %431 = bitcast i32 %430 to <2 x bfloat>, !dbg !55
+  %432 = extractvalue { i32, i32, i32, i32 } %425, 3, !dbg !55
+  %433 = bitcast i32 %432 to <2 x bfloat>, !dbg !55
+  %434 = shl i32 %23, 7, !dbg !56
+  %435 = add i32 %434, %31, !dbg !57
+  %436 = sext i32 %435 to i64, !dbg !58
+  %437 = getelementptr bfloat, ptr addrspace(1) %6, i64 %436, !dbg !58
+  %438 = and i1 %33, %406, !dbg !59
+  %439 = fpext <2 x bfloat> %405 to <2 x float>, !dbg !60
+  %440 = insertelement <2 x float> poison, float %250, i64 0, !dbg !61
+  %441 = insertelement <2 x float> %440, float %255, i64 1, !dbg !61
+  %442 = fmul <2 x float> %441, %439, !dbg !61
+  %443 = fpext <2 x bfloat> %427 to <2 x float>, !dbg !62
+  %444 = insertelement <2 x float> poison, float %415, i64 0, !dbg !63
+  %445 = insertelement <2 x float> %444, float %417, i64 1, !dbg !63
+  %446 = fmul <2 x float> %445, %443, !dbg !63
+  %447 = insertelement <2 x i1> poison, i1 %39, i64 0, !dbg !64
+  %448 = shufflevector <2 x i1> %447, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !64
+  %449 = select <2 x i1> %448, <2 x float> %442, <2 x float> %446, !dbg !64
+  %450 = fptrunc <2 x float> %449 to <2 x bfloat>, !dbg !65
+  %451 = fpext <2 x bfloat> %403 to <2 x float>, !dbg !60
+  %452 = insertelement <2 x float> poison, float %260, i64 0, !dbg !61
+  %453 = insertelement <2 x float> %452, float %265, i64 1, !dbg !61
+  %454 = fmul <2 x float> %453, %451, !dbg !61
+  %455 = fpext <2 x bfloat> %429 to <2 x float>, !dbg !62
+  %456 = insertelement <2 x float> poison, float %419, i64 0, !dbg !63
+  %457 = insertelement <2 x float> %456, float %421, i64 1, !dbg !63
+  %458 = fmul <2 x float> %457, %455, !dbg !63
+  %459 = select <2 x i1> %448, <2 x float> %454, <2 x float> %458, !dbg !64
+  %460 = fptrunc <2 x float> %459 to <2 x bfloat>, !dbg !65
+  %461 = fpext <2 x bfloat> %401 to <2 x float>, !dbg !60
+  %462 = insertelement <2 x float> poison, float %252, i64 0, !dbg !61
+  %463 = insertelement <2 x float> %462, float %257, i64 1, !dbg !61
+  %464 = fmul <2 x float> %463, %461, !dbg !61
+  %465 = fpext <2 x bfloat> %431 to <2 x float>, !dbg !62
+  %466 = insertelement <2 x float> poison, float %416, i64 0, !dbg !63
+  %467 = insertelement <2 x float> %466, float %418, i64 1, !dbg !63
+  %468 = fmul <2 x float> %467, %465, !dbg !63
+  %469 = select <2 x i1> %448, <2 x float> %464, <2 x float> %468, !dbg !64
+  %470 = fptrunc <2 x float> %469 to <2 x bfloat>, !dbg !65
+  %471 = fpext <2 x bfloat> %399 to <2 x float>, !dbg !60
+  %472 = insertelement <2 x float> poison, float %262, i64 0, !dbg !61
+  %473 = insertelement <2 x float> %472, float %267, i64 1, !dbg !61
+  %474 = fmul <2 x float> %473, %471, !dbg !61
+  %475 = fpext <2 x bfloat> %433 to <2 x float>, !dbg !62
+  %476 = insertelement <2 x float> poison, float %420, i64 0, !dbg !63
+  %477 = insertelement <2 x float> %476, float %422, i64 1, !dbg !63
+  %478 = fmul <2 x float> %477, %475, !dbg !63
+  %479 = select <2 x i1> %448, <2 x float> %474, <2 x float> %478, !dbg !64
+  %480 = fptrunc <2 x float> %479 to <2 x bfloat>, !dbg !65
+  %481 = bitcast <2 x bfloat> %450 to i32, !dbg !65
+  %482 = bitcast <2 x bfloat> %460 to i32, !dbg !65
+  %483 = bitcast <2 x bfloat> %470 to i32, !dbg !65
+  %484 = bitcast <2 x bfloat> %480 to i32, !dbg !65
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %481, i32 %482, i32 %483, i32 %484, ptr addrspace(1) %437, i1 %438) #5, !dbg !65
+  ret void, !dbg !66
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #3
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #3
+
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind }
+attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!5 = distinct !DISubprogram(name: "triton_poi_fused__fused_rms_norm_cat_view_2", linkageName: "triton_poi_fused__fused_rms_norm_cat_view_2", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 21, column: 29, scope: !5)
+!9 = !DILocation(line: 21, column: 48, scope: !5)
+!10 = !DILocation(line: 21, column: 69, scope: !5)
+!11 = !DILocation(line: 21, column: 53, scope: !5)
+!12 = !DILocation(line: 21, column: 34, scope: !5)
+!13 = !DILocation(line: 21, column: 75, scope: !5)
+!14 = !DILocation(line: 22, column: 44, scope: !5)
+!15 = !DILocation(line: 22, column: 23, scope: !5)
+!16 = !DILocation(line: 24, column: 28, scope: !5)
+!17 = !DILocation(line: 24, column: 33, scope: !5)
+!18 = !DILocation(line: 25, column: 44, scope: !5)
+!19 = !DILocation(line: 25, column: 23, scope: !5)
+!20 = !DILocation(line: 26, column: 21, scope: !5)
+!21 = !DILocation(line: 27, column: 19, scope: !5)
+!22 = !DILocation(line: 29, column: 19, scope: !5)
+!23 = !DILocation(line: 35, column: 18, scope: !5)
+!24 = !DILocation(line: 36, column: 39, scope: !5)
+!25 = !DILocation(line: 36, column: 35, scope: !5)
+!26 = !DILocation(line: 36, column: 51, scope: !5)
+!27 = !DILocation(line: 36, column: 44, scope: !5)
+!28 = !DILocation(line: 36, column: 30, scope: !5)
+!29 = !DILocation(line: 36, column: 64, scope: !5)
+!30 = !DILocation(line: 36, column: 57, scope: !5)
+!31 = !DILocation(line: 36, column: 123, scope: !5)
+!32 = !DILocation(line: 38, column: 30, scope: !5)
+!33 = !DILocation(line: 38, column: 80, scope: !5)
+!34 = !DILocation(line: 40, column: 19, scope: !5)
+!35 = !DILocation(line: 42, column: 19, scope: !5)
+!36 = !DILocation(line: 43, column: 28, scope: !5)
+!37 = !DILocation(line: 44, column: 19, scope: !5)
+!38 = !DILocation(line: 45, column: 31, scope: !5)
+!39 = !DILocation(line: 45, column: 71, scope: !5)
+!40 = !DILocation(line: 54, column: 45, scope: !5)
+!41 = !DILocation(line: 54, column: 31, scope: !5)
+!42 = !DILocation(line: 54, column: 83, scope: !5)
+!43 = !DILocation(line: 54, column: 67, scope: !5)
+!44 = !DILocation(line: 54, column: 134, scope: !5)
+!45 = !DILocation(line: 56, column: 56, scope: !5)
+!46 = !DILocation(line: 56, column: 52, scope: !5)
+!47 = !DILocation(line: 56, column: 31, scope: !5)
+!48 = !DILocation(line: 56, column: 90, scope: !5)
+!49 = !DILocation(line: 58, column: 21, scope: !5)
+!50 = !DILocation(line: 60, column: 20, scope: !5)
+!51 = !DILocation(line: 61, column: 28, scope: !5)
+!52 = !DILocation(line: 23, column: 21, scope: !5)
+!53 = !DILocation(line: 62, column: 20, scope: !5)
+!54 = !DILocation(line: 63, column: 31, scope: !5)
+!55 = !DILocation(line: 63, column: 71, scope: !5)
+!56 = !DILocation(line: 70, column: 34, scope: !5)
+!57 = !DILocation(line: 70, column: 30, scope: !5)
+!58 = !DILocation(line: 70, column: 25, scope: !5)
+!59 = !DILocation(line: 70, column: 54, scope: !5)
+!60 = !DILocation(line: 45, column: 137, scope: !5)
+!61 = !DILocation(line: 47, column: 20, scope: !5)
+!62 = !DILocation(line: 63, column: 138, scope: !5)
+!63 = !DILocation(line: 65, column: 20, scope: !5)
+!64 = !DILocation(line: 0, scope: !5)
+!65 = !DILocation(line: 70, column: 46, scope: !5)
+!66 = !DILocation(line: 70, column: 4, scope: !5)
diff --git a/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ptx b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..10806715d04c6a3d4436851dc528408993296919
--- /dev/null
+++ b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ptx
@@ -0,0 +1,796 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused__fused_rms_norm_cat_view_2 // -- Begin function triton_poi_fused__fused_rms_norm_cat_view_2
+.extern .shared .align 16 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
+                                        // @triton_poi_fused__fused_rms_norm_cat_view_2
+.visible .entry triton_poi_fused__fused_rms_norm_cat_view_2(
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_1,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_4,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_5,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_6,
+	.param .u32 triton_poi_fused__fused_rms_norm_cat_view_2_param_7,
+	.param .u32 triton_poi_fused__fused_rms_norm_cat_view_2_param_8,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_9,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_10
+)
+.reqntid 128
+{
+	.reg .pred 	%p<12>;
+	.reg .b16 	%rs<33>;
+	.reg .b32 	%r<297>;
+	.reg .b64 	%rd<24>;
+	.loc	1 18 0                          // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:18:0
+
+// %bb.0:                               // %__nv_rsqrtf.exit
+	ld.param.b64 	%rd16, [triton_poi_fused__fused_rms_norm_cat_view_2_param_0];
+	ld.param.b64 	%rd17, [triton_poi_fused__fused_rms_norm_cat_view_2_param_1];
+$L__tmp0:
+	.loc	1 21 29                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:29
+	mov.u32 	%r38, %ctaid.y;
+	ld.param.b64 	%rd18, [triton_poi_fused__fused_rms_norm_cat_view_2_param_2];
+	.loc	1 21 48                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:48
+	mov.u32 	%r39, %ctaid.z;
+	ld.param.b64 	%rd19, [triton_poi_fused__fused_rms_norm_cat_view_2_param_3];
+	.loc	1 21 69                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:69
+	mov.u32 	%r40, %nctaid.y;
+	ld.param.b64 	%rd20, [triton_poi_fused__fused_rms_norm_cat_view_2_param_4];
+	.loc	1 21 34                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:34
+	mad.lo.s32 	%r41, %r39, %r40, %r38;
+	ld.param.b64 	%rd21, [triton_poi_fused__fused_rms_norm_cat_view_2_param_5];
+	.loc	1 21 75                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:75
+	shl.b32 	%r42, %r41, 5;
+	ld.param.b64 	%rd22, [triton_poi_fused__fused_rms_norm_cat_view_2_param_6];
+	.loc	1 22 44                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:44
+	mov.u32 	%r43, %tid.x;
+	bfe.u32 	%r44, %r43, 2, 5;
+	and.b32 	%r45, %r43, 7;
+	shl.b32 	%r46, %r45, 2;
+	.loc	1 22 23                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:23
+	or.b32 	%r47, %r42, %r44;
+	or.b32 	%r48, %r42, %r46;
+	.loc	1 24 28                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:24:28
+	mov.u32 	%r49, %ctaid.x;
+	.loc	1 24 33                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:24:33
+	shl.b32 	%r50, %r49, 5;
+	.loc	1 25 44                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:25:44
+	shl.b32 	%r51, %r43, 3;
+	and.b32 	%r52, %r51, 24;
+	bfe.u32 	%r53, %r43, 3, 4;
+	.loc	1 25 23                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:25:23
+	or.b32 	%r54, %r52, %r50;
+	or.b32 	%r55, %r53, %r50;
+	.loc	1 26 21                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:26:21
+	setp.lt.s32 	%p6, %r54, 128;
+	setp.lt.s32 	%p7, %r55, 128;
+	.loc	1 27 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:27:19
+	bfe.s32 	%r56, %r41, 26, 1;
+	shr.u32 	%r57, %r56, 27;
+	add.s32 	%r58, %r47, %r57;
+	shr.u32 	%r59, %r58, 5;
+	.loc	1 29 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:29:19
+	and.b32 	%r60, %r58, 33554400;
+	sub.s32 	%r61, %r47, %r60;
+	.loc	1 35 18                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:35:18
+	setp.lt.s32 	%p8, %r47, 8192;
+	setp.lt.s32 	%p9, %r48, 8192;
+	.loc	1 36 39                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:39
+	shl.b32 	%r62, %r61, 7;
+	.loc	1 36 35                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:35
+	add.s32 	%r63, %r62, %r54;
+	.loc	1 36 44                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:44
+	mad.lo.s32 	%r64, %r59, 12288, %r63;
+	.loc	1 36 30                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:30
+	mad.wide.s32 	%rd1, %r64, 2, %rd16;
+	.loc	1 36 64                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:64
+	and.pred 	%p1, %p6, %p8;
+	and.pred 	%p2, %p7, %p9;
+	.loc	1 36 57                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:57
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r5, 0;
+	// begin inline asm
+	mov.u32 %r1, %r5;
+	mov.u32 %r2, %r5;
+	mov.u32 %r3, %r5;
+	mov.u32 %r4, %r5;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	prmt.b32 	%r65, %r1, %r3, 0x7632U;
+	prmt.b32 	%r66, %r2, %r4, 0x7632U;
+	.loc	1 36 123                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:123
+	shl.b32 	%r67, %r43, 5;
+	and.b32 	%r68, %r67, 480;
+	and.b32 	%r69, %r43, 12;
+	shl.b32 	%r70, %r69, 1;
+	and.b32 	%r71, %r43, 112;
+	shr.u32 	%r72, %r71, 2;
+	or.b32 	%r73, %r68, %r70;
+	xor.b32 	%r74, %r73, %r72;
+	mov.b32 	%r75, global_smem;
+	add.s32 	%r76, %r75, %r74;
+	prmt.b32 	%r77, %r1, %r3, 0x5410U;
+	st.shared.b32 	[%r76], %r77;
+	xor.b32 	%r78, %r74, 32;
+	add.s32 	%r79, %r75, %r78;
+	st.shared.b32 	[%r79+512], %r65;
+	xor.b32 	%r80, %r74, 64;
+	add.s32 	%r81, %r75, %r80;
+	prmt.b32 	%r82, %r2, %r4, 0x5410U;
+	st.shared.b32 	[%r81+1024], %r82;
+	xor.b32 	%r83, %r74, 96;
+	add.s32 	%r84, %r75, %r83;
+	st.shared.b32 	[%r84+1536], %r66;
+	bar.sync 	0;
+	shl.b32 	%r85, %r43, 6;
+	and.b32 	%r86, %r85, 1536;
+	shl.b32 	%r87, %r43, 2;
+	and.b32 	%r88, %r87, 124;
+	and.b32 	%r89, %r43, 32;
+	shr.u32 	%r90, %r89, 4;
+	and.b32 	%r91, %r43, 64;
+	shr.u32 	%r92, %r91, 1;
+	or.b32 	%r93, %r86, %r88;
+	xor.b32 	%r94, %r93, %r92;
+	or.b32 	%r95, %r94, %r90;
+	add.s32 	%r96, %r75, %r95;
+	ld.shared.b16 	%rs1, [%r96];
+	xor.b32 	%r97, %r95, 8;
+	add.s32 	%r98, %r75, %r97;
+	ld.shared.b16 	%rs2, [%r98+128];
+	xor.b32 	%r99, %r95, 16;
+	add.s32 	%r100, %r75, %r99;
+	ld.shared.b16 	%rs3, [%r100+256];
+	xor.b32 	%r101, %r95, 24;
+	add.s32 	%r102, %r75, %r101;
+	ld.shared.b16 	%rs4, [%r102+384];
+	xor.b32 	%r103, %r95, 64;
+	add.s32 	%r104, %r75, %r103;
+	ld.shared.b16 	%rs5, [%r104];
+	xor.b32 	%r105, %r95, 72;
+	add.s32 	%r106, %r75, %r105;
+	ld.shared.b16 	%rs6, [%r106+128];
+	xor.b32 	%r107, %r95, 80;
+	add.s32 	%r108, %r75, %r107;
+	ld.shared.b16 	%rs7, [%r108+256];
+	xor.b32 	%r109, %r95, 88;
+	add.s32 	%r110, %r75, %r109;
+	ld.shared.b16 	%rs8, [%r110+384];
+	cvt.f32.bf16 	%r111, %rs1;
+	cvt.f32.bf16 	%r112, %rs2;
+	cvt.f32.bf16 	%r113, %rs3;
+	cvt.f32.bf16 	%r114, %rs4;
+	cvt.f32.bf16 	%r115, %rs5;
+	cvt.f32.bf16 	%r116, %rs6;
+	cvt.f32.bf16 	%r117, %rs7;
+	cvt.f32.bf16 	%r118, %rs8;
+	.loc	1 38 30                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:38:30
+	mad.wide.s32 	%rd3, %r48, 4, %rd17;
+	.loc	1 38 80                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:38:80
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r6, %r5;
+	mov.u32 %r7, %r5;
+	mov.u32 %r8, %r5;
+	mov.u32 %r9, %r5;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd3 + 0 ], %rd4;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd5, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd5, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r10, %r5;
+	mov.u32 %r11, %r5;
+	mov.u32 %r12, %r5;
+	mov.u32 %r13, %r5;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd3 + 0 ], %rd5;
+	// end inline asm
+	mov.b32 	%r119, 0f43000000;
+	.loc	1 40 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:40:19
+	div.full.f32 	%r120, %r6, %r119;
+	div.full.f32 	%r121, %r7, %r119;
+	div.full.f32 	%r122, %r8, %r119;
+	div.full.f32 	%r123, %r9, %r119;
+	div.full.f32 	%r124, %r10, %r119;
+	div.full.f32 	%r125, %r11, %r119;
+	div.full.f32 	%r126, %r12, %r119;
+	div.full.f32 	%r127, %r13, %r119;
+	.loc	1 42 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:42:19
+	add.f32 	%r128, %r120, 0f358637BD;
+	add.f32 	%r129, %r121, 0f358637BD;
+	add.f32 	%r130, %r122, 0f358637BD;
+	add.f32 	%r131, %r123, 0f358637BD;
+	add.f32 	%r132, %r124, 0f358637BD;
+	add.f32 	%r133, %r125, 0f358637BD;
+	add.f32 	%r134, %r126, 0f358637BD;
+	add.f32 	%r135, %r127, 0f358637BD;
+	.loc	1 43 28                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:43:28
+	rsqrt.approx.ftz.f32 	%r136, %r128;
+	rsqrt.approx.ftz.f32 	%r137, %r129;
+	rsqrt.approx.ftz.f32 	%r138, %r130;
+	rsqrt.approx.ftz.f32 	%r139, %r131;
+	rsqrt.approx.ftz.f32 	%r140, %r132;
+	rsqrt.approx.ftz.f32 	%r141, %r133;
+	rsqrt.approx.ftz.f32 	%r142, %r134;
+	rsqrt.approx.ftz.f32 	%r143, %r135;
+	.loc	1 44 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:44:19
+	mul.f32 	%r144, %r136, %r111;
+	mul.f32 	%r145, %r137, %r112;
+	mul.f32 	%r146, %r138, %r113;
+	mul.f32 	%r147, %r139, %r114;
+	mul.f32 	%r148, %r140, %r115;
+	mul.f32 	%r149, %r141, %r116;
+	mul.f32 	%r150, %r142, %r117;
+	mul.f32 	%r151, %r143, %r118;
+	bar.sync 	0;
+	shl.b32 	%r152, %r45, 3;
+	bfe.s32 	%r153, %r43, 3, 1;
+	and.b32 	%r154, %r153, 1088;
+	bfe.s32 	%r155, %r43, 4, 1;
+	and.b32 	%r156, %r155, 2052;
+	shl.b32 	%r157, %r89, 2;
+	or.b32 	%r158, %r156, %r157;
+	or.b32 	%r159, %r154, %r152;
+	xor.b32 	%r160, %r159, %r91;
+	or.b32 	%r161, %r160, %r158;
+	add.s32 	%r162, %r75, %r161;
+	st.shared.b32 	[%r162], %r144;
+	xor.b32 	%r163, %r161, 16;
+	add.s32 	%r164, %r75, %r163;
+	st.shared.b32 	[%r164+256], %r145;
+	xor.b32 	%r165, %r161, 32;
+	add.s32 	%r166, %r75, %r165;
+	st.shared.b32 	[%r166+512], %r146;
+	xor.b32 	%r167, %r161, 48;
+	add.s32 	%r168, %r75, %r167;
+	st.shared.b32 	[%r168+768], %r147;
+	xor.b32 	%r169, %r161, 4;
+	add.s32 	%r170, %r75, %r169;
+	st.shared.b32 	[%r170], %r148;
+	xor.b32 	%r171, %r161, 20;
+	add.s32 	%r172, %r75, %r171;
+	st.shared.b32 	[%r172+256], %r149;
+	xor.b32 	%r173, %r161, 36;
+	add.s32 	%r174, %r75, %r173;
+	st.shared.b32 	[%r174+512], %r150;
+	xor.b32 	%r175, %r161, 52;
+	add.s32 	%r176, %r75, %r175;
+	st.shared.b32 	[%r176+768], %r151;
+	bar.sync 	0;
+	and.b32 	%r177, %r85, 832;
+	shl.b32 	%r178, %r69, 2;
+	shr.u32 	%r179, %r71, 1;
+	shl.b32 	%r180, %r43, 1;
+	and.b32 	%r181, %r180, 4;
+	or.b32 	%r182, %r177, %r178;
+	xor.b32 	%r183, %r182, %r179;
+	or.b32 	%r184, %r183, %r181;
+	add.s32 	%r185, %r75, %r184;
+	ld.shared.b32 	%r186, [%r185];
+	ld.shared.b32 	%r187, [%r185+128];
+	xor.b32 	%r188, %r184, 64;
+	add.s32 	%r189, %r75, %r188;
+	ld.shared.b32 	%r190, [%r189+1024];
+	ld.shared.b32 	%r191, [%r189+1152];
+	xor.b32 	%r192, %r184, 4;
+	add.s32 	%r193, %r75, %r192;
+	ld.shared.b32 	%r194, [%r193+2048];
+	ld.shared.b32 	%r195, [%r193+2176];
+	xor.b32 	%r196, %r184, 68;
+	add.s32 	%r197, %r75, %r196;
+	ld.shared.b32 	%r198, [%r197+3072];
+	ld.shared.b32 	%r199, [%r197+3200];
+	.loc	1 45 31                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:31
+	mul.wide.s32 	%rd23, %r54, 2;
+	add.s64 	%rd6, %rd18, %rd23;
+	.loc	1 45 71                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:71
+	// begin inline asm
+	mov.u64 %rd7, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd7, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r14, %r5;
+	mov.u32 %r15, %r5;
+	mov.u32 %r16, %r5;
+	mov.u32 %r17, %r5;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd6 + 0 ], %rd7;
+	// end inline asm
+	.loc	1 54 45                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:45
+	add.s32 	%r200, %r64, -3145728;
+	.loc	1 54 31                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:31
+	mad.wide.s32 	%rd8, %r200, 2, %rd19;
+	.loc	1 54 83                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:83
+	add.s32 	%r201, %r42, -8192;
+	setp.lt.u32 	%p10, %r201, 65536;
+	and.pred 	%p3, %p6, %p10;
+	and.pred 	%p4, %p7, %p10;
+	.loc	1 54 67                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:67
+	// begin inline asm
+	mov.u64 %rd9, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd9, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r18, %r5;
+	mov.u32 %r19, %r5;
+	mov.u32 %r20, %r5;
+	mov.u32 %r21, %r5;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd8 + 0 ], %rd9;
+	// end inline asm
+	prmt.b32 	%r202, %r18, %r20, 0x7632U;
+	prmt.b32 	%r203, %r19, %r21, 0x7632U;
+	.loc	1 54 134                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134
+	bar.sync 	0;
+	prmt.b32 	%r204, %r18, %r20, 0x5410U;
+	st.shared.b32 	[%r76], %r204;
+	st.shared.b32 	[%r79+512], %r202;
+	prmt.b32 	%r205, %r19, %r21, 0x5410U;
+	st.shared.b32 	[%r81+1024], %r205;
+	st.shared.b32 	[%r84+1536], %r203;
+	bar.sync 	0;
+	ld.shared.b16 	%rs9, [%r96];
+	ld.shared.b16 	%rs10, [%r98+128];
+	ld.shared.b16 	%rs11, [%r100+256];
+	ld.shared.b16 	%rs12, [%r102+384];
+	ld.shared.b16 	%rs13, [%r104];
+	ld.shared.b16 	%rs14, [%r106+128];
+	ld.shared.b16 	%rs15, [%r108+256];
+	ld.shared.b16 	%rs16, [%r110+384];
+	.loc	1 56 52                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:52
+	add.s32 	%r206, %r48, -8192;
+	.loc	1 56 31                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:31
+	mad.wide.s32 	%rd10, %r206, 4, %rd20;
+	.loc	1 56 90                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:90
+	// begin inline asm
+	mov.u64 %rd11, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd11, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r22, %r5;
+	mov.u32 %r23, %r5;
+	mov.u32 %r24, %r5;
+	mov.u32 %r25, %r5;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r22, %r23, %r24, %r25 }, [ %rd10 + 0 ], %rd11;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd12, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd12, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r26, %r5;
+	mov.u32 %r27, %r5;
+	mov.u32 %r28, %r5;
+	mov.u32 %r29, %r5;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd10 + 0 ], %rd12;
+	// end inline asm
+	.loc	1 58 21                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:58:21
+	div.full.f32 	%r207, %r22, %r119;
+	div.full.f32 	%r208, %r23, %r119;
+	div.full.f32 	%r209, %r24, %r119;
+	div.full.f32 	%r210, %r25, %r119;
+	div.full.f32 	%r211, %r26, %r119;
+	div.full.f32 	%r212, %r27, %r119;
+	div.full.f32 	%r213, %r28, %r119;
+	div.full.f32 	%r214, %r29, %r119;
+	.loc	1 60 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:60:20
+	add.f32 	%r215, %r207, 0f358637BD;
+	add.f32 	%r216, %r208, 0f358637BD;
+	add.f32 	%r217, %r209, 0f358637BD;
+	add.f32 	%r218, %r210, 0f358637BD;
+	add.f32 	%r219, %r211, 0f358637BD;
+	add.f32 	%r220, %r212, 0f358637BD;
+	add.f32 	%r221, %r213, 0f358637BD;
+	add.f32 	%r222, %r214, 0f358637BD;
+	.loc	1 61 28                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:61:28
+	rsqrt.approx.ftz.f32 	%r223, %r215;
+	rsqrt.approx.ftz.f32 	%r224, %r216;
+	rsqrt.approx.ftz.f32 	%r225, %r217;
+	rsqrt.approx.ftz.f32 	%r226, %r218;
+	rsqrt.approx.ftz.f32 	%r227, %r219;
+	rsqrt.approx.ftz.f32 	%r228, %r220;
+	rsqrt.approx.ftz.f32 	%r229, %r221;
+	rsqrt.approx.ftz.f32 	%r230, %r222;
+	.loc	1 54 134                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134
+	cvt.f32.bf16 	%r231, %rs16;
+	cvt.f32.bf16 	%r232, %rs15;
+	cvt.f32.bf16 	%r233, %rs14;
+	cvt.f32.bf16 	%r234, %rs13;
+	cvt.f32.bf16 	%r235, %rs12;
+	cvt.f32.bf16 	%r236, %rs11;
+	cvt.f32.bf16 	%r237, %rs10;
+	cvt.f32.bf16 	%r238, %rs9;
+	.loc	1 23 21                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:23:21
+	setp.lt.s32 	%p11, %r47, 73728;
+	.loc	1 62 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:62:20
+	mul.f32 	%r239, %r223, %r238;
+	mul.f32 	%r240, %r224, %r237;
+	mul.f32 	%r241, %r225, %r236;
+	mul.f32 	%r242, %r226, %r235;
+	mul.f32 	%r243, %r227, %r234;
+	mul.f32 	%r244, %r228, %r233;
+	mul.f32 	%r245, %r229, %r232;
+	mul.f32 	%r246, %r230, %r231;
+	bar.sync 	0;
+	st.shared.b32 	[%r162], %r239;
+	st.shared.b32 	[%r164+256], %r240;
+	st.shared.b32 	[%r166+512], %r241;
+	st.shared.b32 	[%r168+768], %r242;
+	st.shared.b32 	[%r170], %r243;
+	st.shared.b32 	[%r172+256], %r244;
+	st.shared.b32 	[%r174+512], %r245;
+	st.shared.b32 	[%r176+768], %r246;
+	bar.sync 	0;
+	ld.shared.b32 	%r247, [%r185];
+	ld.shared.b32 	%r248, [%r185+128];
+	ld.shared.b32 	%r249, [%r189+1024];
+	ld.shared.b32 	%r250, [%r189+1152];
+	ld.shared.b32 	%r251, [%r193+2048];
+	ld.shared.b32 	%r252, [%r193+2176];
+	ld.shared.b32 	%r253, [%r197+3072];
+	ld.shared.b32 	%r254, [%r197+3200];
+	.loc	1 63 31                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:31
+	add.s64 	%rd13, %rd21, %rd23;
+	.loc	1 63 71                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:71
+	// begin inline asm
+	mov.u64 %rd14, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd14, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r30, %r5;
+	mov.u32 %r31, %r5;
+	mov.u32 %r32, %r5;
+	mov.u32 %r33, %r5;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r30, %r31, %r32, %r33 }, [ %rd13 + 0 ], %rd14;
+	// end inline asm
+	.loc	1 70 34                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:34
+	shl.b32 	%r255, %r47, 7;
+	.loc	1 70 30                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:30
+	add.s32 	%r256, %r255, %r54;
+	.loc	1 70 25                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:25
+	mad.wide.s32 	%rd15, %r256, 2, %rd22;
+	.loc	1 70 54                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:54
+	and.pred 	%p5, %p6, %p11;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs17, %rs18}, %r14;
+	cvt.f32.bf16 	%r257, %rs17;
+	cvt.f32.bf16 	%r258, %rs18;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r259, %r190, %r258;
+	mul.f32 	%r260, %r186, %r257;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs19, %rs20}, %r30;
+	cvt.f32.bf16 	%r261, %rs19;
+	cvt.f32.bf16 	%r262, %rs20;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r263, %r249, %r262;
+	mul.f32 	%r264, %r247, %r261;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r265, %r260, %r264, %p8;
+	selp.f32 	%r266, %r259, %r263, %p8;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r34, %r266, %r265;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs21, %rs22}, %r15;
+	cvt.f32.bf16 	%r267, %rs21;
+	cvt.f32.bf16 	%r268, %rs22;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r269, %r198, %r268;
+	mul.f32 	%r270, %r194, %r267;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs23, %rs24}, %r31;
+	cvt.f32.bf16 	%r271, %rs23;
+	cvt.f32.bf16 	%r272, %rs24;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r273, %r253, %r272;
+	mul.f32 	%r274, %r251, %r271;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r275, %r270, %r274, %p8;
+	selp.f32 	%r276, %r269, %r273, %p8;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r35, %r276, %r275;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs25, %rs26}, %r16;
+	cvt.f32.bf16 	%r277, %rs25;
+	cvt.f32.bf16 	%r278, %rs26;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r279, %r191, %r278;
+	mul.f32 	%r280, %r187, %r277;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs27, %rs28}, %r32;
+	cvt.f32.bf16 	%r281, %rs27;
+	cvt.f32.bf16 	%r282, %rs28;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r283, %r250, %r282;
+	mul.f32 	%r284, %r248, %r281;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r285, %r280, %r284, %p8;
+	selp.f32 	%r286, %r279, %r283, %p8;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r36, %r286, %r285;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs29, %rs30}, %r17;
+	cvt.f32.bf16 	%r287, %rs29;
+	cvt.f32.bf16 	%r288, %rs30;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r289, %r199, %r288;
+	mul.f32 	%r290, %r195, %r287;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs31, %rs32}, %r33;
+	cvt.f32.bf16 	%r291, %rs31;
+	cvt.f32.bf16 	%r292, %rs32;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r293, %r254, %r292;
+	mul.f32 	%r294, %r252, %r291;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r295, %r290, %r294, %p8;
+	selp.f32 	%r296, %r289, %r293, %p8;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r37, %r296, %r295;
+	// begin inline asm
+	@%p5 st.global.v4.b32 [ %rd15 + 0 ], { %r34, %r35, %r36, %r37 };
+	// end inline asm
+	.loc	1 70 4                          // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 50
+.b8 104
+.b8 105
+.b8 106
+.b8 51
+.b8 104
+.b8 109
+.b8 108
+.b8 111
+.b8 117
+.b8 109
+.b8 120
+.b8 100
+.b8 109
+.b8 104
+.b8 117
+.b8 101
+.b8 122
+.b8 115
+.b8 121
+.b8 104
+.b8 107
+.b8 109
+.b8 110
+.b8 113
+.b8 103
+.b8 110
+.b8 102
+.b8 97
+.b8 53
+.b8 105
+.b8 118
+.b8 114
+.b8 101
+.b8 50
+.b8 55
+.b8 117
+.b8 111
+.b8 115
+.b8 121
+.b8 109
+.b8 97
+.b8 109
+.b8 51
+.b8 100
+.b8 114
+.b8 55
+.b8 97
+.b8 53
+.b8 120
+.b8 98
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 50
+.b8 104
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.source b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.source
new file mode 100644
index 0000000000000000000000000000000000000000..bedde25ba37a672088b4ad4a355020770a713c28
--- /dev/null
+++ b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.source
@@ -0,0 +1,415 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0)
+#loc99 = loc("in_ptr0"(#loc))
+#loc100 = loc("in_ptr1"(#loc))
+#loc101 = loc("in_ptr2"(#loc))
+#loc102 = loc("in_ptr3"(#loc))
+#loc103 = loc("in_ptr4"(#loc))
+#loc104 = loc("in_ptr5"(#loc))
+#loc105 = loc("out_ptr0"(#loc))
+#loc106 = loc("ynumel"(#loc))
+#loc107 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %ynumel_0 = arith.constant 73728 : i32 loc(#loc108)
+    %xnumel_1 = arith.constant 128 : i32 loc(#loc109)
+    %yoffset = tt.get_program_id y : i32 loc(#loc110)
+    %yoffset_2 = tt.get_program_id z : i32 loc(#loc111)
+    %yoffset_3 = tt.get_num_programs y : i32 loc(#loc112)
+    %yoffset_4 = arith.muli %yoffset_2, %yoffset_3 : i32 loc(#loc113)
+    %yoffset_5 = arith.addi %yoffset, %yoffset_4 : i32 loc(#loc114)
+    %yoffset_6 = arith.constant 32 : i32 loc(#loc115)
+    %yoffset_7 = arith.constant 32 : i32 loc(#loc115)
+    %yoffset_8 = arith.muli %yoffset_5, %yoffset_7 : i32 loc(#loc115)
+    %yindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc116)
+    %yindex_9 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc117)
+    %yindex_10 = tt.splat %yoffset_8 : i32 -> tensor<32x1xi32> loc(#loc118)
+    %yindex_11 = arith.addi %yindex_10, %yindex_9 : tensor<32x1xi32> loc(#loc118)
+    %ymask = arith.constant dense<73728> : tensor<32x1xi32> loc(#loc119)
+    %ymask_12 = arith.cmpi slt, %yindex_11, %ymask : tensor<32x1xi32> loc(#loc119)
+    %xoffset = tt.get_program_id x : i32 loc(#loc120)
+    %xoffset_13 = arith.constant 32 : i32 loc(#loc121)
+    %xoffset_14 = arith.constant 32 : i32 loc(#loc121)
+    %xoffset_15 = arith.muli %xoffset, %xoffset_14 : i32 loc(#loc121)
+    %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc122)
+    %xindex_16 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc123)
+    %xindex_17 = tt.splat %xoffset_15 : i32 -> tensor<1x32xi32> loc(#loc124)
+    %xindex_18 = arith.addi %xindex_17, %xindex_16 : tensor<1x32xi32> loc(#loc124)
+    %xmask = arith.constant dense<128> : tensor<1x32xi32> loc(#loc125)
+    %xmask_19 = arith.cmpi slt, %xindex_18, %xmask : tensor<1x32xi32> loc(#loc125)
+    %y1 = arith.constant 32 : i32 loc(#loc126)
+    %y1_20 = arith.constant 32 : i32 loc(#loc126)
+    %y1_21 = arith.constant dense<32> : tensor<32x1xi32> loc(#loc126)
+    %y1_22 = arith.divsi %yindex_11, %y1_21 : tensor<32x1xi32> loc(#loc126)
+    %y0 = arith.constant 32 : i32 loc(#loc127)
+    %y0_23 = arith.constant 32 : i32 loc(#loc127)
+    %y0_24 = arith.constant dense<32> : tensor<32x1xi32> loc(#loc127)
+    %y0_25 = arith.remsi %yindex_11, %y0_24 : tensor<32x1xi32> loc(#loc127)
+    %tmp1 = arith.constant 0 : i64 loc(#loc128)
+    %tmp1_26 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc128)
+    %tmp2 = arith.extsi %y1_22 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc129)
+    %tmp2_27 = arith.constant dense<0> : tensor<32x1xi64> loc(#loc129)
+    %tmp2_28 = arith.cmpi sge, %tmp2, %tmp2_27 : tensor<32x1xi64> loc(#loc129)
+    %tmp3 = arith.constant 256 : i64 loc(#loc130)
+    %tmp3_29 = arith.constant dense<256> : tensor<1x1xi64> loc(#loc130)
+    %tmp4 = arith.extsi %y1_22 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc131)
+    %tmp4_30 = arith.constant dense<256> : tensor<32x1xi64> loc(#loc131)
+    %tmp4_31 = arith.cmpi slt, %tmp4, %tmp4_30 : tensor<32x1xi64> loc(#loc131)
+    %tmp5 = arith.constant 128 : i32 loc(#loc132)
+    %tmp5_32 = arith.constant 128 : i32 loc(#loc132)
+    %tmp5_33 = arith.constant dense<128> : tensor<32x1xi32> loc(#loc132)
+    %tmp5_34 = arith.muli %tmp5_33, %y0_25 : tensor<32x1xi32> loc(#loc132)
+    %tmp5_35 = tt.broadcast %xindex_18 : tensor<1x32xi32> -> tensor<32x32xi32> loc(#loc133)
+    %tmp5_36 = tt.broadcast %tmp5_34 : tensor<32x1xi32> -> tensor<32x32xi32> loc(#loc133)
+    %tmp5_37 = arith.addi %tmp5_35, %tmp5_36 : tensor<32x32xi32> loc(#loc133)
+    %tmp5_38 = arith.constant 12288 : i32 loc(#loc134)
+    %tmp5_39 = arith.constant 12288 : i32 loc(#loc134)
+    %tmp5_40 = arith.constant dense<12288> : tensor<32x1xi32> loc(#loc134)
+    %tmp5_41 = arith.muli %tmp5_40, %y1_22 : tensor<32x1xi32> loc(#loc134)
+    %tmp5_42 = tt.broadcast %tmp5_41 : tensor<32x1xi32> -> tensor<32x32xi32> loc(#loc135)
+    %tmp5_43 = arith.addi %tmp5_37, %tmp5_42 : tensor<32x32xi32> loc(#loc135)
+    %tmp5_44 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<32x32x!tt.ptr<bf16>> loc(#loc136)
+    %tmp5_45 = tt.addptr %tmp5_44, %tmp5_43 : tensor<32x32x!tt.ptr<bf16>>, tensor<32x32xi32> loc(#loc136)
+    %tmp5_46 = tt.broadcast %tmp4_31 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc137)
+    %tmp5_47 = tt.broadcast %xmask_19 : tensor<1x32xi1> -> tensor<32x32xi1> loc(#loc137)
+    %tmp5_48 = arith.andi %tmp5_46, %tmp5_47 : tensor<32x32xi1> loc(#loc137)
+    %tmp5_49 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc138)
+    %tmp5_50 = arith.andi %tmp5_48, %tmp5_49 : tensor<32x32xi1> loc(#loc138)
+    %tmp5_51 = arith.constant 0.000000e+00 : f32 loc(#loc139)
+    %tmp5_52 = arith.constant dense<0.000000e+00> : tensor<32x32xf32> loc(#loc139)
+    %tmp5_53 = arith.truncf %tmp5_52 : tensor<32x32xf32> to tensor<32x32xbf16> loc(#loc139)
+    %tmp5_54 = tt.load %tmp5_45, %tmp5_50, %tmp5_53 evictionPolicy = evict_last : tensor<32x32x!tt.ptr<bf16>> loc(#loc139)
+    %tmp5_55 = arith.extf %tmp5_54 : tensor<32x32xbf16> to tensor<32x32xf32> loc(#loc140)
+    %tmp7 = arith.constant 32 : i32 loc(#loc141)
+    %tmp7_56 = arith.constant 32 : i32 loc(#loc141)
+    %tmp7_57 = arith.constant dense<32> : tensor<32x1xi32> loc(#loc141)
+    %tmp7_58 = arith.muli %tmp7_57, %y1_22 : tensor<32x1xi32> loc(#loc141)
+    %tmp7_59 = arith.addi %y0_25, %tmp7_58 : tensor<32x1xi32> loc(#loc142)
+    %tmp7_60 = tt.broadcast %tmp7_59 : tensor<32x1xi32> -> tensor<32x32xi32> loc(#loc143)
+    %tmp7_61 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<32x32x!tt.ptr<f32>> loc(#loc144)
+    %tmp7_62 = tt.addptr %tmp7_61, %tmp7_60 : tensor<32x32x!tt.ptr<f32>>, tensor<32x32xi32> loc(#loc144)
+    %tmp7_63 = tt.broadcast %tmp4_31 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc145)
+    %tmp7_64 = tt.broadcast %xmask_19 : tensor<1x32xi1> -> tensor<32x32xi1> loc(#loc145)
+    %tmp7_65 = arith.andi %tmp7_63, %tmp7_64 : tensor<32x32xi1> loc(#loc145)
+    %tmp7_66 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc146)
+    %tmp7_67 = arith.andi %tmp7_65, %tmp7_66 : tensor<32x32xi1> loc(#loc146)
+    %tmp7_68 = arith.constant 0.000000e+00 : f32 loc(#loc147)
+    %tmp7_69 = arith.constant dense<0.000000e+00> : tensor<32x32xf32> loc(#loc147)
+    %tmp7_70 = tt.load %tmp7_62, %tmp7_67, %tmp7_69 evictionPolicy = evict_last : tensor<32x32x!tt.ptr<f32>> loc(#loc147)
+    %tmp8 = arith.constant 1.280000e+02 : f32 loc(#loc148)
+    %tmp9 = arith.constant dense<1.280000e+02> : tensor<32x32xf32> loc(#loc149)
+    %tmp9_71 = arith.divf %tmp7_70, %tmp9 : tensor<32x32xf32> loc(#loc149)
+    %tmp10 = arith.constant 9.99999997E-7 : f32 loc(#loc150)
+    %tmp11 = arith.constant dense<9.99999997E-7> : tensor<32x32xf32> loc(#loc151)
+    %tmp11_72 = arith.addf %tmp9_71, %tmp11 : tensor<32x32xf32> loc(#loc151)
+    %tmp12 = tt.extern_elementwise %tmp11_72 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<32x32xf32>) -> tensor<32x32xf32> loc(#loc152)
+    %tmp13 = arith.mulf %tmp5_55, %tmp12 : tensor<32x32xf32> loc(#loc153)
+    %tmp14 = tt.broadcast %xindex_18 : tensor<1x32xi32> -> tensor<32x32xi32> loc(#loc154)
+    %tmp14_73 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<32x32x!tt.ptr<bf16>> loc(#loc155)
+    %tmp14_74 = tt.addptr %tmp14_73, %tmp14 : tensor<32x32x!tt.ptr<bf16>>, tensor<32x32xi32> loc(#loc155)
+    %tmp14_75 = tt.broadcast %tmp4_31 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc156)
+    %tmp14_76 = tt.broadcast %xmask_19 : tensor<1x32xi1> -> tensor<32x32xi1> loc(#loc156)
+    %tmp14_77 = arith.andi %tmp14_75, %tmp14_76 : tensor<32x32xi1> loc(#loc156)
+    %tmp14_78 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc157)
+    %tmp14_79 = arith.andi %tmp14_77, %tmp14_78 : tensor<32x32xi1> loc(#loc157)
+    %tmp14_80 = arith.constant 0.000000e+00 : f32 loc(#loc158)
+    %tmp14_81 = arith.constant dense<0.000000e+00> : tensor<32x32xf32> loc(#loc158)
+    %tmp14_82 = arith.truncf %tmp14_81 : tensor<32x32xf32> to tensor<32x32xbf16> loc(#loc158)
+    %tmp14_83 = tt.load %tmp14_74, %tmp14_79, %tmp14_82 evictionPolicy = evict_last : tensor<32x32x!tt.ptr<bf16>> loc(#loc158)
+    %tmp14_84 = arith.extf %tmp14_83 : tensor<32x32xbf16> to tensor<32x32xf32> loc(#loc159)
+    %tmp16 = arith.mulf %tmp13, %tmp14_84 : tensor<32x32xf32> loc(#loc160)
+    %tmp18 = arith.constant 0.000000e+00 : f32 loc(#loc161)
+    %tmp18_85 = arith.constant dense<0.000000e+00> : tensor<32x32xf32> loc(#loc161)
+    %tmp19 = tt.broadcast %tmp4_31 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc162)
+    %tmp19_86 = arith.select %tmp19, %tmp16, %tmp18_85 : tensor<32x32xi1>, tensor<32x32xf32> loc(#loc162)
+    %tmp20 = arith.extsi %y1_22 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc163)
+    %tmp20_87 = arith.constant dense<256> : tensor<32x1xi64> loc(#loc163)
+    %tmp20_88 = arith.cmpi sge, %tmp20, %tmp20_87 : tensor<32x1xi64> loc(#loc163)
+    %tmp21 = arith.constant 2304 : i64 loc(#loc164)
+    %tmp21_89 = arith.constant dense<2304> : tensor<1x1xi64> loc(#loc164)
+    %tmp22 = arith.extsi %y1_22 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc165)
+    %tmp22_90 = arith.constant dense<2304> : tensor<32x1xi64> loc(#loc165)
+    %tmp22_91 = arith.cmpi slt, %tmp22, %tmp22_90 : tensor<32x1xi64> loc(#loc165)
+    %tmp23 = arith.constant 128 : i32 loc(#loc166)
+    %tmp23_92 = arith.constant 128 : i32 loc(#loc166)
+    %tmp23_93 = arith.constant dense<128> : tensor<32x1xi32> loc(#loc166)
+    %tmp23_94 = arith.muli %tmp23_93, %y0_25 : tensor<32x1xi32> loc(#loc166)
+    %tmp23_95 = tt.broadcast %xindex_18 : tensor<1x32xi32> -> tensor<32x32xi32> loc(#loc167)
+    %tmp23_96 = tt.broadcast %tmp23_94 : tensor<32x1xi32> -> tensor<32x32xi32> loc(#loc167)
+    %tmp23_97 = arith.addi %tmp23_95, %tmp23_96 : tensor<32x32xi32> loc(#loc167)
+    %tmp23_98 = arith.constant -256 : i32 loc(#loc168)
+    %tmp23_99 = arith.constant -256 : i32 loc(#loc168)
+    %tmp23_100 = arith.constant dense<-256> : tensor<32x1xi32> loc(#loc168)
+    %tmp23_101 = arith.addi %tmp23_100, %y1_22 : tensor<32x1xi32> loc(#loc168)
+    %tmp23_102 = arith.constant 12288 : i32 loc(#loc169)
+    %tmp23_103 = arith.constant 12288 : i32 loc(#loc169)
+    %tmp23_104 = arith.constant dense<12288> : tensor<32x1xi32> loc(#loc169)
+    %tmp23_105 = arith.muli %tmp23_104, %tmp23_101 : tensor<32x1xi32> loc(#loc169)
+    %tmp23_106 = tt.broadcast %tmp23_105 : tensor<32x1xi32> -> tensor<32x32xi32> loc(#loc170)
+    %tmp23_107 = arith.addi %tmp23_97, %tmp23_106 : tensor<32x32xi32> loc(#loc170)
+    %tmp23_108 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<32x32x!tt.ptr<bf16>> loc(#loc171)
+    %tmp23_109 = tt.addptr %tmp23_108, %tmp23_107 : tensor<32x32x!tt.ptr<bf16>>, tensor<32x32xi32> loc(#loc171)
+    %tmp23_110 = tt.broadcast %tmp20_88 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc172)
+    %tmp23_111 = tt.broadcast %xmask_19 : tensor<1x32xi1> -> tensor<32x32xi1> loc(#loc172)
+    %tmp23_112 = arith.andi %tmp23_110, %tmp23_111 : tensor<32x32xi1> loc(#loc172)
+    %tmp23_113 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc173)
+    %tmp23_114 = arith.andi %tmp23_112, %tmp23_113 : tensor<32x32xi1> loc(#loc173)
+    %tmp23_115 = arith.constant 0.000000e+00 : f32 loc(#loc174)
+    %tmp23_116 = arith.constant dense<0.000000e+00> : tensor<32x32xf32> loc(#loc174)
+    %tmp23_117 = arith.truncf %tmp23_116 : tensor<32x32xf32> to tensor<32x32xbf16> loc(#loc174)
+    %tmp23_118 = tt.load %tmp23_109, %tmp23_114, %tmp23_117 evictionPolicy = evict_last : tensor<32x32x!tt.ptr<bf16>> loc(#loc174)
+    %tmp23_119 = arith.extf %tmp23_118 : tensor<32x32xbf16> to tensor<32x32xf32> loc(#loc175)
+    %tmp25 = arith.constant -256 : i32 loc(#loc176)
+    %tmp25_120 = arith.constant -256 : i32 loc(#loc176)
+    %tmp25_121 = arith.constant dense<-256> : tensor<32x1xi32> loc(#loc176)
+    %tmp25_122 = arith.addi %tmp25_121, %y1_22 : tensor<32x1xi32> loc(#loc176)
+    %tmp25_123 = arith.constant 32 : i32 loc(#loc177)
+    %tmp25_124 = arith.constant 32 : i32 loc(#loc177)
+    %tmp25_125 = arith.constant dense<32> : tensor<32x1xi32> loc(#loc177)
+    %tmp25_126 = arith.muli %tmp25_125, %tmp25_122 : tensor<32x1xi32> loc(#loc177)
+    %tmp25_127 = arith.addi %y0_25, %tmp25_126 : tensor<32x1xi32> loc(#loc178)
+    %tmp25_128 = tt.broadcast %tmp25_127 : tensor<32x1xi32> -> tensor<32x32xi32> loc(#loc179)
+    %tmp25_129 = tt.splat %in_ptr4 : !tt.ptr<f32> -> tensor<32x32x!tt.ptr<f32>> loc(#loc180)
+    %tmp25_130 = tt.addptr %tmp25_129, %tmp25_128 : tensor<32x32x!tt.ptr<f32>>, tensor<32x32xi32> loc(#loc180)
+    %tmp25_131 = tt.broadcast %tmp20_88 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc181)
+    %tmp25_132 = tt.broadcast %xmask_19 : tensor<1x32xi1> -> tensor<32x32xi1> loc(#loc181)
+    %tmp25_133 = arith.andi %tmp25_131, %tmp25_132 : tensor<32x32xi1> loc(#loc181)
+    %tmp25_134 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc182)
+    %tmp25_135 = arith.andi %tmp25_133, %tmp25_134 : tensor<32x32xi1> loc(#loc182)
+    %tmp25_136 = arith.constant 0.000000e+00 : f32 loc(#loc183)
+    %tmp25_137 = arith.constant dense<0.000000e+00> : tensor<32x32xf32> loc(#loc183)
+    %tmp25_138 = tt.load %tmp25_130, %tmp25_135, %tmp25_137 evictionPolicy = evict_last : tensor<32x32x!tt.ptr<f32>> loc(#loc183)
+    %tmp26 = arith.constant 1.280000e+02 : f32 loc(#loc184)
+    %tmp27 = arith.constant dense<1.280000e+02> : tensor<32x32xf32> loc(#loc185)
+    %tmp27_139 = arith.divf %tmp25_138, %tmp27 : tensor<32x32xf32> loc(#loc185)
+    %tmp28 = arith.constant 9.99999997E-7 : f32 loc(#loc186)
+    %tmp29 = arith.constant dense<9.99999997E-7> : tensor<32x32xf32> loc(#loc187)
+    %tmp29_140 = arith.addf %tmp27_139, %tmp29 : tensor<32x32xf32> loc(#loc187)
+    %tmp30 = tt.extern_elementwise %tmp29_140 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<32x32xf32>) -> tensor<32x32xf32> loc(#loc188)
+    %tmp31 = arith.mulf %tmp23_119, %tmp30 : tensor<32x32xf32> loc(#loc189)
+    %tmp32 = tt.broadcast %xindex_18 : tensor<1x32xi32> -> tensor<32x32xi32> loc(#loc190)
+    %tmp32_141 = tt.splat %in_ptr5 : !tt.ptr<bf16> -> tensor<32x32x!tt.ptr<bf16>> loc(#loc191)
+    %tmp32_142 = tt.addptr %tmp32_141, %tmp32 : tensor<32x32x!tt.ptr<bf16>>, tensor<32x32xi32> loc(#loc191)
+    %tmp32_143 = tt.broadcast %tmp20_88 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc192)
+    %tmp32_144 = tt.broadcast %xmask_19 : tensor<1x32xi1> -> tensor<32x32xi1> loc(#loc192)
+    %tmp32_145 = arith.andi %tmp32_143, %tmp32_144 : tensor<32x32xi1> loc(#loc192)
+    %tmp32_146 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc193)
+    %tmp32_147 = arith.andi %tmp32_145, %tmp32_146 : tensor<32x32xi1> loc(#loc193)
+    %tmp32_148 = arith.constant 0.000000e+00 : f32 loc(#loc194)
+    %tmp32_149 = arith.constant dense<0.000000e+00> : tensor<32x32xf32> loc(#loc194)
+    %tmp32_150 = arith.truncf %tmp32_149 : tensor<32x32xf32> to tensor<32x32xbf16> loc(#loc194)
+    %tmp32_151 = tt.load %tmp32_142, %tmp32_147, %tmp32_150 evictionPolicy = evict_last : tensor<32x32x!tt.ptr<bf16>> loc(#loc194)
+    %tmp32_152 = arith.extf %tmp32_151 : tensor<32x32xbf16> to tensor<32x32xf32> loc(#loc195)
+    %tmp34 = arith.mulf %tmp31, %tmp32_152 : tensor<32x32xf32> loc(#loc196)
+    %tmp36 = arith.constant 0.000000e+00 : f32 loc(#loc197)
+    %tmp36_153 = arith.constant dense<0.000000e+00> : tensor<32x32xf32> loc(#loc197)
+    %tmp37 = tt.broadcast %tmp20_88 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc198)
+    %tmp37_154 = arith.select %tmp37, %tmp34, %tmp36_153 : tensor<32x32xi1>, tensor<32x32xf32> loc(#loc198)
+    %tmp38 = tt.broadcast %tmp4_31 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc199)
+    %tmp38_155 = arith.select %tmp38, %tmp19_86, %tmp37_154 : tensor<32x32xi1>, tensor<32x32xf32> loc(#loc199)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc93)
+    %c128_i32_156 = arith.constant 128 : i32 loc(#loc93)
+    %cst = arith.constant dense<128> : tensor<32x1xi32> loc(#loc93)
+    %0 = arith.muli %cst, %yindex_11 : tensor<32x1xi32> loc(#loc93)
+    %1 = tt.broadcast %xindex_18 : tensor<1x32xi32> -> tensor<32x32xi32> loc(#loc94)
+    %2 = tt.broadcast %0 : tensor<32x1xi32> -> tensor<32x32xi32> loc(#loc94)
+    %3 = arith.addi %1, %2 : tensor<32x32xi32> loc(#loc94)
+    %4 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<32x32x!tt.ptr<bf16>> loc(#loc95)
+    %5 = tt.addptr %4, %3 : tensor<32x32x!tt.ptr<bf16>>, tensor<32x32xi32> loc(#loc95)
+    %6 = tt.broadcast %xmask_19 : tensor<1x32xi1> -> tensor<32x32xi1> loc(#loc96)
+    %7 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc96)
+    %8 = arith.andi %6, %7 : tensor<32x32xi1> loc(#loc96)
+    %9 = arith.truncf %tmp38_155 : tensor<32x32xf32> to tensor<32x32xbf16> loc(#loc97)
+    tt.store %5, %9, %8 : tensor<32x32x!tt.ptr<bf16>> loc(#loc97)
+    tt.return loc(#loc98)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":20:13)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:36)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:36)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":32:30)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":33:19)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":34:32)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:60)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:87)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:95)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":39:11)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":41:12)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:51)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:78)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:86)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":49:38)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":52:34)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":53:19)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:40)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:36)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:65)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:70)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:98)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:106)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":57:12)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":59:12)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:51)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:79)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:87)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":67:38)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4)
+#loc108 = loc("ynumel"(#loc1))
+#loc109 = loc("xnumel"(#loc2))
+#loc110 = loc("yoffset"(#loc3))
+#loc111 = loc("yoffset"(#loc4))
+#loc112 = loc("yoffset"(#loc5))
+#loc113 = loc("yoffset"(#loc6))
+#loc114 = loc("yoffset"(#loc7))
+#loc115 = loc("yoffset"(#loc8))
+#loc116 = loc("yindex"(#loc9))
+#loc117 = loc("yindex"(#loc10))
+#loc118 = loc("yindex"(#loc11))
+#loc119 = loc("ymask"(#loc12))
+#loc120 = loc("xoffset"(#loc13))
+#loc121 = loc("xoffset"(#loc14))
+#loc122 = loc("xindex"(#loc15))
+#loc123 = loc("xindex"(#loc16))
+#loc124 = loc("xindex"(#loc17))
+#loc125 = loc("xmask"(#loc18))
+#loc126 = loc("y1"(#loc19))
+#loc127 = loc("y0"(#loc20))
+#loc128 = loc("tmp1"(#loc21))
+#loc129 = loc("tmp2"(#loc22))
+#loc130 = loc("tmp3"(#loc23))
+#loc131 = loc("tmp4"(#loc24))
+#loc132 = loc("tmp5"(#loc25))
+#loc133 = loc("tmp5"(#loc26))
+#loc134 = loc("tmp5"(#loc27))
+#loc135 = loc("tmp5"(#loc28))
+#loc136 = loc("tmp5"(#loc29))
+#loc137 = loc("tmp5"(#loc30))
+#loc138 = loc("tmp5"(#loc31))
+#loc139 = loc("tmp5"(#loc32))
+#loc140 = loc("tmp5"(#loc33))
+#loc141 = loc("tmp7"(#loc34))
+#loc142 = loc("tmp7"(#loc35))
+#loc143 = loc("tmp7"(#loc36))
+#loc144 = loc("tmp7"(#loc37))
+#loc145 = loc("tmp7"(#loc38))
+#loc146 = loc("tmp7"(#loc39))
+#loc147 = loc("tmp7"(#loc40))
+#loc148 = loc("tmp8"(#loc41))
+#loc149 = loc("tmp9"(#loc42))
+#loc150 = loc("tmp10"(#loc43))
+#loc151 = loc("tmp11"(#loc44))
+#loc152 = loc("tmp12"(#loc45))
+#loc153 = loc("tmp13"(#loc46))
+#loc154 = loc("tmp14"(#loc47))
+#loc155 = loc("tmp14"(#loc48))
+#loc156 = loc("tmp14"(#loc49))
+#loc157 = loc("tmp14"(#loc50))
+#loc158 = loc("tmp14"(#loc51))
+#loc159 = loc("tmp14"(#loc52))
+#loc160 = loc("tmp16"(#loc53))
+#loc161 = loc("tmp18"(#loc54))
+#loc162 = loc("tmp19"(#loc55))
+#loc163 = loc("tmp20"(#loc56))
+#loc164 = loc("tmp21"(#loc57))
+#loc165 = loc("tmp22"(#loc58))
+#loc166 = loc("tmp23"(#loc59))
+#loc167 = loc("tmp23"(#loc60))
+#loc168 = loc("tmp23"(#loc61))
+#loc169 = loc("tmp23"(#loc62))
+#loc170 = loc("tmp23"(#loc63))
+#loc171 = loc("tmp23"(#loc64))
+#loc172 = loc("tmp23"(#loc65))
+#loc173 = loc("tmp23"(#loc66))
+#loc174 = loc("tmp23"(#loc67))
+#loc175 = loc("tmp23"(#loc68))
+#loc176 = loc("tmp25"(#loc69))
+#loc177 = loc("tmp25"(#loc70))
+#loc178 = loc("tmp25"(#loc71))
+#loc179 = loc("tmp25"(#loc72))
+#loc180 = loc("tmp25"(#loc73))
+#loc181 = loc("tmp25"(#loc74))
+#loc182 = loc("tmp25"(#loc75))
+#loc183 = loc("tmp25"(#loc76))
+#loc184 = loc("tmp26"(#loc77))
+#loc185 = loc("tmp27"(#loc78))
+#loc186 = loc("tmp28"(#loc79))
+#loc187 = loc("tmp29"(#loc80))
+#loc188 = loc("tmp30"(#loc81))
+#loc189 = loc("tmp31"(#loc82))
+#loc190 = loc("tmp32"(#loc83))
+#loc191 = loc("tmp32"(#loc84))
+#loc192 = loc("tmp32"(#loc85))
+#loc193 = loc("tmp32"(#loc86))
+#loc194 = loc("tmp32"(#loc87))
+#loc195 = loc("tmp32"(#loc88))
+#loc196 = loc("tmp34"(#loc89))
+#loc197 = loc("tmp36"(#loc90))
+#loc198 = loc("tmp37"(#loc91))
+#loc199 = loc("tmp38"(#loc92))
diff --git a/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..53961e330d1b0b3c12cf7204ff1cc228278ac7cd
--- /dev/null
+++ b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir
@@ -0,0 +1,287 @@
+#blocked = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [1, 0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0)
+#loc70 = loc("in_ptr0"(#loc))
+#loc71 = loc("in_ptr1"(#loc))
+#loc72 = loc("in_ptr2"(#loc))
+#loc73 = loc("in_ptr3"(#loc))
+#loc74 = loc("in_ptr4"(#loc))
+#loc75 = loc("in_ptr5"(#loc))
+#loc76 = loc("out_ptr0"(#loc))
+#loc77 = loc("ynumel"(#loc))
+#loc78 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<-256> : tensor<32x1xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<-256> : tensor<32x1xi32, #blocked1> loc(#loc1)
+    %cst_1 = arith.constant dense<12288> : tensor<32x1xi32, #blocked1> loc(#loc1)
+    %cst_2 = arith.constant dense<128> : tensor<32x1xi32, #blocked1> loc(#loc1)
+    %cst_3 = arith.constant dense<256> : tensor<32x1xi64, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<256> : tensor<32x1xi64, #blocked1> loc(#loc1)
+    %cst_5 = arith.constant dense<32> : tensor<32x1xi32, #blocked> loc(#loc1)
+    %cst_6 = arith.constant dense<32> : tensor<32x1xi32, #blocked1> loc(#loc1)
+    %cst_7 = arith.constant dense<128> : tensor<1x32xi32, #blocked> loc(#loc1)
+    %cst_8 = arith.constant dense<128> : tensor<1x32xi32, #blocked1> loc(#loc1)
+    %cst_9 = arith.constant dense<73728> : tensor<32x1xi32, #blocked> loc(#loc1)
+    %cst_10 = arith.constant dense<73728> : tensor<32x1xi32, #blocked1> loc(#loc1)
+    %c32_i32 = arith.constant 32 : i32 loc(#loc1)
+    %cst_11 = arith.constant dense<0.000000e+00> : tensor<32x32xbf16, #blocked1> loc(#loc1)
+    %cst_12 = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #blocked> loc(#loc1)
+    %cst_13 = arith.constant dense<9.99999997E-7> : tensor<32x32xf32, #blocked> loc(#loc1)
+    %cst_14 = arith.constant dense<1.280000e+02> : tensor<32x32xf32, #blocked> loc(#loc1)
+    %cst_15 = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #blocked1> loc(#loc1)
+    %yoffset = tt.get_program_id y : i32 loc(#loc79)
+    %yoffset_16 = tt.get_program_id z : i32 loc(#loc80)
+    %yoffset_17 = tt.get_num_programs y : i32 loc(#loc81)
+    %yoffset_18 = arith.muli %yoffset_16, %yoffset_17 : i32 loc(#loc82)
+    %yoffset_19 = arith.addi %yoffset, %yoffset_18 : i32 loc(#loc83)
+    %yoffset_20 = arith.muli %yoffset_19, %c32_i32 : i32 loc(#loc84)
+    %yindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc85)
+    %yindex_21 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc85)
+    %yindex_22 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1xi32, #blocked1> loc(#loc85)
+    %yindex_23 = tt.expand_dims %yindex_21 {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> loc(#loc85)
+    %yindex_24 = tt.splat %yoffset_20 : i32 -> tensor<32x1xi32, #blocked1> loc(#loc86)
+    %yindex_25 = tt.splat %yoffset_20 : i32 -> tensor<32x1xi32, #blocked> loc(#loc86)
+    %yindex_26 = arith.addi %yindex_24, %yindex_22 : tensor<32x1xi32, #blocked1> loc(#loc86)
+    %yindex_27 = arith.addi %yindex_25, %yindex_23 : tensor<32x1xi32, #blocked> loc(#loc86)
+    %ymask = arith.cmpi slt, %yindex_26, %cst_10 : tensor<32x1xi32, #blocked1> loc(#loc87)
+    %ymask_28 = arith.cmpi slt, %yindex_27, %cst_9 : tensor<32x1xi32, #blocked> loc(#loc87)
+    %xoffset = tt.get_program_id x : i32 loc(#loc88)
+    %xoffset_29 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc89)
+    %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc90)
+    %xindex_30 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc90)
+    %xindex_31 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> loc(#loc90)
+    %xindex_32 = tt.expand_dims %xindex_30 {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> loc(#loc90)
+    %xindex_33 = tt.splat %xoffset_29 : i32 -> tensor<1x32xi32, #blocked1> loc(#loc91)
+    %xindex_34 = tt.splat %xoffset_29 : i32 -> tensor<1x32xi32, #blocked> loc(#loc91)
+    %xindex_35 = arith.addi %xindex_33, %xindex_31 : tensor<1x32xi32, #blocked1> loc(#loc91)
+    %xindex_36 = arith.addi %xindex_34, %xindex_32 : tensor<1x32xi32, #blocked> loc(#loc91)
+    %xmask = arith.cmpi slt, %xindex_35, %cst_8 : tensor<1x32xi32, #blocked1> loc(#loc92)
+    %xmask_37 = arith.cmpi slt, %xindex_36, %cst_7 : tensor<1x32xi32, #blocked> loc(#loc92)
+    %y1 = arith.divsi %yindex_26, %cst_6 : tensor<32x1xi32, #blocked1> loc(#loc93)
+    %y1_38 = arith.divsi %yindex_27, %cst_5 : tensor<32x1xi32, #blocked> loc(#loc93)
+    %y0 = arith.remsi %yindex_26, %cst_6 : tensor<32x1xi32, #blocked1> loc(#loc94)
+    %y0_39 = arith.remsi %yindex_27, %cst_5 : tensor<32x1xi32, #blocked> loc(#loc94)
+    %tmp4 = arith.extsi %y1 : tensor<32x1xi32, #blocked1> to tensor<32x1xi64, #blocked1> loc(#loc95)
+    %tmp4_40 = arith.extsi %y1_38 : tensor<32x1xi32, #blocked> to tensor<32x1xi64, #blocked> loc(#loc95)
+    %tmp4_41 = arith.cmpi slt, %tmp4, %cst_4 : tensor<32x1xi64, #blocked1> loc(#loc95)
+    %tmp4_42 = arith.cmpi slt, %tmp4_40, %cst_3 : tensor<32x1xi64, #blocked> loc(#loc95)
+    %tmp5 = arith.muli %y0, %cst_2 : tensor<32x1xi32, #blocked1> loc(#loc96)
+    %tmp5_43 = tt.broadcast %xindex_35 : tensor<1x32xi32, #blocked1> -> tensor<32x32xi32, #blocked1> loc(#loc97)
+    %tmp5_44 = tt.broadcast %tmp5 : tensor<32x1xi32, #blocked1> -> tensor<32x32xi32, #blocked1> loc(#loc97)
+    %tmp5_45 = arith.addi %tmp5_43, %tmp5_44 : tensor<32x32xi32, #blocked1> loc(#loc97)
+    %tmp5_46 = arith.muli %y1, %cst_1 : tensor<32x1xi32, #blocked1> loc(#loc98)
+    %tmp5_47 = tt.broadcast %tmp5_46 : tensor<32x1xi32, #blocked1> -> tensor<32x32xi32, #blocked1> loc(#loc99)
+    %tmp5_48 = arith.addi %tmp5_45, %tmp5_47 : tensor<32x32xi32, #blocked1> loc(#loc99)
+    %tmp5_49 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<32x32x!tt.ptr<bf16>, #blocked1> loc(#loc100)
+    %tmp5_50 = tt.addptr %tmp5_49, %tmp5_48 : tensor<32x32x!tt.ptr<bf16>, #blocked1>, tensor<32x32xi32, #blocked1> loc(#loc100)
+    %tmp5_51 = tt.broadcast %tmp4_41 : tensor<32x1xi1, #blocked1> -> tensor<32x32xi1, #blocked1> loc(#loc101)
+    %tmp5_52 = tt.broadcast %tmp4_42 : tensor<32x1xi1, #blocked> -> tensor<32x32xi1, #blocked> loc(#loc101)
+    %tmp5_53 = tt.broadcast %xmask : tensor<1x32xi1, #blocked1> -> tensor<32x32xi1, #blocked1> loc(#loc101)
+    %tmp5_54 = tt.broadcast %xmask_37 : tensor<1x32xi1, #blocked> -> tensor<32x32xi1, #blocked> loc(#loc101)
+    %tmp5_55 = arith.andi %tmp5_51, %tmp5_53 : tensor<32x32xi1, #blocked1> loc(#loc101)
+    %tmp5_56 = arith.andi %tmp5_52, %tmp5_54 : tensor<32x32xi1, #blocked> loc(#loc101)
+    %tmp5_57 = tt.broadcast %ymask : tensor<32x1xi1, #blocked1> -> tensor<32x32xi1, #blocked1> loc(#loc102)
+    %tmp5_58 = tt.broadcast %ymask_28 : tensor<32x1xi1, #blocked> -> tensor<32x32xi1, #blocked> loc(#loc102)
+    %tmp5_59 = arith.andi %tmp5_55, %tmp5_57 : tensor<32x32xi1, #blocked1> loc(#loc102)
+    %tmp5_60 = arith.andi %tmp5_56, %tmp5_58 : tensor<32x32xi1, #blocked> loc(#loc102)
+    %tmp5_61 = tt.load %tmp5_50, %tmp5_59, %cst_11 evictionPolicy = evict_last : tensor<32x32x!tt.ptr<bf16>, #blocked1> loc(#loc103)
+    %tmp5_62 = ttg.convert_layout %tmp5_61 : tensor<32x32xbf16, #blocked1> -> tensor<32x32xbf16, #blocked> loc(#loc104)
+    %tmp5_63 = arith.extf %tmp5_62 : tensor<32x32xbf16, #blocked> to tensor<32x32xf32, #blocked> loc(#loc104)
+    %tmp7 = arith.muli %y1_38, %cst_5 : tensor<32x1xi32, #blocked> loc(#loc105)
+    %tmp7_64 = arith.addi %y0_39, %tmp7 : tensor<32x1xi32, #blocked> loc(#loc106)
+    %tmp7_65 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<32x1x!tt.ptr<f32>, #blocked> loc(#loc107)
+    %tmp7_66 = tt.addptr %tmp7_65, %tmp7_64 : tensor<32x1x!tt.ptr<f32>, #blocked>, tensor<32x1xi32, #blocked> loc(#loc107)
+    %tmp7_67 = tt.broadcast %tmp7_66 : tensor<32x1x!tt.ptr<f32>, #blocked> -> tensor<32x32x!tt.ptr<f32>, #blocked> loc(#loc107)
+    %tmp7_68 = tt.load %tmp7_67, %tmp5_60, %cst_12 evictionPolicy = evict_last : tensor<32x32x!tt.ptr<f32>, #blocked> loc(#loc108)
+    %tmp9 = arith.divf %tmp7_68, %cst_14 : tensor<32x32xf32, #blocked> loc(#loc109)
+    %tmp11 = arith.addf %tmp9, %cst_13 : tensor<32x32xf32, #blocked> loc(#loc110)
+    %tmp12 = tt.extern_elementwise %tmp11 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<32x32xf32, #blocked>) -> tensor<32x32xf32, #blocked> loc(#loc111)
+    %tmp13 = arith.mulf %tmp5_63, %tmp12 : tensor<32x32xf32, #blocked> loc(#loc112)
+    %tmp13_69 = ttg.convert_layout %tmp13 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #blocked1> loc(#loc112)
+    %tmp14 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x32x!tt.ptr<bf16>, #blocked1> loc(#loc113)
+    %tmp14_70 = tt.addptr %tmp14, %xindex_35 : tensor<1x32x!tt.ptr<bf16>, #blocked1>, tensor<1x32xi32, #blocked1> loc(#loc113)
+    %tmp14_71 = tt.broadcast %tmp14_70 : tensor<1x32x!tt.ptr<bf16>, #blocked1> -> tensor<32x32x!tt.ptr<bf16>, #blocked1> loc(#loc113)
+    %tmp14_72 = tt.load %tmp14_71, %tmp5_59, %cst_11 evictionPolicy = evict_last : tensor<32x32x!tt.ptr<bf16>, #blocked1> loc(#loc114)
+    %tmp14_73 = arith.extf %tmp14_72 : tensor<32x32xbf16, #blocked1> to tensor<32x32xf32, #blocked1> loc(#loc115)
+    %tmp16 = arith.mulf %tmp13_69, %tmp14_73 : tensor<32x32xf32, #blocked1> loc(#loc116)
+    %tmp20 = arith.cmpi sge, %tmp4, %cst_4 : tensor<32x1xi64, #blocked1> loc(#loc117)
+    %tmp20_74 = arith.cmpi sge, %tmp4_40, %cst_3 : tensor<32x1xi64, #blocked> loc(#loc117)
+    %tmp23 = arith.addi %y1, %cst_0 : tensor<32x1xi32, #blocked1> loc(#loc118)
+    %tmp23_75 = arith.addi %y1_38, %cst : tensor<32x1xi32, #blocked> loc(#loc118)
+    %tmp23_76 = arith.muli %tmp23, %cst_1 : tensor<32x1xi32, #blocked1> loc(#loc119)
+    %tmp23_77 = tt.broadcast %tmp23_76 : tensor<32x1xi32, #blocked1> -> tensor<32x32xi32, #blocked1> loc(#loc120)
+    %tmp23_78 = arith.addi %tmp5_45, %tmp23_77 : tensor<32x32xi32, #blocked1> loc(#loc120)
+    %tmp23_79 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<32x32x!tt.ptr<bf16>, #blocked1> loc(#loc121)
+    %tmp23_80 = tt.addptr %tmp23_79, %tmp23_78 : tensor<32x32x!tt.ptr<bf16>, #blocked1>, tensor<32x32xi32, #blocked1> loc(#loc121)
+    %tmp23_81 = tt.broadcast %tmp20 : tensor<32x1xi1, #blocked1> -> tensor<32x32xi1, #blocked1> loc(#loc122)
+    %tmp23_82 = tt.broadcast %tmp20_74 : tensor<32x1xi1, #blocked> -> tensor<32x32xi1, #blocked> loc(#loc122)
+    %tmp23_83 = arith.andi %tmp23_81, %tmp5_53 : tensor<32x32xi1, #blocked1> loc(#loc122)
+    %tmp23_84 = arith.andi %tmp23_82, %tmp5_54 : tensor<32x32xi1, #blocked> loc(#loc122)
+    %tmp23_85 = arith.andi %tmp23_83, %tmp5_57 : tensor<32x32xi1, #blocked1> loc(#loc123)
+    %tmp23_86 = arith.andi %tmp23_84, %tmp5_58 : tensor<32x32xi1, #blocked> loc(#loc123)
+    %tmp23_87 = tt.load %tmp23_80, %tmp23_85, %cst_11 evictionPolicy = evict_last : tensor<32x32x!tt.ptr<bf16>, #blocked1> loc(#loc124)
+    %tmp23_88 = ttg.convert_layout %tmp23_87 : tensor<32x32xbf16, #blocked1> -> tensor<32x32xbf16, #blocked> loc(#loc125)
+    %tmp23_89 = arith.extf %tmp23_88 : tensor<32x32xbf16, #blocked> to tensor<32x32xf32, #blocked> loc(#loc125)
+    %tmp25 = arith.muli %tmp23_75, %cst_5 : tensor<32x1xi32, #blocked> loc(#loc126)
+    %tmp25_90 = arith.addi %y0_39, %tmp25 : tensor<32x1xi32, #blocked> loc(#loc127)
+    %tmp25_91 = tt.splat %in_ptr4 : !tt.ptr<f32> -> tensor<32x1x!tt.ptr<f32>, #blocked> loc(#loc128)
+    %tmp25_92 = tt.addptr %tmp25_91, %tmp25_90 : tensor<32x1x!tt.ptr<f32>, #blocked>, tensor<32x1xi32, #blocked> loc(#loc128)
+    %tmp25_93 = tt.broadcast %tmp25_92 : tensor<32x1x!tt.ptr<f32>, #blocked> -> tensor<32x32x!tt.ptr<f32>, #blocked> loc(#loc128)
+    %tmp25_94 = tt.load %tmp25_93, %tmp23_86, %cst_12 evictionPolicy = evict_last : tensor<32x32x!tt.ptr<f32>, #blocked> loc(#loc129)
+    %tmp27 = arith.divf %tmp25_94, %cst_14 : tensor<32x32xf32, #blocked> loc(#loc130)
+    %tmp29 = arith.addf %tmp27, %cst_13 : tensor<32x32xf32, #blocked> loc(#loc131)
+    %tmp30 = tt.extern_elementwise %tmp29 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<32x32xf32, #blocked>) -> tensor<32x32xf32, #blocked> loc(#loc132)
+    %tmp31 = arith.mulf %tmp23_89, %tmp30 : tensor<32x32xf32, #blocked> loc(#loc133)
+    %tmp31_95 = ttg.convert_layout %tmp31 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #blocked1> loc(#loc133)
+    %tmp32 = tt.splat %in_ptr5 : !tt.ptr<bf16> -> tensor<1x32x!tt.ptr<bf16>, #blocked1> loc(#loc134)
+    %tmp32_96 = tt.addptr %tmp32, %xindex_35 : tensor<1x32x!tt.ptr<bf16>, #blocked1>, tensor<1x32xi32, #blocked1> loc(#loc134)
+    %tmp32_97 = tt.broadcast %tmp32_96 : tensor<1x32x!tt.ptr<bf16>, #blocked1> -> tensor<32x32x!tt.ptr<bf16>, #blocked1> loc(#loc134)
+    %tmp32_98 = tt.load %tmp32_97, %tmp23_85, %cst_11 evictionPolicy = evict_last : tensor<32x32x!tt.ptr<bf16>, #blocked1> loc(#loc135)
+    %tmp32_99 = arith.extf %tmp32_98 : tensor<32x32xbf16, #blocked1> to tensor<32x32xf32, #blocked1> loc(#loc136)
+    %tmp34 = arith.mulf %tmp31_95, %tmp32_99 : tensor<32x32xf32, #blocked1> loc(#loc137)
+    %tmp37 = arith.select %tmp23_81, %tmp34, %cst_15 : tensor<32x32xi1, #blocked1>, tensor<32x32xf32, #blocked1> loc(#loc138)
+    %tmp38 = arith.select %tmp5_51, %tmp16, %tmp37 : tensor<32x32xi1, #blocked1>, tensor<32x32xf32, #blocked1> loc(#loc141)
+    %0 = arith.muli %yindex_26, %cst_2 : tensor<32x1xi32, #blocked1> loc(#loc64)
+    %1 = tt.broadcast %0 : tensor<32x1xi32, #blocked1> -> tensor<32x32xi32, #blocked1> loc(#loc65)
+    %2 = arith.addi %tmp5_43, %1 : tensor<32x32xi32, #blocked1> loc(#loc65)
+    %3 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<32x32x!tt.ptr<bf16>, #blocked1> loc(#loc66)
+    %4 = tt.addptr %3, %2 : tensor<32x32x!tt.ptr<bf16>, #blocked1>, tensor<32x32xi32, #blocked1> loc(#loc66)
+    %5 = arith.andi %tmp5_53, %tmp5_57 : tensor<32x32xi1, #blocked1> loc(#loc67)
+    %6 = arith.truncf %tmp38 : tensor<32x32xf32, #blocked1> to tensor<32x32xbf16, #blocked1> loc(#loc68)
+    tt.store %4, %6, %5 : tensor<32x32x!tt.ptr<bf16>, #blocked1> loc(#loc68)
+    tt.return loc(#loc69)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4)
+#loc79 = loc("yoffset"(#loc2))
+#loc80 = loc("yoffset"(#loc3))
+#loc81 = loc("yoffset"(#loc4))
+#loc82 = loc("yoffset"(#loc5))
+#loc83 = loc("yoffset"(#loc6))
+#loc84 = loc("yoffset"(#loc7))
+#loc85 = loc("yindex"(#loc8))
+#loc86 = loc("yindex"(#loc9))
+#loc87 = loc("ymask"(#loc10))
+#loc88 = loc("xoffset"(#loc11))
+#loc89 = loc("xoffset"(#loc12))
+#loc90 = loc("xindex"(#loc13))
+#loc91 = loc("xindex"(#loc14))
+#loc92 = loc("xmask"(#loc15))
+#loc93 = loc("y1"(#loc16))
+#loc94 = loc("y0"(#loc17))
+#loc95 = loc("tmp4"(#loc18))
+#loc96 = loc("tmp5"(#loc19))
+#loc97 = loc("tmp5"(#loc20))
+#loc98 = loc("tmp5"(#loc21))
+#loc99 = loc("tmp5"(#loc22))
+#loc100 = loc("tmp5"(#loc23))
+#loc101 = loc("tmp5"(#loc24))
+#loc102 = loc("tmp5"(#loc25))
+#loc103 = loc("tmp5"(#loc26))
+#loc104 = loc("tmp5"(#loc27))
+#loc105 = loc("tmp7"(#loc28))
+#loc106 = loc("tmp7"(#loc29))
+#loc107 = loc("tmp7"(#loc30))
+#loc108 = loc("tmp7"(#loc31))
+#loc109 = loc("tmp9"(#loc32))
+#loc110 = loc("tmp11"(#loc33))
+#loc111 = loc("tmp12"(#loc34))
+#loc112 = loc("tmp13"(#loc35))
+#loc113 = loc("tmp14"(#loc36))
+#loc114 = loc("tmp14"(#loc37))
+#loc115 = loc("tmp14"(#loc38))
+#loc116 = loc("tmp16"(#loc39))
+#loc117 = loc("tmp20"(#loc40))
+#loc118 = loc("tmp23"(#loc41))
+#loc119 = loc("tmp23"(#loc42))
+#loc120 = loc("tmp23"(#loc43))
+#loc121 = loc("tmp23"(#loc44))
+#loc122 = loc("tmp23"(#loc45))
+#loc123 = loc("tmp23"(#loc46))
+#loc124 = loc("tmp23"(#loc47))
+#loc125 = loc("tmp23"(#loc48))
+#loc126 = loc("tmp25"(#loc49))
+#loc127 = loc("tmp25"(#loc50))
+#loc128 = loc("tmp25"(#loc51))
+#loc129 = loc("tmp25"(#loc52))
+#loc130 = loc("tmp27"(#loc53))
+#loc131 = loc("tmp29"(#loc54))
+#loc132 = loc("tmp30"(#loc55))
+#loc133 = loc("tmp31"(#loc56))
+#loc134 = loc("tmp32"(#loc57))
+#loc135 = loc("tmp32"(#loc58))
+#loc136 = loc("tmp32"(#loc59))
+#loc137 = loc("tmp34"(#loc60))
+#loc138 = loc("tmp37"(#loc61))
+#loc139 = loc("tmp38"(#loc62))
+#loc140 = loc("tmp19"(#loc63))
+#loc141 = loc(fused[#loc139, #loc140])
diff --git a/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttir b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..f48e3f6109303d1ab499e3be6e39babdceba0dce
--- /dev/null
+++ b/triton/K67YKHK7SDRMJU7RXHVXJT2KOUZAO4I5423SJJDUEAFURSHREZEQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttir
@@ -0,0 +1,252 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0)
+#loc71 = loc("in_ptr0"(#loc))
+#loc72 = loc("in_ptr1"(#loc))
+#loc73 = loc("in_ptr2"(#loc))
+#loc74 = loc("in_ptr3"(#loc))
+#loc75 = loc("in_ptr4"(#loc))
+#loc76 = loc("in_ptr5"(#loc))
+#loc77 = loc("out_ptr0"(#loc))
+#loc78 = loc("ynumel"(#loc))
+#loc79 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<32x32xbf16> loc(#loc1)
+    %cst_0 = arith.constant dense<-256> : tensor<32x1xi32> loc(#loc1)
+    %cst_1 = arith.constant dense<9.99999997E-7> : tensor<32x32xf32> loc(#loc1)
+    %cst_2 = arith.constant dense<1.280000e+02> : tensor<32x32xf32> loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<32x32xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<12288> : tensor<32x1xi32> loc(#loc1)
+    %cst_5 = arith.constant dense<128> : tensor<32x1xi32> loc(#loc1)
+    %cst_6 = arith.constant dense<256> : tensor<32x1xi64> loc(#loc1)
+    %cst_7 = arith.constant dense<32> : tensor<32x1xi32> loc(#loc1)
+    %xmask = arith.constant dense<128> : tensor<1x32xi32> loc(#loc80)
+    %ymask = arith.constant dense<73728> : tensor<32x1xi32> loc(#loc81)
+    %c32_i32 = arith.constant 32 : i32 loc(#loc1)
+    %yoffset = tt.get_program_id y : i32 loc(#loc82)
+    %yoffset_8 = tt.get_program_id z : i32 loc(#loc83)
+    %yoffset_9 = tt.get_num_programs y : i32 loc(#loc84)
+    %yoffset_10 = arith.muli %yoffset_8, %yoffset_9 : i32 loc(#loc85)
+    %yoffset_11 = arith.addi %yoffset, %yoffset_10 : i32 loc(#loc86)
+    %yoffset_12 = arith.muli %yoffset_11, %c32_i32 : i32 loc(#loc87)
+    %yindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc88)
+    %yindex_13 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc89)
+    %yindex_14 = tt.splat %yoffset_12 : i32 -> tensor<32x1xi32> loc(#loc90)
+    %yindex_15 = arith.addi %yindex_14, %yindex_13 : tensor<32x1xi32> loc(#loc90)
+    %ymask_16 = arith.cmpi slt, %yindex_15, %ymask : tensor<32x1xi32> loc(#loc81)
+    %xoffset = tt.get_program_id x : i32 loc(#loc91)
+    %xoffset_17 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc92)
+    %xindex = tt.expand_dims %yindex {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc93)
+    %xindex_18 = tt.splat %xoffset_17 : i32 -> tensor<1x32xi32> loc(#loc94)
+    %xindex_19 = arith.addi %xindex_18, %xindex : tensor<1x32xi32> loc(#loc94)
+    %xmask_20 = arith.cmpi slt, %xindex_19, %xmask : tensor<1x32xi32> loc(#loc80)
+    %y1 = arith.divsi %yindex_15, %cst_7 : tensor<32x1xi32> loc(#loc95)
+    %y0 = arith.remsi %yindex_15, %cst_7 : tensor<32x1xi32> loc(#loc96)
+    %tmp4 = arith.extsi %y1 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc97)
+    %tmp4_21 = arith.cmpi slt, %tmp4, %cst_6 : tensor<32x1xi64> loc(#loc97)
+    %tmp5 = arith.muli %y0, %cst_5 : tensor<32x1xi32> loc(#loc98)
+    %tmp5_22 = tt.broadcast %xindex_19 : tensor<1x32xi32> -> tensor<32x32xi32> loc(#loc99)
+    %tmp5_23 = tt.broadcast %tmp5 : tensor<32x1xi32> -> tensor<32x32xi32> loc(#loc99)
+    %tmp5_24 = arith.addi %tmp5_22, %tmp5_23 : tensor<32x32xi32> loc(#loc99)
+    %tmp5_25 = arith.muli %y1, %cst_4 : tensor<32x1xi32> loc(#loc100)
+    %tmp5_26 = tt.broadcast %tmp5_25 : tensor<32x1xi32> -> tensor<32x32xi32> loc(#loc101)
+    %tmp5_27 = arith.addi %tmp5_24, %tmp5_26 : tensor<32x32xi32> loc(#loc101)
+    %tmp5_28 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<32x32x!tt.ptr<bf16>> loc(#loc102)
+    %tmp5_29 = tt.addptr %tmp5_28, %tmp5_27 : tensor<32x32x!tt.ptr<bf16>>, tensor<32x32xi32> loc(#loc102)
+    %tmp5_30 = tt.broadcast %tmp4_21 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc103)
+    %tmp5_31 = tt.broadcast %xmask_20 : tensor<1x32xi1> -> tensor<32x32xi1> loc(#loc103)
+    %tmp5_32 = arith.andi %tmp5_30, %tmp5_31 : tensor<32x32xi1> loc(#loc103)
+    %tmp5_33 = tt.broadcast %ymask_16 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc104)
+    %tmp5_34 = arith.andi %tmp5_32, %tmp5_33 : tensor<32x32xi1> loc(#loc104)
+    %tmp5_35 = tt.load %tmp5_29, %tmp5_34, %cst evictionPolicy = evict_last : tensor<32x32x!tt.ptr<bf16>> loc(#loc105)
+    %tmp5_36 = arith.extf %tmp5_35 : tensor<32x32xbf16> to tensor<32x32xf32> loc(#loc106)
+    %tmp7 = arith.muli %y1, %cst_7 : tensor<32x1xi32> loc(#loc107)
+    %tmp7_37 = arith.addi %y0, %tmp7 : tensor<32x1xi32> loc(#loc108)
+    %tmp7_38 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<32x1x!tt.ptr<f32>> loc(#loc109)
+    %tmp7_39 = tt.addptr %tmp7_38, %tmp7_37 : tensor<32x1x!tt.ptr<f32>>, tensor<32x1xi32> loc(#loc109)
+    %tmp7_40 = tt.broadcast %tmp7_39 : tensor<32x1x!tt.ptr<f32>> -> tensor<32x32x!tt.ptr<f32>> loc(#loc109)
+    %tmp7_41 = tt.load %tmp7_40, %tmp5_34, %cst_3 evictionPolicy = evict_last : tensor<32x32x!tt.ptr<f32>> loc(#loc110)
+    %tmp9 = arith.divf %tmp7_41, %cst_2 : tensor<32x32xf32> loc(#loc111)
+    %tmp11 = arith.addf %tmp9, %cst_1 : tensor<32x32xf32> loc(#loc112)
+    %tmp12 = tt.extern_elementwise %tmp11 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<32x32xf32>) -> tensor<32x32xf32> loc(#loc113)
+    %tmp13 = arith.mulf %tmp5_36, %tmp12 : tensor<32x32xf32> loc(#loc114)
+    %tmp14 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x32x!tt.ptr<bf16>> loc(#loc115)
+    %tmp14_42 = tt.addptr %tmp14, %xindex_19 : tensor<1x32x!tt.ptr<bf16>>, tensor<1x32xi32> loc(#loc115)
+    %tmp14_43 = tt.broadcast %tmp14_42 : tensor<1x32x!tt.ptr<bf16>> -> tensor<32x32x!tt.ptr<bf16>> loc(#loc115)
+    %tmp14_44 = tt.load %tmp14_43, %tmp5_34, %cst evictionPolicy = evict_last : tensor<32x32x!tt.ptr<bf16>> loc(#loc116)
+    %tmp14_45 = arith.extf %tmp14_44 : tensor<32x32xbf16> to tensor<32x32xf32> loc(#loc117)
+    %tmp16 = arith.mulf %tmp13, %tmp14_45 : tensor<32x32xf32> loc(#loc118)
+    %tmp19 = arith.select %tmp5_30, %tmp16, %cst_3 : tensor<32x32xi1>, tensor<32x32xf32> loc(#loc119)
+    %tmp20 = arith.cmpi sge, %tmp4, %cst_6 : tensor<32x1xi64> loc(#loc120)
+    %tmp23 = arith.addi %y1, %cst_0 : tensor<32x1xi32> loc(#loc121)
+    %tmp23_46 = arith.muli %tmp23, %cst_4 : tensor<32x1xi32> loc(#loc122)
+    %tmp23_47 = tt.broadcast %tmp23_46 : tensor<32x1xi32> -> tensor<32x32xi32> loc(#loc123)
+    %tmp23_48 = arith.addi %tmp5_24, %tmp23_47 : tensor<32x32xi32> loc(#loc123)
+    %tmp23_49 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<32x32x!tt.ptr<bf16>> loc(#loc124)
+    %tmp23_50 = tt.addptr %tmp23_49, %tmp23_48 : tensor<32x32x!tt.ptr<bf16>>, tensor<32x32xi32> loc(#loc124)
+    %tmp23_51 = tt.broadcast %tmp20 : tensor<32x1xi1> -> tensor<32x32xi1> loc(#loc125)
+    %tmp23_52 = arith.andi %tmp23_51, %tmp5_31 : tensor<32x32xi1> loc(#loc125)
+    %tmp23_53 = arith.andi %tmp23_52, %tmp5_33 : tensor<32x32xi1> loc(#loc126)
+    %tmp23_54 = tt.load %tmp23_50, %tmp23_53, %cst evictionPolicy = evict_last : tensor<32x32x!tt.ptr<bf16>> loc(#loc127)
+    %tmp23_55 = arith.extf %tmp23_54 : tensor<32x32xbf16> to tensor<32x32xf32> loc(#loc128)
+    %tmp25 = arith.muli %tmp23, %cst_7 : tensor<32x1xi32> loc(#loc129)
+    %tmp25_56 = arith.addi %y0, %tmp25 : tensor<32x1xi32> loc(#loc130)
+    %tmp25_57 = tt.splat %in_ptr4 : !tt.ptr<f32> -> tensor<32x1x!tt.ptr<f32>> loc(#loc131)
+    %tmp25_58 = tt.addptr %tmp25_57, %tmp25_56 : tensor<32x1x!tt.ptr<f32>>, tensor<32x1xi32> loc(#loc131)
+    %tmp25_59 = tt.broadcast %tmp25_58 : tensor<32x1x!tt.ptr<f32>> -> tensor<32x32x!tt.ptr<f32>> loc(#loc131)
+    %tmp25_60 = tt.load %tmp25_59, %tmp23_53, %cst_3 evictionPolicy = evict_last : tensor<32x32x!tt.ptr<f32>> loc(#loc132)
+    %tmp27 = arith.divf %tmp25_60, %cst_2 : tensor<32x32xf32> loc(#loc133)
+    %tmp29 = arith.addf %tmp27, %cst_1 : tensor<32x32xf32> loc(#loc134)
+    %tmp30 = tt.extern_elementwise %tmp29 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<32x32xf32>) -> tensor<32x32xf32> loc(#loc135)
+    %tmp31 = arith.mulf %tmp23_55, %tmp30 : tensor<32x32xf32> loc(#loc136)
+    %tmp32 = tt.splat %in_ptr5 : !tt.ptr<bf16> -> tensor<1x32x!tt.ptr<bf16>> loc(#loc137)
+    %tmp32_61 = tt.addptr %tmp32, %xindex_19 : tensor<1x32x!tt.ptr<bf16>>, tensor<1x32xi32> loc(#loc137)
+    %tmp32_62 = tt.broadcast %tmp32_61 : tensor<1x32x!tt.ptr<bf16>> -> tensor<32x32x!tt.ptr<bf16>> loc(#loc137)
+    %tmp32_63 = tt.load %tmp32_62, %tmp23_53, %cst evictionPolicy = evict_last : tensor<32x32x!tt.ptr<bf16>> loc(#loc138)
+    %tmp32_64 = arith.extf %tmp32_63 : tensor<32x32xbf16> to tensor<32x32xf32> loc(#loc139)
+    %tmp34 = arith.mulf %tmp31, %tmp32_64 : tensor<32x32xf32> loc(#loc140)
+    %tmp37 = arith.select %tmp23_51, %tmp34, %cst_3 : tensor<32x32xi1>, tensor<32x32xf32> loc(#loc141)
+    %tmp38 = arith.select %tmp5_30, %tmp19, %tmp37 : tensor<32x32xi1>, tensor<32x32xf32> loc(#loc142)
+    %0 = arith.muli %yindex_15, %cst_5 : tensor<32x1xi32> loc(#loc65)
+    %1 = tt.broadcast %0 : tensor<32x1xi32> -> tensor<32x32xi32> loc(#loc66)
+    %2 = arith.addi %tmp5_22, %1 : tensor<32x32xi32> loc(#loc66)
+    %3 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<32x32x!tt.ptr<bf16>> loc(#loc67)
+    %4 = tt.addptr %3, %2 : tensor<32x32x!tt.ptr<bf16>>, tensor<32x32xi32> loc(#loc67)
+    %5 = arith.andi %tmp5_31, %tmp5_33 : tensor<32x32xi1> loc(#loc68)
+    %6 = arith.truncf %tmp38 : tensor<32x32xf32> to tensor<32x32xbf16> loc(#loc69)
+    tt.store %4, %6, %5 : tensor<32x32x!tt.ptr<bf16>> loc(#loc69)
+    tt.return loc(#loc70)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:36)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4)
+#loc80 = loc("xmask"(#loc2))
+#loc81 = loc("ymask"(#loc3))
+#loc82 = loc("yoffset"(#loc4))
+#loc83 = loc("yoffset"(#loc5))
+#loc84 = loc("yoffset"(#loc6))
+#loc85 = loc("yoffset"(#loc7))
+#loc86 = loc("yoffset"(#loc8))
+#loc87 = loc("yoffset"(#loc9))
+#loc88 = loc("yindex"(#loc10))
+#loc89 = loc("yindex"(#loc11))
+#loc90 = loc("yindex"(#loc12))
+#loc91 = loc("xoffset"(#loc13))
+#loc92 = loc("xoffset"(#loc14))
+#loc93 = loc("xindex"(#loc15))
+#loc94 = loc("xindex"(#loc16))
+#loc95 = loc("y1"(#loc17))
+#loc96 = loc("y0"(#loc18))
+#loc97 = loc("tmp4"(#loc19))
+#loc98 = loc("tmp5"(#loc20))
+#loc99 = loc("tmp5"(#loc21))
+#loc100 = loc("tmp5"(#loc22))
+#loc101 = loc("tmp5"(#loc23))
+#loc102 = loc("tmp5"(#loc24))
+#loc103 = loc("tmp5"(#loc25))
+#loc104 = loc("tmp5"(#loc26))
+#loc105 = loc("tmp5"(#loc27))
+#loc106 = loc("tmp5"(#loc28))
+#loc107 = loc("tmp7"(#loc29))
+#loc108 = loc("tmp7"(#loc30))
+#loc109 = loc("tmp7"(#loc31))
+#loc110 = loc("tmp7"(#loc32))
+#loc111 = loc("tmp9"(#loc33))
+#loc112 = loc("tmp11"(#loc34))
+#loc113 = loc("tmp12"(#loc35))
+#loc114 = loc("tmp13"(#loc36))
+#loc115 = loc("tmp14"(#loc37))
+#loc116 = loc("tmp14"(#loc38))
+#loc117 = loc("tmp14"(#loc39))
+#loc118 = loc("tmp16"(#loc40))
+#loc119 = loc("tmp19"(#loc41))
+#loc120 = loc("tmp20"(#loc42))
+#loc121 = loc("tmp23"(#loc43))
+#loc122 = loc("tmp23"(#loc44))
+#loc123 = loc("tmp23"(#loc45))
+#loc124 = loc("tmp23"(#loc46))
+#loc125 = loc("tmp23"(#loc47))
+#loc126 = loc("tmp23"(#loc48))
+#loc127 = loc("tmp23"(#loc49))
+#loc128 = loc("tmp23"(#loc50))
+#loc129 = loc("tmp25"(#loc51))
+#loc130 = loc("tmp25"(#loc52))
+#loc131 = loc("tmp25"(#loc53))
+#loc132 = loc("tmp25"(#loc54))
+#loc133 = loc("tmp27"(#loc55))
+#loc134 = loc("tmp29"(#loc56))
+#loc135 = loc("tmp30"(#loc57))
+#loc136 = loc("tmp31"(#loc58))
+#loc137 = loc("tmp32"(#loc59))
+#loc138 = loc("tmp32"(#loc60))
+#loc139 = loc("tmp32"(#loc61))
+#loc140 = loc("tmp34"(#loc62))
+#loc141 = loc("tmp37"(#loc63))
+#loc142 = loc("tmp38"(#loc64))
diff --git a/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/__grp__triton_poi_fused_clone_permute_2.json b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/__grp__triton_poi_fused_clone_permute_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..903c72293c624aecc79f6067774535d1dabfb681
--- /dev/null
+++ b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/__grp__triton_poi_fused_clone_permute_2.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused_clone_permute_2.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.source", "triton_poi_fused_clone_permute_2.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.ttir", "triton_poi_fused_clone_permute_2.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.ttgir", "triton_poi_fused_clone_permute_2.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.llir", "triton_poi_fused_clone_permute_2.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.ptx", "triton_poi_fused_clone_permute_2.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.cubin", "triton_poi_fused_clone_permute_2.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.json"}}
\ No newline at end of file
diff --git a/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.cubin b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..2437c4b29a514d6b45a8e6011640ad0e2c299a56
Binary files /dev/null and b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.cubin differ
diff --git a/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.json b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..4145f3013225748d5438e23ca6d7a9884e6c61d5
--- /dev/null
+++ b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.json
@@ -0,0 +1 @@
+{"hash": "59acd44dc9397a3ccb49e49a906a22d576d15ba8ef13c528fff358a0f5f3f39f", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_clone_permute_2"}
\ No newline at end of file
diff --git a/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.llir b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.llir
new file mode 100644
index 0000000000000000000000000000000000000000..caaf59d1e6aefc0aea76717363909c38e62a6758
--- /dev/null
+++ b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.llir
@@ -0,0 +1,71 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused_clone_permute_2(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 {
+  %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %7 = shl i32 %6, 10, !dbg !8
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %9 = shl nuw nsw i32 %8, 3, !dbg !9
+  %10 = and i32 %9, 1016, !dbg !9
+  %11 = or disjoint i32 %10, %7, !dbg !10
+  %12 = sdiv i32 %11, 128, !dbg !11
+  %13 = mul i32 %12, 128, !dbg !12
+  %.decomposed = sub i32 %11, %13, !dbg !12
+  %14 = srem i32 %12, 32, !dbg !13
+  %15 = sdiv i32 %11, 4096, !dbg !14
+  %16 = shl nsw i32 %15, 7, !dbg !15
+  %17 = add nsw i32 %16, %.decomposed, !dbg !16
+  %18 = mul nsw i32 %14, 294912, !dbg !17
+  %19 = add nsw i32 %17, %18, !dbg !18
+  %20 = sext i32 %19 to i64, !dbg !19
+  %21 = getelementptr bfloat, ptr addrspace(1) %0, i64 %20, !dbg !19
+  %22 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %21) #2, !dbg !20
+  %23 = extractvalue { i32, i32, i32, i32 } %22, 0, !dbg !20
+  %24 = extractvalue { i32, i32, i32, i32 } %22, 1, !dbg !20
+  %25 = extractvalue { i32, i32, i32, i32 } %22, 2, !dbg !20
+  %26 = extractvalue { i32, i32, i32, i32 } %22, 3, !dbg !20
+  %27 = sext i32 %11 to i64, !dbg !21
+  %28 = getelementptr bfloat, ptr addrspace(1) %1, i64 %27, !dbg !21
+  tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %23, i32 %24, i32 %25, i32 %26, ptr addrspace(1) %28) #2, !dbg !22
+  ret void, !dbg !23
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused_clone_permute_2", linkageName: "triton_poi_fused_clone_permute_2", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 20, column: 28, scope: !4)
+!8 = !DILocation(line: 20, column: 33, scope: !4)
+!9 = !DILocation(line: 21, column: 36, scope: !4)
+!10 = !DILocation(line: 21, column: 23, scope: !4)
+!11 = !DILocation(line: 24, column: 21, scope: !4)
+!12 = !DILocation(line: 23, column: 19, scope: !4)
+!13 = !DILocation(line: 24, column: 28, scope: !4)
+!14 = !DILocation(line: 25, column: 19, scope: !4)
+!15 = !DILocation(line: 27, column: 39, scope: !4)
+!16 = !DILocation(line: 27, column: 35, scope: !4)
+!17 = !DILocation(line: 27, column: 51, scope: !4)
+!18 = !DILocation(line: 27, column: 44, scope: !4)
+!19 = !DILocation(line: 27, column: 30, scope: !4)
+!20 = !DILocation(line: 27, column: 56, scope: !4)
+!21 = !DILocation(line: 28, column: 25, scope: !4)
+!22 = !DILocation(line: 28, column: 36, scope: !4)
+!23 = !DILocation(line: 28, column: 4, scope: !4)
diff --git a/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.ptx b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..d7aee7940e0b44591a40af1b2d2c6128c743133f
--- /dev/null
+++ b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.ptx
@@ -0,0 +1,327 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused_clone_permute_2 // -- Begin function triton_poi_fused_clone_permute_2
+                                        // @triton_poi_fused_clone_permute_2
+.visible .entry triton_poi_fused_clone_permute_2(
+	.param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_2_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_2_param_1,
+	.param .u32 triton_poi_fused_clone_permute_2_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_2_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_2_param_4
+)
+.reqntid 128
+{
+	.reg .b32 	%r<27>;
+	.reg .b64 	%rd<5>;
+	.loc	1 18 0                          // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd3, [triton_poi_fused_clone_permute_2_param_0];
+	ld.param.b64 	%rd4, [triton_poi_fused_clone_permute_2_param_1];
+$L__tmp0:
+	.loc	1 20 28                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:20:28
+	mov.u32 	%r5, %ctaid.x;
+	.loc	1 20 33                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:20:33
+	shl.b32 	%r6, %r5, 10;
+	.loc	1 21 36                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:21:36
+	mov.u32 	%r7, %tid.x;
+	shl.b32 	%r8, %r7, 3;
+	and.b32 	%r9, %r8, 1016;
+	.loc	1 21 23                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:21:23
+	or.b32 	%r10, %r9, %r6;
+	.loc	1 24 21                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:24:21
+	bfe.s32 	%r11, %r5, 21, 1;
+	shr.u32 	%r12, %r11, 25;
+	add.s32 	%r13, %r10, %r12;
+	shr.s32 	%r14, %r13, 7;
+	.loc	1 23 19                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:23:19
+	and.b32 	%r15, %r13, -128;
+	sub.s32 	%r16, %r10, %r15;
+	.loc	1 24 28                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:24:28
+	shr.u32 	%r17, %r14, 27;
+	add.s32 	%r18, %r14, %r17;
+	and.b32 	%r19, %r18, 131040;
+	sub.s32 	%r20, %r14, %r19;
+	.loc	1 25 19                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:25:19
+	shr.u32 	%r21, %r11, 20;
+	add.s32 	%r22, %r10, %r21;
+	shr.s32 	%r23, %r22, 12;
+	.loc	1 27 39                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:27:39
+	shl.b32 	%r24, %r23, 7;
+	.loc	1 27 35                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:27:35
+	add.s32 	%r25, %r24, %r16;
+	.loc	1 27 44                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:27:44
+	mad.lo.s32 	%r26, %r20, 294912, %r25;
+	.loc	1 27 30                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:27:30
+	mad.wide.s32 	%rd1, %r26, 2, %rd3;
+	.loc	1 27 56                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:27:56
+	// begin inline asm
+	mov.u32 %r1, 0x0;
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	mov.u32 %r4, 0x0;
+	ld.global.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ];
+	// end inline asm
+	.loc	1 28 25                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:28:25
+	mad.wide.s32 	%rd2, %r10, 2, %rd4;
+	.loc	1 28 36                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:28:36
+	// begin inline asm
+	st.global.v4.b32 [ %rd2 + 0 ], { %r1, %r2, %r3, %r4 };
+	// end inline asm
+	.loc	1 28 4                          // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:28:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 106
+.b8 52
+.b8 55
+.b8 118
+.b8 122
+.b8 50
+.b8 117
+.b8 55
+.b8 105
+.b8 51
+.b8 116
+.b8 104
+.b8 53
+.b8 51
+.b8 99
+.b8 102
+.b8 50
+.b8 101
+.b8 108
+.b8 99
+.b8 53
+.b8 102
+.b8 105
+.b8 121
+.b8 108
+.b8 118
+.b8 121
+.b8 107
+.b8 55
+.b8 111
+.b8 51
+.b8 110
+.b8 105
+.b8 50
+.b8 112
+.b8 110
+.b8 52
+.b8 99
+.b8 50
+.b8 98
+.b8 100
+.b8 100
+.b8 114
+.b8 122
+.b8 113
+.b8 53
+.b8 106
+.b8 110
+.b8 117
+.b8 110
+.b8 113
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 106
+.b8 52
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.source b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.source
new file mode 100644
index 0000000000000000000000000000000000000000..a4f167f87bf44ebdc32b58d5074809003c95ca3b
--- /dev/null
+++ b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.source
@@ -0,0 +1,90 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":18:0)
+#loc21 = loc("in_ptr0"(#loc))
+#loc22 = loc("out_ptr0"(#loc))
+#loc23 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_clone_permute_2(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 9437184 : i32 loc(#loc24)
+    %xoffset = tt.get_program_id x : i32 loc(#loc25)
+    %xoffset_1 = arith.constant 1024 : i32 loc(#loc26)
+    %xoffset_2 = arith.constant 1024 : i32 loc(#loc26)
+    %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc26)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc27)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc28)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc28)
+    %xmask = arith.constant true loc(#loc29)
+    %xmask_6 = arith.constant dense<true> : tensor<1024xi1> loc(#loc29)
+    %x0 = arith.constant 128 : i32 loc(#loc30)
+    %x0_7 = arith.constant 128 : i32 loc(#loc30)
+    %x0_8 = arith.constant dense<128> : tensor<1024xi32> loc(#loc30)
+    %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<1024xi32> loc(#loc30)
+    %x1 = arith.constant 128 : i32 loc(#loc31)
+    %x1_10 = arith.constant 128 : i32 loc(#loc31)
+    %x1_11 = arith.constant dense<128> : tensor<1024xi32> loc(#loc31)
+    %x1_12 = arith.divsi %xindex_5, %x1_11 : tensor<1024xi32> loc(#loc31)
+    %x1_13 = arith.constant 32 : i32 loc(#loc32)
+    %x1_14 = arith.constant 32 : i32 loc(#loc32)
+    %x1_15 = arith.constant dense<32> : tensor<1024xi32> loc(#loc32)
+    %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<1024xi32> loc(#loc32)
+    %x2 = arith.constant 4096 : i32 loc(#loc33)
+    %x2_17 = arith.constant 4096 : i32 loc(#loc33)
+    %x2_18 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc33)
+    %x2_19 = arith.divsi %xindex_5, %x2_18 : tensor<1024xi32> loc(#loc33)
+    %tmp0 = arith.constant 128 : i32 loc(#loc34)
+    %tmp0_20 = arith.constant 128 : i32 loc(#loc34)
+    %tmp0_21 = arith.constant dense<128> : tensor<1024xi32> loc(#loc34)
+    %tmp0_22 = arith.muli %tmp0_21, %x2_19 : tensor<1024xi32> loc(#loc34)
+    %tmp0_23 = arith.addi %x0_9, %tmp0_22 : tensor<1024xi32> loc(#loc35)
+    %tmp0_24 = arith.constant 294912 : i32 loc(#loc36)
+    %tmp0_25 = arith.constant 294912 : i32 loc(#loc36)
+    %tmp0_26 = arith.constant dense<294912> : tensor<1024xi32> loc(#loc36)
+    %tmp0_27 = arith.muli %tmp0_26, %x1_16 : tensor<1024xi32> loc(#loc36)
+    %tmp0_28 = arith.addi %tmp0_23, %tmp0_27 : tensor<1024xi32> loc(#loc37)
+    %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc38)
+    %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc38)
+    %tmp0_31 = tt.load %tmp0_30 : tensor<1024x!tt.ptr<bf16>> loc(#loc39)
+    %tmp0_32 = arith.extf %tmp0_31 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc40)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc18)
+    %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc18)
+    %2 = arith.truncf %tmp0_32 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc19)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>> loc(#loc19)
+    tt.return loc(#loc20)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":22:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":23:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":24:21)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":24:28)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":25:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:39)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:35)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:51)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:44)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:30)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:56)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:65)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:25)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:36)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:4)
+#loc24 = loc("xnumel"(#loc1))
+#loc25 = loc("xoffset"(#loc2))
+#loc26 = loc("xoffset"(#loc3))
+#loc27 = loc("xindex"(#loc4))
+#loc28 = loc("xindex"(#loc5))
+#loc29 = loc("xmask"(#loc6))
+#loc30 = loc("x0"(#loc7))
+#loc31 = loc("x1"(#loc8))
+#loc32 = loc("x1"(#loc9))
+#loc33 = loc("x2"(#loc10))
+#loc34 = loc("tmp0"(#loc11))
+#loc35 = loc("tmp0"(#loc12))
+#loc36 = loc("tmp0"(#loc13))
+#loc37 = loc("tmp0"(#loc14))
+#loc38 = loc("tmp0"(#loc15))
+#loc39 = loc("tmp0"(#loc16))
+#loc40 = loc("tmp0"(#loc17))
diff --git a/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.ttgir b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..a263fc90fc68e8620403d752aaae32960c71c163
--- /dev/null
+++ b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.ttgir
@@ -0,0 +1,66 @@
+#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":18:0)
+#loc19 = loc("in_ptr0"(#loc))
+#loc20 = loc("out_ptr0"(#loc))
+#loc21 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused_clone_permute_2(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<128> : tensor<1024xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<32> : tensor<1024xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<4096> : tensor<1024xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<294912> : tensor<1024xi32, #blocked> loc(#loc1)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc22)
+    %xoffset_3 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc23)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc24)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32, #blocked> loc(#loc25)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32, #blocked> loc(#loc25)
+    %x0 = arith.remsi %xindex_5, %cst : tensor<1024xi32, #blocked> loc(#loc26)
+    %x1 = arith.divsi %xindex_5, %cst : tensor<1024xi32, #blocked> loc(#loc27)
+    %x1_6 = arith.remsi %x1, %cst_0 : tensor<1024xi32, #blocked> loc(#loc28)
+    %x2 = arith.divsi %xindex_5, %cst_1 : tensor<1024xi32, #blocked> loc(#loc29)
+    %tmp0 = arith.muli %x2, %cst : tensor<1024xi32, #blocked> loc(#loc30)
+    %tmp0_7 = arith.addi %x0, %tmp0 : tensor<1024xi32, #blocked> loc(#loc31)
+    %tmp0_8 = arith.muli %x1_6, %cst_2 : tensor<1024xi32, #blocked> loc(#loc32)
+    %tmp0_9 = arith.addi %tmp0_7, %tmp0_8 : tensor<1024xi32, #blocked> loc(#loc33)
+    %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc34)
+    %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc34)
+    %tmp0_12 = tt.load %tmp0_11 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc35)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc16)
+    %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc16)
+    tt.store %1, %tmp0_12 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc17)
+    tt.return loc(#loc18)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":23:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":24:21)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":24:28)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":25:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:39)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:51)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:44)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:56)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:25)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:36)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:4)
+#loc22 = loc("xoffset"(#loc2))
+#loc23 = loc("xoffset"(#loc3))
+#loc24 = loc("xindex"(#loc4))
+#loc25 = loc("xindex"(#loc5))
+#loc26 = loc("x0"(#loc6))
+#loc27 = loc("x1"(#loc7))
+#loc28 = loc("x1"(#loc8))
+#loc29 = loc("x2"(#loc9))
+#loc30 = loc("tmp0"(#loc10))
+#loc31 = loc("tmp0"(#loc11))
+#loc32 = loc("tmp0"(#loc12))
+#loc33 = loc("tmp0"(#loc13))
+#loc34 = loc("tmp0"(#loc14))
+#loc35 = loc("tmp0"(#loc15))
diff --git a/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.ttir b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..91cbecaea485ac121c1028ec64dfaeaab96f36be
--- /dev/null
+++ b/triton/LGWNITOJHF5DZS2J4SNJA2RC2V3NCW5I54J4KKH76NMKB5PT6OPQ/triton_poi_fused_clone_permute_2.ttir
@@ -0,0 +1,65 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":18:0)
+#loc19 = loc("in_ptr0"(#loc))
+#loc20 = loc("out_ptr0"(#loc))
+#loc21 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_clone_permute_2(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %tmp0 = arith.constant dense<294912> : tensor<1024xi32> loc(#loc22)
+    %x2 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc23)
+    %x1 = arith.constant dense<32> : tensor<1024xi32> loc(#loc24)
+    %cst = arith.constant dense<128> : tensor<1024xi32> loc(#loc4)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc4)
+    %xoffset = tt.get_program_id x : i32 loc(#loc25)
+    %xoffset_0 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc26)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc27)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<1024xi32> loc(#loc28)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<1024xi32> loc(#loc28)
+    %x0 = arith.remsi %xindex_2, %cst : tensor<1024xi32> loc(#loc29)
+    %x1_3 = arith.divsi %xindex_2, %cst : tensor<1024xi32> loc(#loc30)
+    %x1_4 = arith.remsi %x1_3, %x1 : tensor<1024xi32> loc(#loc24)
+    %x2_5 = arith.divsi %xindex_2, %x2 : tensor<1024xi32> loc(#loc23)
+    %tmp0_6 = arith.muli %x2_5, %cst : tensor<1024xi32> loc(#loc31)
+    %tmp0_7 = arith.addi %x0, %tmp0_6 : tensor<1024xi32> loc(#loc32)
+    %tmp0_8 = arith.muli %x1_4, %tmp0 : tensor<1024xi32> loc(#loc22)
+    %tmp0_9 = arith.addi %tmp0_7, %tmp0_8 : tensor<1024xi32> loc(#loc33)
+    %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc34)
+    %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc34)
+    %tmp0_12 = tt.load %tmp0_11 : tensor<1024x!tt.ptr<bf16>> loc(#loc35)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc16)
+    %1 = tt.addptr %0, %xindex_2 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc16)
+    tt.store %1, %tmp0_12 : tensor<1024x!tt.ptr<bf16>> loc(#loc17)
+    tt.return loc(#loc18)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:51)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":25:19)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":24:28)
+#loc4 = loc(unknown)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":20:28)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":20:33)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":21:36)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":21:23)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":23:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":24:21)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:39)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:35)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:44)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:56)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:25)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:36)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:4)
+#loc22 = loc("tmp0"(#loc1))
+#loc23 = loc("x2"(#loc2))
+#loc24 = loc("x1"(#loc3))
+#loc25 = loc("xoffset"(#loc5))
+#loc26 = loc("xoffset"(#loc6))
+#loc27 = loc("xindex"(#loc7))
+#loc28 = loc("xindex"(#loc8))
+#loc29 = loc("x0"(#loc9))
+#loc30 = loc("x1"(#loc10))
+#loc31 = loc("tmp0"(#loc11))
+#loc32 = loc("tmp0"(#loc12))
+#loc33 = loc("tmp0"(#loc13))
+#loc34 = loc("tmp0"(#loc14))
+#loc35 = loc("tmp0"(#loc15))
diff --git a/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..569dcdf957ffed86f3be8853d17d08b5fc2b4bd3
--- /dev/null
+++ b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused_add_mul_native_layer_norm_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.source", "triton_red_fused_add_mul_native_layer_norm_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.ttir", "triton_red_fused_add_mul_native_layer_norm_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir", "triton_red_fused_add_mul_native_layer_norm_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.llir", "triton_red_fused_add_mul_native_layer_norm_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.ptx", "triton_red_fused_add_mul_native_layer_norm_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.cubin", "triton_red_fused_add_mul_native_layer_norm_0.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.json"}}
\ No newline at end of file
diff --git a/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.cubin b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..880e547d63b4727dc6ca3e49e07795249ebac9a2
Binary files /dev/null and b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.cubin differ
diff --git a/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.json b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..67bd342c52ef8d3d0c858b23a4ec778f075123a7
--- /dev/null
+++ b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.json
@@ -0,0 +1 @@
+{"hash": "6471e8f3875bf89a78d7edcf8a51a68796b0c540783d96a5f1c61d210b2fb01f", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 192, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_add_mul_native_layer_norm_0"}
\ No newline at end of file
diff --git a/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.llir b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..4d8c7a489efd29644f0f5a29ec68874c4a81affc
--- /dev/null
+++ b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.llir
@@ -0,0 +1,547 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external addrspace(3) global [0 x i8], align 16
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused_add_mul_native_layer_norm_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !5 {
+__nv_rsqrtf.exit:
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8
+  %9 = icmp samesign ult i32 %8, 2048, !dbg !9
+  %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %11 = and i32 %10, 511, !dbg !10
+  %12 = and i32 %10, 31, !dbg !10
+  %13 = lshr i32 %11, 5, !dbg !10
+  %14 = shl nuw nsw i32 %10, 3, !dbg !10
+  %15 = and i32 %14, 4088, !dbg !10
+  %16 = shl i32 %8, 12, !dbg !11
+  %17 = or disjoint i32 %15, %16, !dbg !12
+  %18 = sext i32 %17 to i64, !dbg !13
+  %19 = getelementptr bfloat, ptr addrspace(1) %0, i64 %18, !dbg !13
+  %20 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14
+  %21 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %19, i64 %20, i1 %9) #6, !dbg !14
+  %22 = extractvalue { i32, i32, i32, i32 } %21, 0, !dbg !14
+  %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !14
+  %24 = extractvalue { i32, i32, i32, i32 } %21, 1, !dbg !14
+  %25 = bitcast i32 %24 to <2 x bfloat>, !dbg !14
+  %26 = extractvalue { i32, i32, i32, i32 } %21, 2, !dbg !14
+  %27 = bitcast i32 %26 to <2 x bfloat>, !dbg !14
+  %28 = extractvalue { i32, i32, i32, i32 } %21, 3, !dbg !14
+  %29 = bitcast i32 %28 to <2 x bfloat>, !dbg !14
+  %30 = extractelement <2 x bfloat> %23, i64 0, !dbg !14
+  %31 = extractelement <2 x bfloat> %23, i64 1, !dbg !14
+  %32 = extractelement <2 x bfloat> %25, i64 0, !dbg !14
+  %33 = extractelement <2 x bfloat> %25, i64 1, !dbg !14
+  %34 = extractelement <2 x bfloat> %27, i64 0, !dbg !14
+  %35 = extractelement <2 x bfloat> %27, i64 1, !dbg !14
+  %36 = extractelement <2 x bfloat> %29, i64 0, !dbg !14
+  %37 = extractelement <2 x bfloat> %29, i64 1, !dbg !14
+  %38 = fpext bfloat %30 to float, !dbg !15
+  %39 = fpext bfloat %31 to float, !dbg !15
+  %40 = fpext bfloat %32 to float, !dbg !15
+  %41 = fpext bfloat %33 to float, !dbg !15
+  %42 = fpext bfloat %34 to float, !dbg !15
+  %43 = fpext bfloat %35 to float, !dbg !15
+  %44 = fpext bfloat %36 to float, !dbg !15
+  %45 = fpext bfloat %37 to float, !dbg !15
+  %46 = select i1 %9, float %38, float 0.000000e+00, !dbg !16
+  %47 = select i1 %9, float %39, float 0.000000e+00, !dbg !16
+  %48 = select i1 %9, float %40, float 0.000000e+00, !dbg !16
+  %49 = select i1 %9, float %41, float 0.000000e+00, !dbg !16
+  %50 = select i1 %9, float %42, float 0.000000e+00, !dbg !16
+  %51 = select i1 %9, float %43, float 0.000000e+00, !dbg !16
+  %52 = select i1 %9, float %44, float 0.000000e+00, !dbg !16
+  %53 = select i1 %9, float %45, float 0.000000e+00, !dbg !16
+  %54 = select i1 %9, float 1.000000e+00, float 0.000000e+00, !dbg !17
+  %55 = fsub float %47, %46, !dbg !18
+  %56 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !24
+  %57 = fcmp oeq float %56, 0.000000e+00, !dbg !25
+  %58 = tail call float @llvm.nvvm.div.full(float %54, float %56), !dbg !26
+  %59 = select i1 %57, float 0.000000e+00, float %58, !dbg !27
+  %60 = fmul float %59, %55, !dbg !28
+  %61 = fadd float %46, %60, !dbg !29
+  %62 = fmul float %55, %55, !dbg !30
+  %63 = fmul float %54, %62, !dbg !31
+  %64 = fmul float %59, %63, !dbg !32
+  %65 = fadd float %64, 0.000000e+00, !dbg !33
+  %66 = fsub float %48, %61, !dbg !18
+  %67 = select i1 %9, float 3.000000e+00, float 0.000000e+00, !dbg !24
+  %68 = fcmp oeq float %67, 0.000000e+00, !dbg !25
+  %69 = tail call float @llvm.nvvm.div.full(float %54, float %67), !dbg !26
+  %70 = select i1 %68, float 0.000000e+00, float %69, !dbg !27
+  %71 = fmul float %70, %66, !dbg !28
+  %72 = fadd float %61, %71, !dbg !29
+  %73 = fmul float %66, %66, !dbg !30
+  %74 = fmul float %56, %73, !dbg !31
+  %75 = fmul float %70, %74, !dbg !32
+  %76 = fadd float %65, %75, !dbg !33
+  %77 = fsub float %49, %72, !dbg !18
+  %78 = select i1 %9, float 4.000000e+00, float 0.000000e+00, !dbg !24
+  %79 = fcmp oeq float %78, 0.000000e+00, !dbg !25
+  %80 = tail call float @llvm.nvvm.div.full(float %54, float %78), !dbg !26
+  %81 = select i1 %79, float 0.000000e+00, float %80, !dbg !27
+  %82 = fmul float %81, %77, !dbg !28
+  %83 = fadd float %72, %82, !dbg !29
+  %84 = fmul float %77, %77, !dbg !30
+  %85 = fmul float %67, %84, !dbg !31
+  %86 = fmul float %81, %85, !dbg !32
+  %87 = fadd float %76, %86, !dbg !33
+  %88 = fsub float %50, %83, !dbg !18
+  %89 = select i1 %9, float 5.000000e+00, float 0.000000e+00, !dbg !24
+  %90 = fcmp oeq float %89, 0.000000e+00, !dbg !25
+  %91 = tail call float @llvm.nvvm.div.full(float %54, float %89), !dbg !26
+  %92 = select i1 %90, float 0.000000e+00, float %91, !dbg !27
+  %93 = fmul float %92, %88, !dbg !28
+  %94 = fadd float %83, %93, !dbg !29
+  %95 = fmul float %88, %88, !dbg !30
+  %96 = fmul float %78, %95, !dbg !31
+  %97 = fmul float %92, %96, !dbg !32
+  %98 = fadd float %87, %97, !dbg !33
+  %99 = fsub float %51, %94, !dbg !18
+  %100 = select i1 %9, float 6.000000e+00, float 0.000000e+00, !dbg !24
+  %101 = fcmp oeq float %100, 0.000000e+00, !dbg !25
+  %102 = tail call float @llvm.nvvm.div.full(float %54, float %100), !dbg !26
+  %103 = select i1 %101, float 0.000000e+00, float %102, !dbg !27
+  %104 = fmul float %103, %99, !dbg !28
+  %105 = fadd float %94, %104, !dbg !29
+  %106 = fmul float %99, %99, !dbg !30
+  %107 = fmul float %89, %106, !dbg !31
+  %108 = fmul float %103, %107, !dbg !32
+  %109 = fadd float %98, %108, !dbg !33
+  %110 = fsub float %52, %105, !dbg !18
+  %111 = select i1 %9, float 7.000000e+00, float 0.000000e+00, !dbg !24
+  %112 = fcmp oeq float %111, 0.000000e+00, !dbg !25
+  %113 = tail call float @llvm.nvvm.div.full(float %54, float %111), !dbg !26
+  %114 = select i1 %112, float 0.000000e+00, float %113, !dbg !27
+  %115 = fmul float %114, %110, !dbg !28
+  %116 = fadd float %105, %115, !dbg !29
+  %117 = fmul float %110, %110, !dbg !30
+  %118 = fmul float %100, %117, !dbg !31
+  %119 = fmul float %114, %118, !dbg !32
+  %120 = fadd float %109, %119, !dbg !33
+  %121 = fsub float %53, %116, !dbg !18
+  %122 = select i1 %9, float 8.000000e+00, float 0.000000e+00, !dbg !24
+  %123 = fcmp oeq float %122, 0.000000e+00, !dbg !25
+  %124 = tail call float @llvm.nvvm.div.full(float %54, float %122), !dbg !26
+  %125 = select i1 %123, float 0.000000e+00, float %124, !dbg !27
+  %126 = fmul float %125, %121, !dbg !28
+  %127 = fadd float %116, %126, !dbg !29
+  %128 = fmul float %121, %121, !dbg !30
+  %129 = fmul float %111, %128, !dbg !31
+  %130 = fmul float %125, %129, !dbg !32
+  %131 = fadd float %120, %130, !dbg !33
+  %132 = bitcast float %127 to i32, !dbg !21
+  %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 16, i32 31), !dbg !21
+  %134 = bitcast i32 %133 to float, !dbg !21
+  %135 = bitcast float %131 to i32, !dbg !21
+  %136 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %135, i32 16, i32 31), !dbg !21
+  %137 = bitcast i32 %136 to float, !dbg !21
+  %138 = bitcast float %122 to i32, !dbg !21
+  %139 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %138, i32 16, i32 31), !dbg !21
+  %140 = bitcast i32 %139 to float, !dbg !21
+  %141 = fsub float %134, %127, !dbg !18
+  %142 = fadd float %122, %140, !dbg !24
+  %143 = fcmp oeq float %142, 0.000000e+00, !dbg !25
+  %144 = tail call float @llvm.nvvm.div.full(float %140, float %142), !dbg !26
+  %145 = select i1 %143, float 0.000000e+00, float %144, !dbg !27
+  %146 = fmul float %145, %141, !dbg !28
+  %147 = fadd float %127, %146, !dbg !29
+  %148 = fadd float %131, %137, !dbg !34
+  %149 = fmul float %141, %141, !dbg !30
+  %150 = fmul float %122, %149, !dbg !31
+  %151 = fmul float %145, %150, !dbg !32
+  %152 = fadd float %148, %151, !dbg !33
+  %153 = bitcast float %147 to i32, !dbg !21
+  %154 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 8, i32 31), !dbg !21
+  %155 = bitcast i32 %154 to float, !dbg !21
+  %156 = bitcast float %152 to i32, !dbg !21
+  %157 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %156, i32 8, i32 31), !dbg !21
+  %158 = bitcast i32 %157 to float, !dbg !21
+  %159 = bitcast float %142 to i32, !dbg !21
+  %160 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %159, i32 8, i32 31), !dbg !21
+  %161 = bitcast i32 %160 to float, !dbg !21
+  %162 = fsub float %155, %147, !dbg !18
+  %163 = fadd float %142, %161, !dbg !24
+  %164 = fcmp oeq float %163, 0.000000e+00, !dbg !25
+  %165 = tail call float @llvm.nvvm.div.full(float %161, float %163), !dbg !26
+  %166 = select i1 %164, float 0.000000e+00, float %165, !dbg !27
+  %167 = fmul float %166, %162, !dbg !28
+  %168 = fadd float %147, %167, !dbg !29
+  %169 = fadd float %152, %158, !dbg !34
+  %170 = fmul float %162, %162, !dbg !30
+  %171 = fmul float %142, %170, !dbg !31
+  %172 = fmul float %166, %171, !dbg !32
+  %173 = fadd float %169, %172, !dbg !33
+  %174 = bitcast float %168 to i32, !dbg !21
+  %175 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %174, i32 4, i32 31), !dbg !21
+  %176 = bitcast i32 %175 to float, !dbg !21
+  %177 = bitcast float %173 to i32, !dbg !21
+  %178 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %177, i32 4, i32 31), !dbg !21
+  %179 = bitcast i32 %178 to float, !dbg !21
+  %180 = bitcast float %163 to i32, !dbg !21
+  %181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %180, i32 4, i32 31), !dbg !21
+  %182 = bitcast i32 %181 to float, !dbg !21
+  %183 = fsub float %176, %168, !dbg !18
+  %184 = fadd float %163, %182, !dbg !24
+  %185 = fcmp oeq float %184, 0.000000e+00, !dbg !25
+  %186 = tail call float @llvm.nvvm.div.full(float %182, float %184), !dbg !26
+  %187 = select i1 %185, float 0.000000e+00, float %186, !dbg !27
+  %188 = fmul float %187, %183, !dbg !28
+  %189 = fadd float %168, %188, !dbg !29
+  %190 = fadd float %173, %179, !dbg !34
+  %191 = fmul float %183, %183, !dbg !30
+  %192 = fmul float %163, %191, !dbg !31
+  %193 = fmul float %187, %192, !dbg !32
+  %194 = fadd float %190, %193, !dbg !33
+  %195 = bitcast float %189 to i32, !dbg !21
+  %196 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %195, i32 2, i32 31), !dbg !21
+  %197 = bitcast i32 %196 to float, !dbg !21
+  %198 = bitcast float %194 to i32, !dbg !21
+  %199 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %198, i32 2, i32 31), !dbg !21
+  %200 = bitcast i32 %199 to float, !dbg !21
+  %201 = bitcast float %184 to i32, !dbg !21
+  %202 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %201, i32 2, i32 31), !dbg !21
+  %203 = bitcast i32 %202 to float, !dbg !21
+  %204 = fsub float %197, %189, !dbg !18
+  %205 = fadd float %184, %203, !dbg !24
+  %206 = fcmp oeq float %205, 0.000000e+00, !dbg !25
+  %207 = tail call float @llvm.nvvm.div.full(float %203, float %205), !dbg !26
+  %208 = select i1 %206, float 0.000000e+00, float %207, !dbg !27
+  %209 = fmul float %208, %204, !dbg !28
+  %210 = fadd float %189, %209, !dbg !29
+  %211 = fadd float %194, %200, !dbg !34
+  %212 = fmul float %204, %204, !dbg !30
+  %213 = fmul float %184, %212, !dbg !31
+  %214 = fmul float %208, %213, !dbg !32
+  %215 = fadd float %211, %214, !dbg !33
+  %216 = bitcast float %210 to i32, !dbg !21
+  %217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 1, i32 31), !dbg !21
+  %218 = bitcast i32 %217 to float, !dbg !21
+  %219 = bitcast float %215 to i32, !dbg !21
+  %220 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %219, i32 1, i32 31), !dbg !21
+  %221 = bitcast i32 %220 to float, !dbg !21
+  %222 = bitcast float %205 to i32, !dbg !21
+  %223 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %222, i32 1, i32 31), !dbg !21
+  %224 = bitcast i32 %223 to float, !dbg !21
+  %225 = fsub float %218, %210, !dbg !18
+  %226 = fadd float %205, %224, !dbg !24
+  %227 = fcmp oeq float %226, 0.000000e+00, !dbg !25
+  %228 = tail call float @llvm.nvvm.div.full(float %224, float %226), !dbg !26
+  %229 = select i1 %227, float 0.000000e+00, float %228, !dbg !27
+  %230 = fmul float %229, %225, !dbg !28
+  %231 = fadd float %210, %230, !dbg !29
+  %232 = fadd float %215, %221, !dbg !34
+  %233 = fmul float %225, %225, !dbg !30
+  %234 = fmul float %205, %233, !dbg !31
+  %235 = fmul float %229, %234, !dbg !32
+  %236 = fadd float %232, %235, !dbg !33
+  %237 = icmp eq i32 %12, 0, !dbg !21
+  %238 = getelementptr float, ptr addrspace(3) @global_smem, i32 %13, !dbg !21
+  %239 = bitcast float %231 to <1 x i32>, !dbg !21
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %238, <1 x i32> %239, i1 %237) #6, !dbg !21
+  %240 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %13, !dbg !21
+  %241 = bitcast float %236 to <1 x i32>, !dbg !21
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %240, <1 x i32> %241, i1 %237) #6, !dbg !21
+  %242 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %13, !dbg !21
+  %243 = bitcast float %226 to <1 x i32>, !dbg !21
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %242, <1 x i32> %243, i1 %237) #6, !dbg !21
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21
+  %244 = icmp samesign ult i32 %11, 16, !dbg !21
+  %245 = getelementptr float, ptr addrspace(3) @global_smem, i32 %11, !dbg !21
+  %246 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %245, i1 %244) #6, !dbg !21
+  %247 = bitcast i32 %246 to float, !dbg !21
+  %248 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %11, !dbg !21
+  %249 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %248, i1 %244) #6, !dbg !21
+  %250 = bitcast i32 %249 to float, !dbg !21
+  %251 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %11, !dbg !21
+  %252 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %251, i1 %244) #6, !dbg !21
+  %253 = bitcast i32 %252 to float, !dbg !21
+  %254 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %246, i32 8, i32 31), !dbg !21
+  %255 = bitcast i32 %254 to float, !dbg !21
+  %256 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %249, i32 8, i32 31), !dbg !21
+  %257 = bitcast i32 %256 to float, !dbg !21
+  %258 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %252, i32 8, i32 31), !dbg !21
+  %259 = bitcast i32 %258 to float, !dbg !21
+  %260 = fsub float %255, %247, !dbg !18
+  %261 = fadd float %253, %259, !dbg !24
+  %262 = fcmp oeq float %261, 0.000000e+00, !dbg !25
+  %263 = tail call float @llvm.nvvm.div.full(float %259, float %261), !dbg !26
+  %264 = select i1 %262, float 0.000000e+00, float %263, !dbg !27
+  %265 = fmul float %260, %264, !dbg !28
+  %266 = fadd float %265, %247, !dbg !29
+  %267 = fadd float %250, %257, !dbg !34
+  %268 = fmul float %260, %260, !dbg !30
+  %269 = fmul float %268, %253, !dbg !31
+  %270 = fmul float %269, %264, !dbg !32
+  %271 = fadd float %267, %270, !dbg !33
+  %272 = bitcast float %266 to i32, !dbg !21
+  %273 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 4, i32 31), !dbg !21
+  %274 = bitcast i32 %273 to float, !dbg !21
+  %275 = bitcast float %271 to i32, !dbg !21
+  %276 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %275, i32 4, i32 31), !dbg !21
+  %277 = bitcast i32 %276 to float, !dbg !21
+  %278 = bitcast float %261 to i32, !dbg !21
+  %279 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %278, i32 4, i32 31), !dbg !21
+  %280 = bitcast i32 %279 to float, !dbg !21
+  %281 = fsub float %274, %266, !dbg !18
+  %282 = fadd float %261, %280, !dbg !24
+  %283 = fcmp oeq float %282, 0.000000e+00, !dbg !25
+  %284 = tail call float @llvm.nvvm.div.full(float %280, float %282), !dbg !26
+  %285 = select i1 %283, float 0.000000e+00, float %284, !dbg !27
+  %286 = fmul float %281, %285, !dbg !28
+  %287 = fadd float %266, %286, !dbg !29
+  %288 = fadd float %271, %277, !dbg !34
+  %289 = fmul float %281, %281, !dbg !30
+  %290 = fmul float %261, %289, !dbg !31
+  %291 = fmul float %285, %290, !dbg !32
+  %292 = fadd float %288, %291, !dbg !33
+  %293 = bitcast float %287 to i32, !dbg !21
+  %294 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %293, i32 2, i32 31), !dbg !21
+  %295 = bitcast i32 %294 to float, !dbg !21
+  %296 = bitcast float %292 to i32, !dbg !21
+  %297 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %296, i32 2, i32 31), !dbg !21
+  %298 = bitcast i32 %297 to float, !dbg !21
+  %299 = bitcast float %282 to i32, !dbg !21
+  %300 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %299, i32 2, i32 31), !dbg !21
+  %301 = bitcast i32 %300 to float, !dbg !21
+  %302 = fsub float %295, %287, !dbg !18
+  %303 = fadd float %282, %301, !dbg !24
+  %304 = fcmp oeq float %303, 0.000000e+00, !dbg !25
+  %305 = tail call float @llvm.nvvm.div.full(float %301, float %303), !dbg !26
+  %306 = select i1 %304, float 0.000000e+00, float %305, !dbg !27
+  %307 = fmul float %302, %306, !dbg !28
+  %308 = fadd float %287, %307, !dbg !29
+  %309 = fadd float %292, %298, !dbg !34
+  %310 = fmul float %302, %302, !dbg !30
+  %311 = fmul float %282, %310, !dbg !31
+  %312 = fmul float %306, %311, !dbg !32
+  %313 = fadd float %309, %312, !dbg !33
+  %314 = bitcast float %308 to i32, !dbg !21
+  %315 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %314, i32 1, i32 31), !dbg !21
+  %316 = bitcast i32 %315 to float, !dbg !21
+  %317 = bitcast float %313 to i32, !dbg !21
+  %318 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %317, i32 1, i32 31), !dbg !21
+  %319 = bitcast i32 %318 to float, !dbg !21
+  %320 = bitcast float %303 to i32, !dbg !21
+  %321 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %320, i32 1, i32 31), !dbg !21
+  %322 = bitcast i32 %321 to float, !dbg !21
+  %323 = fsub float %316, %308, !dbg !18
+  %324 = fadd float %303, %322, !dbg !24
+  %325 = fcmp oeq float %324, 0.000000e+00, !dbg !25
+  %326 = tail call float @llvm.nvvm.div.full(float %322, float %324), !dbg !26
+  %327 = select i1 %325, float 0.000000e+00, float %326, !dbg !27
+  %328 = fmul float %323, %327, !dbg !28
+  %329 = fadd float %308, %328, !dbg !29
+  %330 = fadd float %313, %319, !dbg !34
+  %331 = fmul float %323, %323, !dbg !30
+  %332 = fmul float %303, %331, !dbg !31
+  %333 = fmul float %327, %332, !dbg !32
+  %334 = fadd float %330, %333, !dbg !33
+  %335 = and i32 %10, 15, !dbg !21
+  %336 = icmp eq i32 %335, 0, !dbg !21
+  %337 = and i1 %244, %336, !dbg !21
+  %338 = bitcast float %329 to <1 x i32>, !dbg !21
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %245, <1 x i32> %338, i1 %337) #6, !dbg !21
+  %339 = bitcast float %334 to <1 x i32>, !dbg !21
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %248, <1 x i32> %339, i1 %337) #6, !dbg !21
+  %340 = bitcast float %324 to <1 x i32>, !dbg !21
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %251, <1 x i32> %340, i1 %337) #6, !dbg !21
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21
+  %341 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !21
+  %342 = load float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !21
+  %343 = zext nneg i32 %15 to i64, !dbg !35
+  %344 = getelementptr bfloat, ptr addrspace(1) %1, i64 %343, !dbg !35
+  %345 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !36
+  %346 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %344, i64 %345, i1 true) #6, !dbg !36
+  %347 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !37
+  %348 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %19, i64 %347, i1 %9) #6, !dbg !37
+  %349 = getelementptr bfloat, ptr addrspace(1) %2, i64 %343, !dbg !38
+  %350 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !39
+  %351 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %349, i64 %350, i1 true) #6, !dbg !39
+  %352 = tail call float @llvm.nvvm.div.full(float %342, float 4.096000e+03), !dbg !40
+  %353 = fadd float %352, 0x3EB0C6F7A0000000, !dbg !41
+  %354 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
+  %355 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
+  %356 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
+  %357 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
+  %358 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
+  %359 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
+  %360 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
+  %361 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
+  %.not.i19 = icmp eq i32 %361, 0, !dbg !42
+  br i1 %.not.i19, label %364, label %362, !dbg !42
+
+362:                                              ; preds = %__nv_rsqrtf.exit
+  %363 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %353), !dbg !42
+  br label %__nv_rsqrtf.exit21, !dbg !42
+
+364:                                              ; preds = %__nv_rsqrtf.exit
+  %365 = tail call float @llvm.nvvm.rsqrt.approx.f(float %353), !dbg !42
+  br label %__nv_rsqrtf.exit21, !dbg !42
+
+__nv_rsqrtf.exit21:                               ; preds = %362, %364
+  %.0.i20 = phi float [ %363, %362 ], [ %365, %364 ], !dbg !42
+  %366 = extractvalue { i32, i32, i32, i32 } %348, 3, !dbg !37
+  %367 = bitcast i32 %366 to <2 x bfloat>, !dbg !37
+  %368 = extractvalue { i32, i32, i32, i32 } %348, 2, !dbg !37
+  %369 = bitcast i32 %368 to <2 x bfloat>, !dbg !37
+  %370 = extractvalue { i32, i32, i32, i32 } %348, 1, !dbg !37
+  %371 = bitcast i32 %370 to <2 x bfloat>, !dbg !37
+  %372 = extractvalue { i32, i32, i32, i32 } %348, 0, !dbg !37
+  %373 = bitcast i32 %372 to <2 x bfloat>, !dbg !37
+  %374 = extractvalue { i32, i32, i32, i32 } %346, 3, !dbg !36
+  %375 = bitcast i32 %374 to <2 x bfloat>, !dbg !36
+  %376 = extractvalue { i32, i32, i32, i32 } %346, 2, !dbg !36
+  %377 = bitcast i32 %376 to <2 x bfloat>, !dbg !36
+  %378 = extractvalue { i32, i32, i32, i32 } %346, 1, !dbg !36
+  %379 = bitcast i32 %378 to <2 x bfloat>, !dbg !36
+  %380 = extractvalue { i32, i32, i32, i32 } %346, 0, !dbg !36
+  %381 = bitcast i32 %380 to <2 x bfloat>, !dbg !36
+  %382 = extractvalue { i32, i32, i32, i32 } %351, 3, !dbg !39
+  %383 = bitcast i32 %382 to <2 x bfloat>, !dbg !39
+  %384 = extractvalue { i32, i32, i32, i32 } %351, 2, !dbg !39
+  %385 = bitcast i32 %384 to <2 x bfloat>, !dbg !39
+  %386 = extractvalue { i32, i32, i32, i32 } %351, 1, !dbg !39
+  %387 = bitcast i32 %386 to <2 x bfloat>, !dbg !39
+  %388 = extractvalue { i32, i32, i32, i32 } %351, 0, !dbg !39
+  %389 = bitcast i32 %388 to <2 x bfloat>, !dbg !39
+  %390 = getelementptr bfloat, ptr addrspace(1) %3, i64 %18, !dbg !43
+  %391 = fpext <2 x bfloat> %373 to <2 x float>, !dbg !44
+  %392 = insertelement <2 x float> poison, float %341, i64 0, !dbg !45
+  %393 = shufflevector <2 x float> %392, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !45
+  %394 = fsub <2 x float> %391, %393, !dbg !45
+  %395 = fpext <2 x bfloat> %381 to <2 x float>, !dbg !46
+  %396 = fadd <2 x float> %395, splat (float 1.000000e+00), !dbg !47
+  %397 = fpext <2 x bfloat> %389 to <2 x float>, !dbg !48
+  %398 = insertelement <2 x float> poison, float %.0.i20, i64 0, !dbg !49
+  %399 = shufflevector <2 x float> %398, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !49
+  %400 = fmul <2 x float> %394, %399, !dbg !49
+  %401 = fmul <2 x float> %396, %400, !dbg !50
+  %402 = fadd <2 x float> %401, %397, !dbg !51
+  %403 = fptrunc <2 x float> %402 to <2 x bfloat>, !dbg !52
+  %404 = fpext <2 x bfloat> %371 to <2 x float>, !dbg !44
+  %405 = fsub <2 x float> %404, %393, !dbg !45
+  %406 = fpext <2 x bfloat> %379 to <2 x float>, !dbg !46
+  %407 = fadd <2 x float> %406, splat (float 1.000000e+00), !dbg !47
+  %408 = fpext <2 x bfloat> %387 to <2 x float>, !dbg !48
+  %409 = fmul <2 x float> %405, %399, !dbg !49
+  %410 = fmul <2 x float> %407, %409, !dbg !50
+  %411 = fadd <2 x float> %410, %408, !dbg !51
+  %412 = fptrunc <2 x float> %411 to <2 x bfloat>, !dbg !52
+  %413 = fpext <2 x bfloat> %369 to <2 x float>, !dbg !44
+  %414 = fsub <2 x float> %413, %393, !dbg !45
+  %415 = fpext <2 x bfloat> %377 to <2 x float>, !dbg !46
+  %416 = fadd <2 x float> %415, splat (float 1.000000e+00), !dbg !47
+  %417 = fpext <2 x bfloat> %385 to <2 x float>, !dbg !48
+  %418 = fmul <2 x float> %414, %399, !dbg !49
+  %419 = fmul <2 x float> %416, %418, !dbg !50
+  %420 = fadd <2 x float> %419, %417, !dbg !51
+  %421 = fptrunc <2 x float> %420 to <2 x bfloat>, !dbg !52
+  %422 = fpext <2 x bfloat> %367 to <2 x float>, !dbg !44
+  %423 = fsub <2 x float> %422, %393, !dbg !45
+  %424 = fpext <2 x bfloat> %375 to <2 x float>, !dbg !46
+  %425 = fadd <2 x float> %424, splat (float 1.000000e+00), !dbg !47
+  %426 = fpext <2 x bfloat> %383 to <2 x float>, !dbg !48
+  %427 = fmul <2 x float> %423, %399, !dbg !49
+  %428 = fmul <2 x float> %425, %427, !dbg !50
+  %429 = fadd <2 x float> %428, %426, !dbg !51
+  %430 = fptrunc <2 x float> %429 to <2 x bfloat>, !dbg !52
+  %431 = bitcast <2 x bfloat> %403 to i32, !dbg !52
+  %432 = bitcast <2 x bfloat> %412 to i32, !dbg !52
+  %433 = bitcast <2 x bfloat> %421 to i32, !dbg !52
+  %434 = bitcast <2 x bfloat> %430 to i32, !dbg !52
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %431, i32 %432, i32 %433, i32 %434, ptr addrspace(1) %390, i1 %9) #6, !dbg !52
+  ret void, !dbg !53
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #2
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #2
+
+attributes #0 = { nounwind "nvvm.reqntid"="512" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #4 = { convergent nocallback nounwind }
+attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!5 = distinct !DISubprogram(name: "triton_red_fused_add_mul_native_layer_norm_0", linkageName: "triton_red_fused_add_mul_native_layer_norm_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 23, column: 28, scope: !5)
+!9 = !DILocation(line: 25, column: 21, scope: !5)
+!10 = !DILocation(line: 26, column: 37, scope: !5)
+!11 = !DILocation(line: 38, column: 46, scope: !5)
+!12 = !DILocation(line: 38, column: 41, scope: !5)
+!13 = !DILocation(line: 38, column: 34, scope: !5)
+!14 = !DILocation(line: 38, column: 51, scope: !5)
+!15 = !DILocation(line: 38, column: 112, scope: !5)
+!16 = !DILocation(line: 44, column: 62, scope: !5)
+!17 = !DILocation(line: 46, column: 66, scope: !5)
+!18 = !DILocation(line: 231, column: 21, scope: !19, inlinedAt: !21)
+!19 = distinct !DILexicalBlockFile(scope: !5, file: !20, discriminator: 0)
+!20 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime")
+!21 = !DILocation(line: 243, column: 46, scope: !19, inlinedAt: !22)
+!22 = !DILocation(line: 47, column: 79, scope: !23)
+!23 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0)
+!24 = !DILocation(line: 232, column: 28, scope: !19, inlinedAt: !21)
+!25 = !DILocation(line: 233, column: 39, scope: !19, inlinedAt: !21)
+!26 = !DILocation(line: 233, column: 60, scope: !19, inlinedAt: !21)
+!27 = !DILocation(line: 233, column: 49, scope: !19, inlinedAt: !21)
+!28 = !DILocation(line: 235, column: 25, scope: !19, inlinedAt: !21)
+!29 = !DILocation(line: 235, column: 17, scope: !19, inlinedAt: !21)
+!30 = !DILocation(line: 236, column: 30, scope: !19, inlinedAt: !21)
+!31 = !DILocation(line: 236, column: 38, scope: !19, inlinedAt: !21)
+!32 = !DILocation(line: 236, column: 49, scope: !19, inlinedAt: !21)
+!33 = !DILocation(line: 236, column: 22, scope: !19, inlinedAt: !21)
+!34 = !DILocation(line: 236, column: 15, scope: !19, inlinedAt: !21)
+!35 = !DILocation(line: 57, column: 34, scope: !5)
+!36 = !DILocation(line: 57, column: 41, scope: !5)
+!37 = !DILocation(line: 58, column: 52, scope: !5)
+!38 = !DILocation(line: 59, column: 35, scope: !5)
+!39 = !DILocation(line: 59, column: 42, scope: !5)
+!40 = !DILocation(line: 65, column: 24, scope: !5)
+!41 = !DILocation(line: 67, column: 24, scope: !5)
+!42 = !DILocation(line: 68, column: 32, scope: !5)
+!43 = !DILocation(line: 73, column: 29, scope: !5)
+!44 = !DILocation(line: 58, column: 114, scope: !5)
+!45 = !DILocation(line: 63, column: 24, scope: !5)
+!46 = !DILocation(line: 57, column: 94, scope: !5)
+!47 = !DILocation(line: 61, column: 23, scope: !5)
+!48 = !DILocation(line: 59, column: 95, scope: !5)
+!49 = !DILocation(line: 69, column: 24, scope: !5)
+!50 = !DILocation(line: 71, column: 24, scope: !5)
+!51 = !DILocation(line: 72, column: 24, scope: !5)
+!52 = !DILocation(line: 73, column: 53, scope: !5)
+!53 = !DILocation(line: 51, column: 4, scope: !5)
diff --git a/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.ptx b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..2788ffc3798a0f8a72f374d735e4a318dea4886c
--- /dev/null
+++ b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.ptx
@@ -0,0 +1,1032 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused_add_mul_native_layer_norm_0 // -- Begin function triton_red_fused_add_mul_native_layer_norm_0
+.extern .shared .align 16 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
+                                        // @triton_red_fused_add_mul_native_layer_norm_0
+.visible .entry triton_red_fused_add_mul_native_layer_norm_0(
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_1,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_3,
+	.param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_4,
+	.param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_5,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_6,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_7
+)
+.reqntid 512
+{
+	.reg .pred 	%p<23>;
+	.reg .b16 	%rs<33>;
+	.reg .b32 	%r<287>;
+	.reg .b64 	%rd<15>;
+	.loc	1 18 0                          // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:18:0
+
+// %bb.0:                               // %__nv_rsqrtf.exit
+	ld.param.b64 	%rd9, [triton_red_fused_add_mul_native_layer_norm_0_param_0];
+	ld.param.b64 	%rd10, [triton_red_fused_add_mul_native_layer_norm_0_param_1];
+$L__tmp0:
+	.loc	1 23 28                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:23:28
+	mov.u32 	%r37, %ctaid.x;
+	.loc	1 25 21                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:25:21
+	setp.lt.u32 	%p1, %r37, 2048;
+	ld.param.b64 	%rd11, [triton_red_fused_add_mul_native_layer_norm_0_param_2];
+	ld.param.b64 	%rd12, [triton_red_fused_add_mul_native_layer_norm_0_param_3];
+	.loc	1 26 37                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:26:37
+	mov.u32 	%r38, %tid.x;
+	and.b32 	%r39, %r38, 511;
+	and.b32 	%r40, %r38, 31;
+	shl.b32 	%r41, %r38, 3;
+	and.b32 	%r42, %r41, 4088;
+	.loc	1 38 46                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:46
+	shl.b32 	%r43, %r37, 12;
+	.loc	1 38 41                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:41
+	or.b32 	%r44, %r42, %r43;
+	.loc	1 38 34                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:34
+	mul.wide.s32 	%rd13, %r44, 2;
+	add.s64 	%rd1, %rd9, %rd13;
+	.loc	1 38 51                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:51
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r5, 0;
+	// begin inline asm
+	mov.u32 %r1, %r5;
+	mov.u32 %r2, %r5;
+	mov.u32 %r3, %r5;
+	mov.u32 %r4, %r5;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	mov.b32 	{%rs1, %rs2}, %r1;
+	mov.b32 	{%rs3, %rs4}, %r2;
+	mov.b32 	{%rs5, %rs6}, %r3;
+	mov.b32 	{%rs7, %rs8}, %r4;
+	.loc	1 38 112                        // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:38:112
+	cvt.f32.bf16 	%r45, %rs1;
+	cvt.f32.bf16 	%r46, %rs2;
+	cvt.f32.bf16 	%r47, %rs3;
+	cvt.f32.bf16 	%r48, %rs4;
+	cvt.f32.bf16 	%r49, %rs5;
+	cvt.f32.bf16 	%r50, %rs6;
+	cvt.f32.bf16 	%r51, %rs7;
+	cvt.f32.bf16 	%r52, %rs8;
+	.loc	1 44 62                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:44:62
+	selp.f32 	%r53, %r45, 0f00000000, %p1;
+	selp.f32 	%r54, %r46, 0f00000000, %p1;
+	selp.f32 	%r55, %r47, 0f00000000, %p1;
+	selp.f32 	%r56, %r48, 0f00000000, %p1;
+	selp.f32 	%r57, %r49, 0f00000000, %p1;
+	selp.f32 	%r58, %r50, 0f00000000, %p1;
+	selp.f32 	%r59, %r51, 0f00000000, %p1;
+	selp.f32 	%r60, %r52, 0f00000000, %p1;
+	.loc	1 46 66                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:46:66
+	selp.f32 	%r61, 0f3F800000, 0f00000000, %p1;
+$L__tmp1:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r62, %r54, %r53;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r63, 0f40000000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p6, %r63, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r64, %r61, %r63;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r65, 0f00000000, %r64, %p6;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r66, %r65, %r62, %r53;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r67, %r62, %r62;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r68, %r61, %r67;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r69, %r65, %r68, 0f00000000;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r70, %r55, %r66;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r71, 0f40400000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p7, %r71, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r72, %r61, %r71;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r73, 0f00000000, %r72, %p7;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r74, %r73, %r70, %r66;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r75, %r70, %r70;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r76, %r63, %r75;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r77, %r73, %r76, %r69;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r78, %r56, %r74;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r79, 0f40800000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p8, %r79, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r80, %r61, %r79;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r81, 0f00000000, %r80, %p8;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r82, %r81, %r78, %r74;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r83, %r78, %r78;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r84, %r71, %r83;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r85, %r81, %r84, %r77;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r86, %r57, %r82;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r87, 0f40A00000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p9, %r87, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r88, %r61, %r87;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r89, 0f00000000, %r88, %p9;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r90, %r89, %r86, %r82;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r91, %r86, %r86;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r92, %r79, %r91;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r93, %r89, %r92, %r85;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r94, %r58, %r90;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r95, 0f40C00000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p10, %r95, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r96, %r61, %r95;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r97, 0f00000000, %r96, %p10;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r98, %r97, %r94, %r90;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r99, %r94, %r94;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r100, %r87, %r99;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r101, %r97, %r100, %r93;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r102, %r59, %r98;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r103, 0f40E00000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p11, %r103, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r104, %r61, %r103;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r105, 0f00000000, %r104, %p11;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r106, %r105, %r102, %r98;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r107, %r102, %r102;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r108, %r95, %r107;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r109, %r105, %r108, %r101;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r110, %r60, %r106;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r111, 0f41000000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p12, %r111, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r112, %r61, %r111;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r113, 0f00000000, %r112, %p12;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r114, %r113, %r110, %r106;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r115, %r110, %r110;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r116, %r103, %r115;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r117, %r113, %r116, %r109;
+$L__tmp2:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ]
+	shfl.sync.bfly.b32 	%r118, %r114, 16, 31, -1;
+	shfl.sync.bfly.b32 	%r119, %r117, 16, 31, -1;
+	shfl.sync.bfly.b32 	%r120, %r111, 16, 31, -1;
+$L__tmp3:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r121, %r118, %r114;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r122, %r111, %r120;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p13, %r122, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r123, %r120, %r122;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r124, 0f00000000, %r123, %p13;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r125, %r124, %r121, %r114;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r126, %r117, %r119;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r127, %r121, %r121;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r128, %r111, %r127;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r129, %r124, %r128, %r126;
+$L__tmp4:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ]
+	shfl.sync.bfly.b32 	%r130, %r125, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r131, %r129, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r132, %r122, 8, 31, -1;
+$L__tmp5:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r133, %r130, %r125;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r134, %r122, %r132;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p14, %r134, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r135, %r132, %r134;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r136, 0f00000000, %r135, %p14;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r137, %r136, %r133, %r125;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r138, %r129, %r131;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r139, %r133, %r133;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r140, %r122, %r139;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r141, %r136, %r140, %r138;
+$L__tmp6:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ]
+	shfl.sync.bfly.b32 	%r142, %r137, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r143, %r141, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r144, %r134, 4, 31, -1;
+$L__tmp7:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r145, %r142, %r137;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r146, %r134, %r144;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p15, %r146, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r147, %r144, %r146;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r148, 0f00000000, %r147, %p15;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r149, %r148, %r145, %r137;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r150, %r141, %r143;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r151, %r145, %r145;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r152, %r134, %r151;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r153, %r148, %r152, %r150;
+$L__tmp8:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ]
+	shfl.sync.bfly.b32 	%r154, %r149, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r155, %r153, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r156, %r146, 2, 31, -1;
+$L__tmp9:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r157, %r154, %r149;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r158, %r146, %r156;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p16, %r158, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r159, %r156, %r158;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r160, 0f00000000, %r159, %p16;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r161, %r160, %r157, %r149;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r162, %r153, %r155;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r163, %r157, %r157;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r164, %r146, %r163;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r165, %r160, %r164, %r162;
+$L__tmp10:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ]
+	shfl.sync.bfly.b32 	%r166, %r161, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r167, %r165, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r168, %r158, 1, 31, -1;
+$L__tmp11:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r169, %r166, %r161;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r11, %r158, %r168;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p17, %r11, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r170, %r168, %r11;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r171, 0f00000000, %r170, %p17;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r7, %r171, %r169, %r161;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r172, %r165, %r167;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r173, %r169, %r169;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r174, %r158, %r173;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r9, %r171, %r174, %r172;
+$L__tmp12:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ]
+	setp.eq.b32 	%p2, %r40, 0;
+	shr.u32 	%r175, %r38, 3;
+	and.b32 	%r176, %r175, 60;
+	mov.b32 	%r177, global_smem;
+	add.s32 	%r6, %r177, %r176;
+	// begin inline asm
+	@%p2 st.shared.b32 [ %r6 + 0 ], %r7;
+	// end inline asm
+	add.s32 	%r8, %r6, 64;
+	// begin inline asm
+	@%p2 st.shared.b32 [ %r8 + 0 ], %r9;
+	// end inline asm
+	add.s32 	%r10, %r6, 128;
+	// begin inline asm
+	@%p2 st.shared.b32 [ %r10 + 0 ], %r11;
+	// end inline asm
+	bar.sync 	0;
+	setp.lt.u32 	%p3, %r39, 16;
+	shl.b32 	%r178, %r39, 2;
+	add.s32 	%r13, %r177, %r178;
+	// begin inline asm
+	@%p3 ld.shared.b32 %r12, [ %r13 + 0 ];
+	// end inline asm
+	add.s32 	%r15, %r13, 64;
+	// begin inline asm
+	@%p3 ld.shared.b32 %r14, [ %r15 + 0 ];
+	// end inline asm
+	add.s32 	%r17, %r13, 128;
+	// begin inline asm
+	@%p3 ld.shared.b32 %r16, [ %r17 + 0 ];
+	// end inline asm
+	shfl.sync.bfly.b32 	%r179, %r12, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r180, %r14, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r181, %r16, 8, 31, -1;
+$L__tmp13:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r182, %r179, %r12;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r183, %r16, %r181;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p18, %r183, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r184, %r181, %r183;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r185, 0f00000000, %r184, %p18;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r186, %r182, %r185, %r12;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r187, %r14, %r180;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r188, %r182, %r182;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r189, %r188, %r16;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r190, %r189, %r185, %r187;
+$L__tmp14:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ]
+	shfl.sync.bfly.b32 	%r191, %r186, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r192, %r190, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r193, %r183, 4, 31, -1;
+$L__tmp15:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r194, %r191, %r186;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r195, %r183, %r193;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p19, %r195, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r196, %r193, %r195;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r197, 0f00000000, %r196, %p19;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r198, %r194, %r197, %r186;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r199, %r190, %r192;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r200, %r194, %r194;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r201, %r183, %r200;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r202, %r197, %r201, %r199;
+$L__tmp16:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ]
+	shfl.sync.bfly.b32 	%r203, %r198, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r204, %r202, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r205, %r195, 2, 31, -1;
+$L__tmp17:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r206, %r203, %r198;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r207, %r195, %r205;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p20, %r207, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r208, %r205, %r207;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r209, 0f00000000, %r208, %p20;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r210, %r206, %r209, %r198;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r211, %r202, %r204;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r212, %r206, %r206;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r213, %r195, %r212;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r214, %r209, %r213, %r211;
+$L__tmp18:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ]
+	shfl.sync.bfly.b32 	%r215, %r210, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r216, %r214, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r217, %r207, 1, 31, -1;
+$L__tmp19:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	sub.f32 	%r218, %r215, %r210;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r20, %r207, %r217;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	setp.eq.f32 	%p21, %r20, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	div.full.f32 	%r219, %r217, %r20;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	selp.f32 	%r220, 0f00000000, %r219, %p21;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r18, %r218, %r220, %r210;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	add.f32 	%r221, %r214, %r216;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r222, %r218, %r218;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	mul.f32 	%r223, %r207, %r222;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ] ]
+	fma.rn.f32 	%r19, %r220, %r223, %r221;
+$L__tmp20:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:47:79 ]
+	and.b32 	%r224, %r38, 15;
+	setp.eq.b32 	%p22, %r224, 0;
+	and.pred 	%p4, %p3, %p22;
+	// begin inline asm
+	@%p4 st.shared.b32 [ %r13 + 0 ], %r18;
+	// end inline asm
+	// begin inline asm
+	@%p4 st.shared.b32 [ %r15 + 0 ], %r19;
+	// end inline asm
+	// begin inline asm
+	@%p4 st.shared.b32 [ %r17 + 0 ], %r20;
+	// end inline asm
+	bar.sync 	0;
+	ld.shared.b32 	%r225, [global_smem];
+	ld.shared.b32 	%r226, [global_smem+64];
+$L__tmp21:
+	.loc	1 57 34                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:34
+	mul.wide.u32 	%rd14, %r42, 2;
+	add.s64 	%rd3, %rd10, %rd14;
+	.loc	1 57 41                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:41
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0;
+	// end inline asm
+	mov.pred 	%p5, -1;
+	// begin inline asm
+	mov.u32 %r21, %r5;
+	mov.u32 %r22, %r5;
+	mov.u32 %r23, %r5;
+	mov.u32 %r24, %r5;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r21, %r22, %r23, %r24 }, [ %rd3 + 0 ], %rd4;
+	// end inline asm
+	.loc	1 58 52                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:52
+	// begin inline asm
+	mov.u64 %rd5, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd5, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r25, %r5;
+	mov.u32 %r26, %r5;
+	mov.u32 %r27, %r5;
+	mov.u32 %r28, %r5;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r25, %r26, %r27, %r28 }, [ %rd1 + 0 ], %rd5;
+	// end inline asm
+	.loc	1 59 35                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:35
+	add.s64 	%rd6, %rd11, %rd14;
+	.loc	1 59 42                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:42
+	// begin inline asm
+	mov.u64 %rd7, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd7, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r29, %r5;
+	mov.u32 %r30, %r5;
+	mov.u32 %r31, %r5;
+	mov.u32 %r32, %r5;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r29, %r30, %r31, %r32 }, [ %rd6 + 0 ], %rd7;
+	// end inline asm
+	mov.b32 	%r227, 0f45800000;
+	.loc	1 65 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:65:24
+	div.full.f32 	%r228, %r226, %r227;
+	.loc	1 67 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:67:24
+	add.f32 	%r229, %r228, 0f358637BD;
+	.loc	1 68 32                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:68:32
+	rsqrt.approx.ftz.f32 	%r230, %r229;
+	.loc	1 73 29                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:73:29
+	add.s64 	%rd8, %rd12, %rd13;
+	.loc	1 58 114                        // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:114
+	mov.b32 	{%rs9, %rs10}, %r25;
+	cvt.f32.bf16 	%r231, %rs10;
+	cvt.f32.bf16 	%r232, %rs9;
+	.loc	1 63 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:63:24
+	sub.f32 	%r233, %r232, %r225;
+	sub.f32 	%r234, %r231, %r225;
+	.loc	1 57 94                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:94
+	mov.b32 	{%rs11, %rs12}, %r21;
+	cvt.f32.bf16 	%r235, %rs11;
+	cvt.f32.bf16 	%r236, %rs12;
+	.loc	1 61 23                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:61:23
+	add.f32 	%r237, %r236, 0f3F800000;
+	add.f32 	%r238, %r235, 0f3F800000;
+	.loc	1 59 95                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:95
+	mov.b32 	{%rs13, %rs14}, %r29;
+	cvt.f32.bf16 	%r239, %rs14;
+	cvt.f32.bf16 	%r240, %rs13;
+	.loc	1 69 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:69:24
+	mul.f32 	%r241, %r234, %r230;
+	mul.f32 	%r242, %r233, %r230;
+	.loc	1 72 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:72:24
+	fma.rn.f32 	%r243, %r238, %r242, %r240;
+	fma.rn.f32 	%r244, %r237, %r241, %r239;
+	.loc	1 73 53                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:73:53
+	cvt.rn.bf16x2.f32 	%r33, %r244, %r243;
+	.loc	1 58 114                        // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:114
+	mov.b32 	{%rs15, %rs16}, %r26;
+	cvt.f32.bf16 	%r245, %rs16;
+	cvt.f32.bf16 	%r246, %rs15;
+	.loc	1 63 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:63:24
+	sub.f32 	%r247, %r246, %r225;
+	sub.f32 	%r248, %r245, %r225;
+	.loc	1 57 94                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:94
+	mov.b32 	{%rs17, %rs18}, %r22;
+	cvt.f32.bf16 	%r249, %rs17;
+	cvt.f32.bf16 	%r250, %rs18;
+	.loc	1 61 23                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:61:23
+	add.f32 	%r251, %r250, 0f3F800000;
+	add.f32 	%r252, %r249, 0f3F800000;
+	.loc	1 59 95                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:95
+	mov.b32 	{%rs19, %rs20}, %r30;
+	cvt.f32.bf16 	%r253, %rs20;
+	cvt.f32.bf16 	%r254, %rs19;
+	.loc	1 69 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:69:24
+	mul.f32 	%r255, %r248, %r230;
+	mul.f32 	%r256, %r247, %r230;
+	.loc	1 72 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:72:24
+	fma.rn.f32 	%r257, %r252, %r256, %r254;
+	fma.rn.f32 	%r258, %r251, %r255, %r253;
+	.loc	1 73 53                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:73:53
+	cvt.rn.bf16x2.f32 	%r34, %r258, %r257;
+	.loc	1 58 114                        // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:114
+	mov.b32 	{%rs21, %rs22}, %r27;
+	cvt.f32.bf16 	%r259, %rs22;
+	cvt.f32.bf16 	%r260, %rs21;
+	.loc	1 63 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:63:24
+	sub.f32 	%r261, %r260, %r225;
+	sub.f32 	%r262, %r259, %r225;
+	.loc	1 57 94                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:94
+	mov.b32 	{%rs23, %rs24}, %r23;
+	cvt.f32.bf16 	%r263, %rs23;
+	cvt.f32.bf16 	%r264, %rs24;
+	.loc	1 61 23                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:61:23
+	add.f32 	%r265, %r264, 0f3F800000;
+	add.f32 	%r266, %r263, 0f3F800000;
+	.loc	1 59 95                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:95
+	mov.b32 	{%rs25, %rs26}, %r31;
+	cvt.f32.bf16 	%r267, %rs26;
+	cvt.f32.bf16 	%r268, %rs25;
+	.loc	1 69 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:69:24
+	mul.f32 	%r269, %r262, %r230;
+	mul.f32 	%r270, %r261, %r230;
+	.loc	1 72 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:72:24
+	fma.rn.f32 	%r271, %r266, %r270, %r268;
+	fma.rn.f32 	%r272, %r265, %r269, %r267;
+	.loc	1 73 53                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:73:53
+	cvt.rn.bf16x2.f32 	%r35, %r272, %r271;
+	.loc	1 58 114                        // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:58:114
+	mov.b32 	{%rs27, %rs28}, %r28;
+	cvt.f32.bf16 	%r273, %rs28;
+	cvt.f32.bf16 	%r274, %rs27;
+	.loc	1 63 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:63:24
+	sub.f32 	%r275, %r274, %r225;
+	sub.f32 	%r276, %r273, %r225;
+	.loc	1 57 94                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:57:94
+	mov.b32 	{%rs29, %rs30}, %r24;
+	cvt.f32.bf16 	%r277, %rs29;
+	cvt.f32.bf16 	%r278, %rs30;
+	.loc	1 61 23                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:61:23
+	add.f32 	%r279, %r278, 0f3F800000;
+	add.f32 	%r280, %r277, 0f3F800000;
+	.loc	1 59 95                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:59:95
+	mov.b32 	{%rs31, %rs32}, %r32;
+	cvt.f32.bf16 	%r281, %rs32;
+	cvt.f32.bf16 	%r282, %rs31;
+	.loc	1 69 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:69:24
+	mul.f32 	%r283, %r276, %r230;
+	mul.f32 	%r284, %r275, %r230;
+	.loc	1 72 24                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:72:24
+	fma.rn.f32 	%r285, %r280, %r284, %r282;
+	fma.rn.f32 	%r286, %r279, %r283, %r281;
+	.loc	1 73 53                         // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:73:53
+	cvt.rn.bf16x2.f32 	%r36, %r286, %r285;
+	// begin inline asm
+	@%p1 st.global.v4.b32 [ %rd8 + 0 ], { %r33, %r34, %r35, %r36 };
+	// end inline asm
+	.loc	1 51 4                          // cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py:51:4
+	ret;
+$L__tmp22:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 343                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x150 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 119
+.b8 119
+.b8 105
+.b8 122
+.b8 122
+.b8 106
+.b8 119
+.b8 109
+.b8 100
+.b8 52
+.b8 97
+.b8 106
+.b8 108
+.b8 117
+.b8 98
+.b8 120
+.b8 112
+.b8 118
+.b8 120
+.b8 105
+.b8 100
+.b8 106
+.b8 105
+.b8 121
+.b8 51
+.b8 108
+.b8 100
+.b8 118
+.b8 53
+.b8 101
+.b8 102
+.b8 108
+.b8 119
+.b8 108
+.b8 117
+.b8 100
+.b8 103
+.b8 105
+.b8 122
+.b8 99
+.b8 97
+.b8 104
+.b8 118
+.b8 115
+.b8 112
+.b8 52
+.b8 105
+.b8 55
+.b8 53
+.b8 115
+.b8 50
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 119
+.b8 119
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x2f DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 97
+.b8 100
+.b8 100
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 110
+.b8 97
+.b8 116
+.b8 105
+.b8 118
+.b8 101
+.b8 95
+.b8 108
+.b8 97
+.b8 121
+.b8 101
+.b8 114
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x113:0x47 DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x128:0x31 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp21                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 47                                  // DW_AT_call_line
+.b8 79                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x140:0x18 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp20                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 243                                 // DW_AT_call_line
+.b8 46                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.source b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..20e39eee37abfa344439ff226f13b900d337b002
--- /dev/null
+++ b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.source
@@ -0,0 +1,420 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":18:0)
+#loc72 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":216:0)
+#loc85 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":133:0)
+#loc89 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":242:0)
+#loc91 = loc(unknown)
+#loc94 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":230:0)
+#loc109 = loc("in_ptr0"(#loc))
+#loc110 = loc("in_ptr1"(#loc))
+#loc111 = loc("in_ptr2"(#loc))
+#loc112 = loc("out_ptr2"(#loc))
+#loc113 = loc("xnumel"(#loc))
+#loc114 = loc("r0_numel"(#loc))
+#loc171 = loc("value"(#loc72))
+#loc172 = loc("mean"(#loc72))
+#loc173 = loc("m2"(#loc72))
+#loc174 = loc("weight"(#loc72))
+#loc175 = loc("first_iteration"(#loc72))
+#loc185 = loc("input"(#loc85))
+#loc186 = loc("mean"(#loc89))
+#loc187 = loc("m2"(#loc89))
+#loc188 = loc("weight"(#loc89))
+#loc189 = loc("mean_1"(#loc94))
+#loc190 = loc("m2_1"(#loc94))
+#loc191 = loc("weight_1"(#loc94))
+#loc192 = loc("mean_2"(#loc94))
+#loc193 = loc("m2_2"(#loc94))
+#loc194 = loc("weight_2"(#loc94))
+#loc201 = loc("new_mean"(#loc171))
+module {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 2048 : i32 loc(#loc115)
+    %r0_numel_1 = arith.constant 4096 : i32 loc(#loc116)
+    %xoffset = tt.get_program_id x : i32 loc(#loc117)
+    %xoffset_2 = arith.constant 1 : i32 loc(#loc118)
+    %xoffset_3 = arith.constant 1 : i32 loc(#loc118)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc118)
+    %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc119)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc120)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc121)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc121)
+    %xmask = arith.constant dense<2048> : tensor<1x1xi32> loc(#loc122)
+    %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc122)
+    %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32> loc(#loc123)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32> -> tensor<1x4096xi32> loc(#loc124)
+    %tmp3_mean = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc125)
+    %tmp3_m2 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc126)
+    %tmp3_weight = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc127)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc14)
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc14)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14)
+    %2 = arith.bitcast %c4096_i32 : i32 to i32 loc(#loc14)
+    %3 = ub.poison : i32 loc(#loc14)
+    %tmp3_weight_10:3 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%tmp3_mean_13 = %tmp3_mean, %tmp3_m2_14 = %tmp3_m2, %tmp3_weight_15 = %tmp3_weight) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4096xi32> loc(#loc129)
+      %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x4096xi32> loc(#loc129)
+      %r0_mask = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc130)
+      %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x4096xi32> loc(#loc130)
+      %tmp0 = arith.constant 4096 : i32 loc(#loc131)
+      %tmp0_18 = arith.constant 4096 : i32 loc(#loc131)
+      %tmp0_19 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc131)
+      %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc131)
+      %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc132)
+      %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x4096xi32> loc(#loc132)
+      %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc133)
+      %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc133)
+      %tmp0_25 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc134)
+      %tmp0_26 = arith.andi %r0_mask_17, %tmp0_25 : tensor<1x4096xi1> loc(#loc134)
+      %tmp0_27 = arith.constant 0.000000e+00 : f32 loc(#loc135)
+      %tmp0_28 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc135)
+      %tmp0_29 = arith.truncf %tmp0_28 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc135)
+      %tmp0_30 = tt.load %tmp0_24, %tmp0_26, %tmp0_29 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc135)
+      %tmp0_31 = arith.extf %tmp0_30 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc136)
+      %c0_i32_32 = arith.constant 0 : i32 loc(#loc23)
+      %9 = arith.cmpi eq, %r0_offset, %c0_i32_32 : i32 loc(#loc23)
+      %10:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_u1__(%tmp0_31, %tmp3_mean_13, %tmp3_m2_14, %tmp3_weight_15, %9) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>, i1) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) loc(#loc24)
+      %tmp3_mean_33 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc137)
+      %tmp3_mean_34 = arith.andi %r0_mask_17, %tmp3_mean_33 : tensor<1x4096xi1> loc(#loc137)
+      %tmp3_mean_35 = arith.select %tmp3_mean_34, %10#0, %tmp3_mean_13 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc138)
+      %tmp3_m2_36 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc139)
+      %tmp3_m2_37 = arith.andi %r0_mask_17, %tmp3_m2_36 : tensor<1x4096xi1> loc(#loc139)
+      %tmp3_m2_38 = arith.select %tmp3_m2_37, %10#1, %tmp3_m2_14 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc140)
+      %tmp3_weight_39 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc141)
+      %tmp3_weight_40 = arith.andi %r0_mask_17, %tmp3_weight_39 : tensor<1x4096xi1> loc(#loc141)
+      %tmp3_weight_41 = arith.select %tmp3_weight_40, %10#2, %tmp3_weight_15 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc142)
+      scf.yield %tmp3_mean_35, %tmp3_m2_38, %tmp3_weight_41 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc31)
+    } loc(#loc207)
+    %4:3 = tt.call @"torch._inductor.runtime.triton_helpers.welford__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S__(3,)cconstexpr_1_"(%tmp3_weight_10#0, %tmp3_weight_10#1, %tmp3_weight_10#2) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc32)
+    %tmp3 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc143)
+    %tmp7 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc144)
+    %tmp8 = tt.expand_dims %4#2 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc145)
+    %c0_i32_11 = arith.constant 0 : i32 loc(#loc36)
+    %c4096_i32_12 = arith.constant 4096 : i32 loc(#loc36)
+    %5 = arith.bitcast %c0_i32_11 : i32 to i32 loc(#loc36)
+    %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc36)
+    %7 = arith.bitcast %c4096_i32_12 : i32 to i32 loc(#loc36)
+    %8 = ub.poison : i32 loc(#loc36)
+    scf.for %r0_offset = %5 to %6 step %7  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4096xi32> loc(#loc146)
+      %r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x4096xi32> loc(#loc146)
+      %r0_mask = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc147)
+      %r0_mask_14 = arith.cmpi slt, %r0_index_13, %r0_mask : tensor<1x4096xi32> loc(#loc147)
+      %tmp9 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc148)
+      %tmp9_15 = tt.addptr %tmp9, %r0_index_13 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc148)
+      %tmp9_16 = arith.constant 0.000000e+00 : f32 loc(#loc149)
+      %tmp9_17 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc149)
+      %tmp9_18 = arith.truncf %tmp9_17 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc149)
+      %tmp9_19 = tt.load %tmp9_15, %r0_mask_14, %tmp9_18 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc149)
+      %tmp9_20 = arith.extf %tmp9_19 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc150)
+      %tmp12 = arith.constant 4096 : i32 loc(#loc151)
+      %tmp12_21 = arith.constant 4096 : i32 loc(#loc151)
+      %tmp12_22 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc151)
+      %tmp12_23 = arith.muli %tmp12_22, %xindex_7 : tensor<1x1xi32> loc(#loc151)
+      %tmp12_24 = tt.broadcast %tmp12_23 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc152)
+      %tmp12_25 = arith.addi %r0_index_13, %tmp12_24 : tensor<1x4096xi32> loc(#loc152)
+      %tmp12_26 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc153)
+      %tmp12_27 = tt.addptr %tmp12_26, %tmp12_25 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc153)
+      %tmp12_28 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc154)
+      %tmp12_29 = arith.andi %r0_mask_14, %tmp12_28 : tensor<1x4096xi1> loc(#loc154)
+      %tmp12_30 = arith.constant 0.000000e+00 : f32 loc(#loc155)
+      %tmp12_31 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc155)
+      %tmp12_32 = arith.truncf %tmp12_31 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc155)
+      %tmp12_33 = tt.load %tmp12_27, %tmp12_29, %tmp12_32 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>> loc(#loc155)
+      %tmp12_34 = arith.extf %tmp12_33 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc156)
+      %tmp23 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc157)
+      %tmp23_35 = tt.addptr %tmp23, %r0_index_13 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc157)
+      %tmp23_36 = arith.constant 0.000000e+00 : f32 loc(#loc158)
+      %tmp23_37 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc158)
+      %tmp23_38 = arith.truncf %tmp23_37 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc158)
+      %tmp23_39 = tt.load %tmp23_35, %r0_mask_14, %tmp23_38 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc158)
+      %tmp23_40 = arith.extf %tmp23_39 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc159)
+      %tmp10 = arith.constant 1.000000e+00 : f32 loc(#loc160)
+      %tmp11 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc161)
+      %tmp11_41 = arith.addf %tmp9_20, %tmp11 : tensor<1x4096xf32> loc(#loc161)
+      %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc162)
+      %tmp14_42 = arith.subf %tmp12_34, %tmp14 : tensor<1x4096xf32> loc(#loc162)
+      %tmp15 = arith.constant 4.096000e+03 : f32 loc(#loc163)
+      %tmp16 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc164)
+      %tmp16_43 = arith.divf %tmp7, %tmp16 : tensor<1x1xf32> loc(#loc164)
+      %tmp17 = arith.constant 9.99999997E-7 : f32 loc(#loc165)
+      %tmp18 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc166)
+      %tmp18_44 = arith.addf %tmp16_43, %tmp18 : tensor<1x1xf32> loc(#loc166)
+      %tmp19 = tt.extern_elementwise %tmp18_44 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc167)
+      %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc168)
+      %tmp20_45 = arith.mulf %tmp14_42, %tmp20 : tensor<1x4096xf32> loc(#loc168)
+      %tmp22 = arith.mulf %tmp11_41, %tmp20_45 : tensor<1x4096xf32> loc(#loc169)
+      %tmp24 = arith.addf %tmp22, %tmp23_40 : tensor<1x4096xf32> loc(#loc170)
+      %c4096_i32_46 = arith.constant 4096 : i32 loc(#loc62)
+      %c4096_i32_47 = arith.constant 4096 : i32 loc(#loc62)
+      %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc62)
+      %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc62)
+      %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc63)
+      %11 = arith.addi %r0_index_13, %10 : tensor<1x4096xi32> loc(#loc63)
+      %12 = tt.splat %out_ptr2 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc64)
+      %13 = tt.addptr %12, %11 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc64)
+      %14 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc65)
+      %15 = arith.andi %r0_mask_14, %14 : tensor<1x4096xi1> loc(#loc65)
+      %16 = arith.truncf %tmp24 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc66)
+      tt.store %13, %16, %15 : tensor<1x4096x!tt.ptr<bf16>> loc(#loc66)
+    } loc(#loc36)
+    tt.return loc(#loc67)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() -> tensor<1x4096xf32> attributes {noinline = false} {
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc69)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc69)
+    tt.return %cst_0 : tensor<1x4096xf32> loc(#loc70)
+  ^bb1:  // no predecessors
+    %0 = ub.poison : tensor<1x4096xf32> loc(#loc71)
+    tt.return %0 : tensor<1x4096xf32> loc(#loc71)
+  } loc(#loc68)
+  tt.func private @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_u1__(%new_mean: tensor<1x4096xf32> loc("new_mean"(#loc171)), %mean: tensor<1x4096xf32> loc("mean"(#loc72)), %m2: tensor<1x4096xf32> loc("m2"(#loc72)), %weight: tensor<1x4096xf32> loc("weight"(#loc72)), %first_iteration: i1 loc("first_iteration"(#loc72))) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) attributes {noinline = false} {
+    %0:3 = scf.if %first_iteration -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) {
+      %new_weight = arith.constant 1.000000e+00 : f32 loc(#loc176)
+      %new_weight_0 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc202)
+      %new_m2 = tt.call @triton.language.standard.zeros_like__fp32S1_4096S__(%m2) : (tensor<1x4096xf32>) -> tensor<1x4096xf32> loc(#loc203)
+      scf.yield %new_m2, %new_mean, %new_weight_0 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc203)
+    } else {
+      %delta = arith.subf %new_mean, %mean : tensor<1x4096xf32> loc(#loc178)
+      %new_weight = arith.constant 1 : i32 loc(#loc179)
+      %new_weight_0 = arith.constant 1.000000e+00 : f32 loc(#loc179)
+      %new_weight_1 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc179)
+      %new_weight_2 = arith.addf %weight, %new_weight_1 : tensor<1x4096xf32> loc(#loc204)
+      %new_mean_3 = arith.divf %delta, %new_weight_2 : tensor<1x4096xf32> loc(#loc180)
+      %new_mean_4 = arith.addf %mean, %new_mean_3 : tensor<1x4096xf32> loc(#loc205)
+      %new_m2 = arith.subf %new_mean, %new_mean_4 : tensor<1x4096xf32> loc(#loc182)
+      %new_m2_5 = arith.mulf %delta, %new_m2 : tensor<1x4096xf32> loc(#loc183)
+      %new_m2_6 = arith.addf %m2, %new_m2_5 : tensor<1x4096xf32> loc(#loc206)
+      scf.yield %new_m2_6, %new_mean_4, %new_weight_2 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc184)
+    } loc(#loc73)
+    tt.return %0#1, %0#0, %0#2 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc83)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1x4096xf32> loc(#loc84)
+    %2 = ub.poison : tensor<1x4096xf32> loc(#loc84)
+    %3 = ub.poison : tensor<1x4096xf32> loc(#loc84)
+    tt.return %1, %2, %3 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc84)
+  } loc(#loc72)
+  tt.func private @triton.language.standard.zeros_like__fp32S1_4096S__(%input: tensor<1x4096xf32> loc("input"(#loc85))) -> tensor<1x4096xf32> attributes {noinline = false} {
+    %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc86)
+    tt.return %0 : tensor<1x4096xf32> loc(#loc87)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1x4096xf32> loc(#loc88)
+    tt.return %1 : tensor<1x4096xf32> loc(#loc88)
+  } loc(#loc85)
+  tt.func private @"torch._inductor.runtime.triton_helpers.welford__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S__(3,)cconstexpr_1_"(%mean: tensor<1x4096xf32> loc("mean"(#loc89)), %m2: tensor<1x4096xf32> loc("m2"(#loc89)), %weight: tensor<1x4096xf32> loc("weight"(#loc89))) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} {
+    %0:3 = "tt.reduce"(%mean, %m2, %weight) <{axis = 1 : i32}> ({
+    ^bb0(%arg3: f32 loc(unknown), %arg4: f32 loc(unknown), %arg5: f32 loc(unknown), %arg6: f32 loc(unknown), %arg7: f32 loc(unknown), %arg8: f32 loc(unknown)):
+      %4:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (f32, f32, f32, f32, f32, f32) -> (f32, f32, f32) loc(#loc90)
+      tt.reduce.return %4#0, %4#1, %4#2 : f32, f32, f32 loc(#loc90)
+    }) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc90)
+    tt.return %0#0, %0#1, %0#2 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc92)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1xf32> loc(#loc93)
+    %2 = ub.poison : tensor<1xf32> loc(#loc93)
+    %3 = ub.poison : tensor<1xf32> loc(#loc93)
+    tt.return %1, %2, %3 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc93)
+  } loc(#loc89)
+  tt.func private @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%mean_1: f32 loc("mean_1"(#loc94)), %m2_1: f32 loc("m2_1"(#loc94)), %weight_1: f32 loc("weight_1"(#loc94)), %mean_2: f32 loc("mean_2"(#loc94)), %m2_2: f32 loc("m2_2"(#loc94)), %weight_2: f32 loc("weight_2"(#loc94))) -> (f32, f32, f32) attributes {noinline = false} {
+    %delta = arith.subf %mean_2, %mean_1 : f32 loc(#loc195)
+    %new_weight = arith.addf %weight_1, %weight_2 : f32 loc(#loc196)
+    %w2_over_w = arith.constant 0.000000e+00 : f32 loc(#loc197)
+    %w2_over_w_0 = arith.cmpf oeq, %new_weight, %w2_over_w : f32 loc(#loc197)
+    %w2_over_w_1 = arith.divf %weight_2, %new_weight : f32 loc(#loc198)
+    %w2_over_w_2 = arith.constant 0.000000e+00 : f32 loc(#loc199)
+    %w2_over_w_3 = arith.constant 0.000000e+00 : f32 loc(#loc199)
+    %w2_over_w_4 = arith.select %w2_over_w_0, %w2_over_w_3, %w2_over_w_1 : f32 loc(#loc199)
+    %0 = arith.mulf %delta, %w2_over_w_4 : f32 loc(#loc100)
+    %1 = arith.addf %mean_1, %0 : f32 loc(#loc101)
+    %2 = arith.addf %m2_1, %m2_2 : f32 loc(#loc102)
+    %3 = arith.mulf %delta, %delta : f32 loc(#loc103)
+    %4 = arith.mulf %3, %weight_1 : f32 loc(#loc104)
+    %5 = arith.mulf %4, %w2_over_w_4 : f32 loc(#loc105)
+    %6 = arith.addf %2, %5 : f32 loc(#loc106)
+    tt.return %1, %6, %new_weight : f32, f32, f32 loc(#loc107)
+  ^bb1:  // no predecessors
+    %7 = ub.poison : f32 loc(#loc108)
+    %8 = ub.poison : f32 loc(#loc108)
+    %9 = ub.poison : f32 loc(#loc108)
+    tt.return %7, %8, %9 : f32, f32, f32 loc(#loc108)
+  } loc(#loc94)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":25:21)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":29:45)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":30:43)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":31:47)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:46)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:34)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:61)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:51)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:112)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":42:62)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":42:51)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":44:39)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":44:62)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":45:37)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":45:58)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":46:41)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":46:66)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":46:8)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":47:79)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":48:16)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":49:16)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":50:16)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":51:43)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":52:31)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":53:29)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:34)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:41)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:94)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:47)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:42)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:35)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:62)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:52)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:114)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:35)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:42)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:95)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":60:16)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":61:23)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":63:24)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":64:16)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":65:24)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":66:16)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":67:24)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":68:32)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":69:24)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":71:24)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":72:24)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:41)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:36)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:29)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:63)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:53)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":51:4)
+#loc68 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":120:0)
+#loc69 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:31)
+#loc70 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:11)
+#loc71 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:4)
+#loc73 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7)
+#loc74 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":218:46)
+#loc75 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31)
+#loc76 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24)
+#loc77 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30)
+#loc78 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34)
+#loc79 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26)
+#loc80 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39)
+#loc81 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31)
+#loc82 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22)
+#loc83 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:11)
+#loc84 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:4)
+#loc86 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:30)
+#loc87 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:11)
+#loc88 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:4)
+#loc90 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc92 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:11)
+#loc93 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:4)
+#loc95 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc96 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc97 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc98 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc99 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc100 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc101 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc102 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc103 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc104 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc105 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc106 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc107 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:11)
+#loc108 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:4)
+#loc115 = loc("xnumel"(#loc1))
+#loc116 = loc("r0_numel"(#loc2))
+#loc117 = loc("xoffset"(#loc3))
+#loc118 = loc("xoffset"(#loc4))
+#loc119 = loc("xindex"(#loc5))
+#loc120 = loc("xindex"(#loc6))
+#loc121 = loc("xindex"(#loc7))
+#loc122 = loc("xmask"(#loc8))
+#loc123 = loc("r0_base"(#loc9))
+#loc124 = loc("r0_base"(#loc10))
+#loc125 = loc("tmp3_mean"(#loc11))
+#loc126 = loc("tmp3_m2"(#loc12))
+#loc127 = loc("tmp3_weight"(#loc13))
+#loc128 = loc("tmp3_mean"(#loc14))
+#loc129 = loc("r0_index"(#loc15))
+#loc130 = loc("r0_mask"(#loc16))
+#loc131 = loc("tmp0"(#loc17))
+#loc132 = loc("tmp0"(#loc18))
+#loc133 = loc("tmp0"(#loc19))
+#loc134 = loc("tmp0"(#loc20))
+#loc135 = loc("tmp0"(#loc21))
+#loc136 = loc("tmp0"(#loc22))
+#loc137 = loc("tmp3_mean"(#loc25))
+#loc138 = loc("tmp3_mean"(#loc26))
+#loc139 = loc("tmp3_m2"(#loc27))
+#loc140 = loc("tmp3_m2"(#loc28))
+#loc141 = loc("tmp3_weight"(#loc29))
+#loc142 = loc("tmp3_weight"(#loc30))
+#loc143 = loc("tmp3"(#loc33))
+#loc144 = loc("tmp7"(#loc34))
+#loc145 = loc("tmp8"(#loc35))
+#loc146 = loc("r0_index"(#loc37))
+#loc147 = loc("r0_mask"(#loc38))
+#loc148 = loc("tmp9"(#loc39))
+#loc149 = loc("tmp9"(#loc40))
+#loc150 = loc("tmp9"(#loc41))
+#loc151 = loc("tmp12"(#loc42))
+#loc152 = loc("tmp12"(#loc43))
+#loc153 = loc("tmp12"(#loc44))
+#loc154 = loc("tmp12"(#loc45))
+#loc155 = loc("tmp12"(#loc46))
+#loc156 = loc("tmp12"(#loc47))
+#loc157 = loc("tmp23"(#loc48))
+#loc158 = loc("tmp23"(#loc49))
+#loc159 = loc("tmp23"(#loc50))
+#loc160 = loc("tmp10"(#loc51))
+#loc161 = loc("tmp11"(#loc52))
+#loc162 = loc("tmp14"(#loc53))
+#loc163 = loc("tmp15"(#loc54))
+#loc164 = loc("tmp16"(#loc55))
+#loc165 = loc("tmp17"(#loc56))
+#loc166 = loc("tmp18"(#loc57))
+#loc167 = loc("tmp19"(#loc58))
+#loc168 = loc("tmp20"(#loc59))
+#loc169 = loc("tmp22"(#loc60))
+#loc170 = loc("tmp24"(#loc61))
+#loc176 = loc("new_weight"(#loc74))
+#loc177 = loc("new_m2"(#loc75))
+#loc178 = loc("delta"(#loc76))
+#loc179 = loc("new_weight"(#loc77))
+#loc180 = loc("new_mean"(#loc78))
+#loc181 = loc("new_mean"(#loc79))
+#loc182 = loc("new_m2"(#loc80))
+#loc183 = loc("new_m2"(#loc81))
+#loc184 = loc("new_m2"(#loc82))
+#loc195 = loc("delta"(#loc95))
+#loc196 = loc("new_weight"(#loc96))
+#loc197 = loc("w2_over_w"(#loc97))
+#loc198 = loc("w2_over_w"(#loc98))
+#loc199 = loc("w2_over_w"(#loc99))
+#loc200 = loc("tmp3_m2"(#loc128))
+#loc202 = loc("new_weight"(#loc176))
+#loc203 = loc("new_m2"(#loc177))
+#loc204 = loc("new_weight"(#loc179))
+#loc205 = loc("new_mean"(#loc181))
+#loc206 = loc("new_m2"(#loc184))
+#loc207 = loc("tmp3_weight"(#loc200))
diff --git a/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..fc93054c228943ac0bd8dec67fa9da45f7c48945
--- /dev/null
+++ b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir
@@ -0,0 +1,179 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":18:0)
+#loc1 = loc(unknown)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":47:79)
+#loc49 = loc("in_ptr0"(#loc))
+#loc50 = loc("in_ptr1"(#loc))
+#loc51 = loc("in_ptr2"(#loc))
+#loc52 = loc("out_ptr2"(#loc))
+#loc53 = loc("xnumel"(#loc))
+#loc54 = loc("r0_numel"(#loc))
+#loc68 = loc(callsite(#loc1 at #loc15))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4096> : tensor<1x4096xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xbf16, #blocked> loc(#loc1)
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
+    %c2048_i32 = arith.constant 2048 : i32 loc(#loc1)
+    %cst_1 = arith.constant 0.000000e+00 : f32 loc(#loc1)
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<4.096000e+03> : tensor<1x1xf32, #blocked> loc(#loc1)
+    %cst_5 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc55)
+    %xmask = arith.cmpi slt, %xoffset, %c2048_i32 : i32 loc(#loc56)
+    %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc57)
+    %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4096xi32, #blocked> loc(#loc57)
+    %r0_mask = arith.cmpi slt, %r0_base_6, %cst : tensor<1x4096xi32, #blocked> loc(#loc58)
+    %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc59)
+    %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x4096xi32, #blocked> loc(#loc92)
+    %tmp0_8 = arith.addi %r0_base_6, %tmp0_7 : tensor<1x4096xi32, #blocked> loc(#loc60)
+    %tmp0_9 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc61)
+    %tmp0_10 = tt.addptr %tmp0_9, %tmp0_8 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc61)
+    %tmp0_11 = tt.splat %xmask : i1 -> tensor<1x4096xi1, #blocked> loc(#loc93)
+    %tmp0_12 = arith.andi %r0_mask, %tmp0_11 : tensor<1x4096xi1, #blocked> loc(#loc62)
+    %tmp0_13 = tt.load %tmp0_10, %tmp0_12, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc63)
+    %tmp0_14 = arith.extf %tmp0_13 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc64)
+    %tmp3_mean = arith.select %tmp0_12, %tmp0_14, %cst_2 : tensor<1x4096xi1, #blocked>, tensor<1x4096xf32, #blocked> loc(#loc65)
+    %tmp3_weight = arith.select %tmp0_12, %cst_5, %cst_2 : tensor<1x4096xi1, #blocked>, tensor<1x4096xf32, #blocked> loc(#loc66)
+    %0:3 = "tt.reduce"(%tmp3_mean, %cst_2, %tmp3_weight) <{axis = 1 : i32}> ({
+    ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc15)), %arg7: f32 loc(callsite(#loc1 at #loc15)), %arg8: f32 loc(callsite(#loc1 at #loc15)), %arg9: f32 loc(callsite(#loc1 at #loc15)), %arg10: f32 loc(callsite(#loc1 at #loc15)), %arg11: f32 loc(callsite(#loc1 at #loc15))):
+      %delta = arith.subf %arg9, %arg6 : f32 loc(#loc94)
+      %new_weight = arith.addf %arg8, %arg11 : f32 loc(#loc95)
+      %w2_over_w = arith.cmpf oeq, %new_weight, %cst_1 : f32 loc(#loc96)
+      %w2_over_w_24 = arith.divf %arg11, %new_weight : f32 loc(#loc97)
+      %w2_over_w_25 = arith.select %w2_over_w, %cst_1, %w2_over_w_24 : f32 loc(#loc98)
+      %4 = arith.mulf %delta, %w2_over_w_25 : f32 loc(#loc99)
+      %5 = arith.addf %arg6, %4 : f32 loc(#loc100)
+      %6 = arith.addf %arg7, %arg10 : f32 loc(#loc101)
+      %7 = arith.mulf %delta, %delta : f32 loc(#loc102)
+      %8 = arith.mulf %7, %arg8 : f32 loc(#loc103)
+      %9 = arith.mulf %8, %w2_over_w_25 : f32 loc(#loc104)
+      %10 = arith.addf %6, %9 : f32 loc(#loc105)
+      tt.reduce.return %5, %10, %new_weight : f32, f32, f32 loc(#loc67)
+    }) : (tensor<1x4096xf32, #blocked>, tensor<1x4096xf32, #blocked>, tensor<1x4096xf32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc67)
+    %tmp3 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc74)
+    %tmp7 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc75)
+    %tmp9 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc76)
+    %tmp9_15 = tt.addptr %tmp9, %r0_base_6 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc76)
+    %tmp9_16 = tt.load %tmp9_15, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc77)
+    %tmp9_17 = arith.extf %tmp9_16 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc78)
+    %tmp12 = tt.load %tmp0_10, %tmp0_12, %cst_0 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc79)
+    %tmp12_18 = arith.extf %tmp12 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc80)
+    %tmp23 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc81)
+    %tmp23_19 = tt.addptr %tmp23, %r0_base_6 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc81)
+    %tmp23_20 = tt.load %tmp23_19, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc82)
+    %tmp23_21 = arith.extf %tmp23_20 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc83)
+    %tmp11 = arith.addf %tmp9_17, %cst_5 : tensor<1x4096xf32, #blocked> loc(#loc84)
+    %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32, #blocked> -> tensor<1x4096xf32, #blocked> loc(#loc85)
+    %tmp14_22 = arith.subf %tmp12_18, %tmp14 : tensor<1x4096xf32, #blocked> loc(#loc85)
+    %tmp16 = arith.divf %tmp7, %cst_4 : tensor<1x1xf32, #blocked> loc(#loc86)
+    %tmp18 = arith.addf %tmp16, %cst_3 : tensor<1x1xf32, #blocked> loc(#loc87)
+    %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked>) -> tensor<1x1xf32, #blocked> loc(#loc88)
+    %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32, #blocked> -> tensor<1x4096xf32, #blocked> loc(#loc89)
+    %tmp20_23 = arith.mulf %tmp14_22, %tmp20 : tensor<1x4096xf32, #blocked> loc(#loc89)
+    %tmp22 = arith.mulf %tmp11, %tmp20_23 : tensor<1x4096xf32, #blocked> loc(#loc90)
+    %tmp24 = arith.addf %tmp22, %tmp23_21 : tensor<1x4096xf32, #blocked> loc(#loc91)
+    %1 = tt.splat %out_ptr2 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc46)
+    %2 = tt.addptr %1, %tmp0_8 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc46)
+    %3 = arith.truncf %tmp24 : tensor<1x4096xf32, #blocked> to tensor<1x4096xbf16, #blocked> loc(#loc47)
+    tt.store %2, %3, %tmp0_12 : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc47)
+    tt.return loc(#loc48)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":25:21)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":26:37)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":34:29)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:46)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:41)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:34)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:61)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:51)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:112)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":44:62)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":46:66)
+#loc14 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc16 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc17 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc18 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc19 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc21 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc23 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":48:16)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":49:16)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:34)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:41)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:94)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:52)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:114)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:35)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:42)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:95)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":61:23)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":63:24)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":65:24)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":67:24)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":68:32)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":69:24)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":71:24)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":72:24)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:29)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:53)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":51:4)
+#loc55 = loc("xoffset"(#loc2))
+#loc56 = loc("xmask"(#loc3))
+#loc57 = loc("r0_base"(#loc4))
+#loc58 = loc("r0_mask"(#loc5))
+#loc59 = loc("tmp0"(#loc6))
+#loc60 = loc("tmp0"(#loc7))
+#loc61 = loc("tmp0"(#loc8))
+#loc62 = loc("tmp0"(#loc9))
+#loc63 = loc("tmp0"(#loc10))
+#loc64 = loc("tmp0"(#loc11))
+#loc65 = loc("tmp3_mean"(#loc12))
+#loc66 = loc("tmp3_weight"(#loc13))
+#loc67 = loc(callsite(#loc14 at #loc15))
+#loc69 = loc("delta"(#loc16))
+#loc70 = loc("new_weight"(#loc17))
+#loc71 = loc("w2_over_w"(#loc18))
+#loc72 = loc("w2_over_w"(#loc19))
+#loc73 = loc("w2_over_w"(#loc20))
+#loc74 = loc("tmp3"(#loc28))
+#loc75 = loc("tmp7"(#loc29))
+#loc76 = loc("tmp9"(#loc30))
+#loc77 = loc("tmp9"(#loc31))
+#loc78 = loc("tmp9"(#loc32))
+#loc79 = loc("tmp12"(#loc33))
+#loc80 = loc("tmp12"(#loc34))
+#loc81 = loc("tmp23"(#loc35))
+#loc82 = loc("tmp23"(#loc36))
+#loc83 = loc("tmp23"(#loc37))
+#loc84 = loc("tmp11"(#loc38))
+#loc85 = loc("tmp14"(#loc39))
+#loc86 = loc("tmp16"(#loc40))
+#loc87 = loc("tmp18"(#loc41))
+#loc88 = loc("tmp19"(#loc42))
+#loc89 = loc("tmp20"(#loc43))
+#loc90 = loc("tmp22"(#loc44))
+#loc91 = loc("tmp24"(#loc45))
+#loc92 = loc(fused[#loc60, #loc59])
+#loc93 = loc(fused[#loc62, #loc56])
+#loc94 = loc(callsite(#loc69 at #loc67))
+#loc95 = loc(callsite(#loc70 at #loc67))
+#loc96 = loc(callsite(#loc71 at #loc67))
+#loc97 = loc(callsite(#loc72 at #loc67))
+#loc98 = loc(callsite(#loc73 at #loc67))
+#loc99 = loc(callsite(#loc21 at #loc67))
+#loc100 = loc(callsite(#loc22 at #loc67))
+#loc101 = loc(callsite(#loc23 at #loc67))
+#loc102 = loc(callsite(#loc24 at #loc67))
+#loc103 = loc(callsite(#loc25 at #loc67))
+#loc104 = loc(callsite(#loc26 at #loc67))
+#loc105 = loc(callsite(#loc27 at #loc67))
diff --git a/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.ttir b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..116200f04887ccf75098d3097f252bdbbc2ea0e5
--- /dev/null
+++ b/triton/MRY6R44HLP4JU6GX5XHYUUNGQ6LLBRKAPA6ZNJPRYYOSCCZPWAPQ/triton_red_fused_add_mul_native_layer_norm_0.ttir
@@ -0,0 +1,180 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":18:0)
+#loc1 = loc(unknown)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":47:79)
+#loc50 = loc("in_ptr0"(#loc))
+#loc51 = loc("in_ptr1"(#loc))
+#loc52 = loc("in_ptr2"(#loc))
+#loc53 = loc("out_ptr2"(#loc))
+#loc54 = loc("xnumel"(#loc))
+#loc55 = loc("r0_numel"(#loc))
+#loc57 = loc(callsite(#loc1 at #loc3))
+module {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
+    %xmask = arith.constant 2048 : i32 loc(#loc56)
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc57)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc1)
+    %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x4096xbf16> loc(#loc1)
+    %cst_2 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc1)
+    %cst_3 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc1)
+    %cst_5 = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc58)
+    %xmask_6 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc56)
+    %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32> loc(#loc59)
+    %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32> -> tensor<1x4096xi32> loc(#loc60)
+    %r0_mask = arith.cmpi slt, %r0_base_7, %cst_5 : tensor<1x4096xi32> loc(#loc61)
+    %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc62)
+    %tmp0_8 = tt.splat %tmp0 : i32 -> tensor<1x4096xi32> loc(#loc94)
+    %tmp0_9 = arith.addi %r0_base_7, %tmp0_8 : tensor<1x4096xi32> loc(#loc63)
+    %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc64)
+    %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc64)
+    %tmp0_12 = tt.splat %xmask_6 : i1 -> tensor<1x4096xi1> loc(#loc95)
+    %tmp0_13 = arith.andi %r0_mask, %tmp0_12 : tensor<1x4096xi1> loc(#loc65)
+    %tmp0_14 = tt.load %tmp0_11, %tmp0_13, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc66)
+    %tmp0_15 = arith.extf %tmp0_14 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc67)
+    %tmp3_mean = arith.select %tmp0_13, %tmp0_15, %cst_0 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc68)
+    %tmp3_weight = arith.select %tmp0_13, %cst_4, %cst_0 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc69)
+    %0:3 = "tt.reduce"(%tmp3_mean, %cst_0, %tmp3_weight) <{axis = 1 : i32}> ({
+    ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc3)), %arg7: f32 loc(callsite(#loc1 at #loc3)), %arg8: f32 loc(callsite(#loc1 at #loc3)), %arg9: f32 loc(callsite(#loc1 at #loc3)), %arg10: f32 loc(callsite(#loc1 at #loc3)), %arg11: f32 loc(callsite(#loc1 at #loc3))):
+      %delta = arith.subf %arg9, %arg6 : f32 loc(#loc96)
+      %new_weight = arith.addf %arg8, %arg11 : f32 loc(#loc97)
+      %w2_over_w = arith.cmpf oeq, %new_weight, %cst : f32 loc(#loc98)
+      %w2_over_w_25 = arith.divf %arg11, %new_weight : f32 loc(#loc99)
+      %w2_over_w_26 = arith.select %w2_over_w, %cst, %w2_over_w_25 : f32 loc(#loc100)
+      %4 = arith.mulf %delta, %w2_over_w_26 : f32 loc(#loc101)
+      %5 = arith.addf %arg6, %4 : f32 loc(#loc102)
+      %6 = arith.addf %arg7, %arg10 : f32 loc(#loc103)
+      %7 = arith.mulf %delta, %delta : f32 loc(#loc104)
+      %8 = arith.mulf %7, %arg8 : f32 loc(#loc105)
+      %9 = arith.mulf %8, %w2_over_w_26 : f32 loc(#loc106)
+      %10 = arith.addf %6, %9 : f32 loc(#loc107)
+      tt.reduce.return %5, %10, %new_weight : f32, f32, f32 loc(#loc70)
+    }) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc70)
+    %tmp3 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc76)
+    %tmp7 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc77)
+    %tmp9 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc78)
+    %tmp9_16 = tt.addptr %tmp9, %r0_base_7 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc78)
+    %tmp9_17 = tt.load %tmp9_16, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc79)
+    %tmp9_18 = arith.extf %tmp9_17 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc80)
+    %tmp12 = tt.load %tmp0_11, %tmp0_13, %cst_1 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>> loc(#loc81)
+    %tmp12_19 = arith.extf %tmp12 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc82)
+    %tmp23 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc83)
+    %tmp23_20 = tt.addptr %tmp23, %r0_base_7 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc83)
+    %tmp23_21 = tt.load %tmp23_20, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc84)
+    %tmp23_22 = arith.extf %tmp23_21 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc85)
+    %tmp11 = arith.addf %tmp9_18, %cst_4 : tensor<1x4096xf32> loc(#loc86)
+    %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc87)
+    %tmp14_23 = arith.subf %tmp12_19, %tmp14 : tensor<1x4096xf32> loc(#loc87)
+    %tmp16 = arith.divf %tmp7, %cst_3 : tensor<1x1xf32> loc(#loc88)
+    %tmp18 = arith.addf %tmp16, %cst_2 : tensor<1x1xf32> loc(#loc89)
+    %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc90)
+    %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc91)
+    %tmp20_24 = arith.mulf %tmp14_23, %tmp20 : tensor<1x4096xf32> loc(#loc91)
+    %tmp22 = arith.mulf %tmp11, %tmp20_24 : tensor<1x4096xf32> loc(#loc92)
+    %tmp24 = arith.addf %tmp22, %tmp23_22 : tensor<1x4096xf32> loc(#loc93)
+    %1 = tt.splat %out_ptr2 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc47)
+    %2 = tt.addptr %1, %tmp0_9 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc47)
+    %3 = arith.truncf %tmp24 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc48)
+    tt.store %2, %3, %tmp0_13 : tensor<1x4096x!tt.ptr<bf16>> loc(#loc48)
+    tt.return loc(#loc49)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":25:21)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":23:28)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":26:27)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":34:29)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:41)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:34)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:61)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:51)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":38:112)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":44:62)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":46:66)
+#loc16 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc17 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc18 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc19 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc21 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc23 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc28 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":48:16)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":49:16)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:34)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:41)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":57:94)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:52)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":58:114)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:35)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:42)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":59:95)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":61:23)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":63:24)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":65:24)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":67:24)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":68:32)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":69:24)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":71:24)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":72:24)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:29)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":73:53)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/ww/cwwizzjwmd4ajlubxpvxidjiy3ldv5eflwludgizcahvsp4i75s2.py":51:4)
+#loc56 = loc("xmask"(#loc2))
+#loc58 = loc("xoffset"(#loc4))
+#loc59 = loc("r0_base"(#loc5))
+#loc60 = loc("r0_base"(#loc6))
+#loc61 = loc("r0_mask"(#loc7))
+#loc62 = loc("tmp0"(#loc8))
+#loc63 = loc("tmp0"(#loc9))
+#loc64 = loc("tmp0"(#loc10))
+#loc65 = loc("tmp0"(#loc11))
+#loc66 = loc("tmp0"(#loc12))
+#loc67 = loc("tmp0"(#loc13))
+#loc68 = loc("tmp3_mean"(#loc14))
+#loc69 = loc("tmp3_weight"(#loc15))
+#loc70 = loc(callsite(#loc16 at #loc3))
+#loc71 = loc("delta"(#loc17))
+#loc72 = loc("new_weight"(#loc18))
+#loc73 = loc("w2_over_w"(#loc19))
+#loc74 = loc("w2_over_w"(#loc20))
+#loc75 = loc("w2_over_w"(#loc21))
+#loc76 = loc("tmp3"(#loc29))
+#loc77 = loc("tmp7"(#loc30))
+#loc78 = loc("tmp9"(#loc31))
+#loc79 = loc("tmp9"(#loc32))
+#loc80 = loc("tmp9"(#loc33))
+#loc81 = loc("tmp12"(#loc34))
+#loc82 = loc("tmp12"(#loc35))
+#loc83 = loc("tmp23"(#loc36))
+#loc84 = loc("tmp23"(#loc37))
+#loc85 = loc("tmp23"(#loc38))
+#loc86 = loc("tmp11"(#loc39))
+#loc87 = loc("tmp14"(#loc40))
+#loc88 = loc("tmp16"(#loc41))
+#loc89 = loc("tmp18"(#loc42))
+#loc90 = loc("tmp19"(#loc43))
+#loc91 = loc("tmp20"(#loc44))
+#loc92 = loc("tmp22"(#loc45))
+#loc93 = loc("tmp24"(#loc46))
+#loc94 = loc(fused[#loc63, #loc62])
+#loc95 = loc(fused[#loc65, #loc56])
+#loc96 = loc(callsite(#loc71 at #loc70))
+#loc97 = loc(callsite(#loc72 at #loc70))
+#loc98 = loc(callsite(#loc73 at #loc70))
+#loc99 = loc(callsite(#loc74 at #loc70))
+#loc100 = loc(callsite(#loc75 at #loc70))
+#loc101 = loc(callsite(#loc22 at #loc70))
+#loc102 = loc(callsite(#loc23 at #loc70))
+#loc103 = loc(callsite(#loc24 at #loc70))
+#loc104 = loc(callsite(#loc25 at #loc70))
+#loc105 = loc(callsite(#loc26 at #loc70))
+#loc106 = loc(callsite(#loc27 at #loc70))
+#loc107 = loc(callsite(#loc28 at #loc70))
diff --git a/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/__grp__triton_poi_fused_add_mul_0.json b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/__grp__triton_poi_fused_add_mul_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..a8335465cb2ce94c109ae0e4260cd48a229a95bb
--- /dev/null
+++ b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/__grp__triton_poi_fused_add_mul_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused_add_mul_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.source", "triton_poi_fused_add_mul_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.ttir", "triton_poi_fused_add_mul_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.ttgir", "triton_poi_fused_add_mul_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.llir", "triton_poi_fused_add_mul_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.ptx", "triton_poi_fused_add_mul_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.cubin", "triton_poi_fused_add_mul_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.json"}}
\ No newline at end of file
diff --git a/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.cubin b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..8f551c4582f5275c698f20ca806539089233795d
Binary files /dev/null and b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.cubin differ
diff --git a/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.json b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..39dc7b1046387eaab01748aafa5a3a7c2a523067
--- /dev/null
+++ b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.json
@@ -0,0 +1 @@
+{"hash": "6b57635de194e2a76433f0e27ed2e1fb661a4b5321d0763577e3e7d1de685256", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_add_mul_0"}
\ No newline at end of file
diff --git a/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.llir b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..060110a98452b589ca594dd7166488cd264d2461
--- /dev/null
+++ b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.llir
@@ -0,0 +1,76 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused_add_mul_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 {
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %9 = shl i32 %8, 9, !dbg !8
+  %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %11 = shl nuw nsw i32 %10, 1, !dbg !9
+  %12 = and i32 %11, 510, !dbg !9
+  %13 = or disjoint i32 %12, %9, !dbg !10
+  %14 = srem i32 %13, 4096, !dbg !11
+  %15 = sext i32 %13 to i64, !dbg !12
+  %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !12
+  %17 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %16) #2, !dbg !13
+  %18 = bitcast i32 %17 to <2 x bfloat>, !dbg !13
+  %19 = sext i32 %14 to i64, !dbg !14
+  %20 = getelementptr bfloat, ptr addrspace(1) %1, i64 %19, !dbg !14
+  %21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !15
+  %22 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l"(ptr addrspace(1) %20, i64 %21) #2, !dbg !15
+  %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !15
+  %24 = getelementptr bfloat, ptr addrspace(1) %2, i64 %15, !dbg !16
+  %25 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %24) #2, !dbg !17
+  %26 = bitcast i32 %25 to <2 x bfloat>, !dbg !17
+  %27 = getelementptr bfloat, ptr addrspace(1) %3, i64 %15, !dbg !18
+  %28 = fpext <2 x bfloat> %18 to <2 x float>, !dbg !19
+  %29 = fpext <2 x bfloat> %23 to <2 x float>, !dbg !20
+  %30 = fpext <2 x bfloat> %26 to <2 x float>, !dbg !21
+  %31 = fmul <2 x float> %29, %30, !dbg !22
+  %32 = fadd <2 x float> %31, %28, !dbg !23
+  %33 = fptrunc <2 x float> %32 to <2 x bfloat>, !dbg !24
+  %34 = bitcast <2 x bfloat> %33 to i32, !dbg !24
+  tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %34, ptr addrspace(1) %27) #2, !dbg !24
+  ret void, !dbg !25
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+attributes #0 = { nounwind "nvvm.reqntid"="256" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused_add_mul_0", linkageName: "triton_poi_fused_add_mul_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 20, column: 28, scope: !4)
+!8 = !DILocation(line: 20, column: 33, scope: !4)
+!9 = !DILocation(line: 21, column: 36, scope: !4)
+!10 = !DILocation(line: 21, column: 23, scope: !4)
+!11 = !DILocation(line: 24, column: 19, scope: !4)
+!12 = !DILocation(line: 25, column: 30, scope: !4)
+!13 = !DILocation(line: 25, column: 35, scope: !4)
+!14 = !DILocation(line: 26, column: 30, scope: !4)
+!15 = !DILocation(line: 26, column: 35, scope: !4)
+!16 = !DILocation(line: 27, column: 30, scope: !4)
+!17 = !DILocation(line: 27, column: 35, scope: !4)
+!18 = !DILocation(line: 30, column: 25, scope: !4)
+!19 = !DILocation(line: 25, column: 44, scope: !4)
+!20 = !DILocation(line: 26, column: 74, scope: !4)
+!21 = !DILocation(line: 27, column: 44, scope: !4)
+!22 = !DILocation(line: 28, column: 18, scope: !4)
+!23 = !DILocation(line: 29, column: 18, scope: !4)
+!24 = !DILocation(line: 30, column: 36, scope: !4)
+!25 = !DILocation(line: 30, column: 4, scope: !4)
diff --git a/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.ptx b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..df4549b1249f2be8c811689b87560410aeaa6c14
--- /dev/null
+++ b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.ptx
@@ -0,0 +1,347 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused_add_mul_0 // -- Begin function triton_poi_fused_add_mul_0
+                                        // @triton_poi_fused_add_mul_0
+.visible .entry triton_poi_fused_add_mul_0(
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_1,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_3,
+	.param .u32 triton_poi_fused_add_mul_0_param_4,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_5,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_6
+)
+.reqntid 256
+{
+	.reg .b16 	%rs<7>;
+	.reg .b32 	%r<24>;
+	.reg .b64 	%rd<11>;
+	.loc	1 18 0                          // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd6, [triton_poi_fused_add_mul_0_param_0];
+	ld.param.b64 	%rd7, [triton_poi_fused_add_mul_0_param_1];
+$L__tmp0:
+	.loc	1 20 28                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:20:28
+	mov.u32 	%r5, %ctaid.x;
+	.loc	1 20 33                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:20:33
+	shl.b32 	%r6, %r5, 9;
+	ld.param.b64 	%rd8, [triton_poi_fused_add_mul_0_param_2];
+	ld.param.b64 	%rd9, [triton_poi_fused_add_mul_0_param_3];
+	.loc	1 21 36                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:21:36
+	mov.u32 	%r7, %tid.x;
+	shl.b32 	%r8, %r7, 1;
+	and.b32 	%r9, %r8, 510;
+	.loc	1 21 23                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:21:23
+	or.b32 	%r10, %r9, %r6;
+	.loc	1 24 19                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:24:19
+	bfe.s32 	%r11, %r5, 22, 1;
+	shr.u32 	%r12, %r11, 20;
+	add.s32 	%r13, %r10, %r12;
+	and.b32 	%r14, %r13, -4096;
+	sub.s32 	%r15, %r10, %r14;
+	.loc	1 25 30                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:25:30
+	mul.wide.s32 	%rd10, %r10, 2;
+	add.s64 	%rd1, %rd6, %rd10;
+	.loc	1 25 35                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:25:35
+	// begin inline asm
+	mov.u32 %r1, 0x0;
+	ld.global.b32 { %r1 }, [ %rd1 + 0 ];
+	// end inline asm
+	.loc	1 26 30                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:26:30
+	mad.wide.s32 	%rd2, %r15, 2, %rd7;
+	.loc	1 26 35                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:26:35
+	// begin inline asm
+	mov.u64 %rd3, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd3, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r2, 0x0;
+	ld.global.L1::evict_last.L2::cache_hint.b32 { %r2 }, [ %rd2 + 0 ], %rd3;
+	// end inline asm
+	.loc	1 27 30                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:27:30
+	add.s64 	%rd4, %rd8, %rd10;
+	.loc	1 27 35                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:27:35
+	// begin inline asm
+	mov.u32 %r3, 0x0;
+	ld.global.b32 { %r3 }, [ %rd4 + 0 ];
+	// end inline asm
+	.loc	1 30 25                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:30:25
+	add.s64 	%rd5, %rd9, %rd10;
+	.loc	1 25 44                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:25:44
+	mov.b32 	{%rs1, %rs2}, %r1;
+	cvt.f32.bf16 	%r16, %rs2;
+	cvt.f32.bf16 	%r17, %rs1;
+	.loc	1 26 74                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:26:74
+	mov.b32 	{%rs3, %rs4}, %r2;
+	cvt.f32.bf16 	%r18, %rs4;
+	cvt.f32.bf16 	%r19, %rs3;
+	.loc	1 27 44                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:27:44
+	mov.b32 	{%rs5, %rs6}, %r3;
+	cvt.f32.bf16 	%r20, %rs6;
+	cvt.f32.bf16 	%r21, %rs5;
+	.loc	1 29 18                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:29:18
+	fma.rn.f32 	%r22, %r19, %r21, %r17;
+	fma.rn.f32 	%r23, %r18, %r20, %r16;
+	.loc	1 30 36                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:30:36
+	cvt.rn.bf16x2.f32 	%r4, %r23, %r22;
+	// begin inline asm
+	st.global.b32 [ %rd5 + 0 ], { %r4 };
+	// end inline asm
+	.loc	1 30 4                          // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:30:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 54
+.b8 107
+.b8 97
+.b8 116
+.b8 53
+.b8 103
+.b8 55
+.b8 110
+.b8 51
+.b8 117
+.b8 117
+.b8 107
+.b8 107
+.b8 102
+.b8 119
+.b8 103
+.b8 100
+.b8 120
+.b8 102
+.b8 119
+.b8 116
+.b8 109
+.b8 120
+.b8 98
+.b8 108
+.b8 99
+.b8 109
+.b8 113
+.b8 122
+.b8 104
+.b8 98
+.b8 105
+.b8 102
+.b8 111
+.b8 53
+.b8 103
+.b8 51
+.b8 114
+.b8 98
+.b8 97
+.b8 122
+.b8 51
+.b8 100
+.b8 106
+.b8 120
+.b8 105
+.b8 105
+.b8 51
+.b8 53
+.b8 103
+.b8 105
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 54
+.b8 107
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.source b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..83ec1aaf83aa222188a01634f22622994cf15787
--- /dev/null
+++ b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.source
@@ -0,0 +1,82 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":18:0)
+#loc22 = loc("in_ptr0"(#loc))
+#loc23 = loc("in_ptr1"(#loc))
+#loc24 = loc("in_ptr2"(#loc))
+#loc25 = loc("out_ptr0"(#loc))
+#loc26 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_add_mul_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 1048576 : i32 loc(#loc27)
+    %xoffset = tt.get_program_id x : i32 loc(#loc28)
+    %xoffset_1 = arith.constant 512 : i32 loc(#loc29)
+    %xoffset_2 = arith.constant 512 : i32 loc(#loc29)
+    %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc29)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc30)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc31)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc31)
+    %xmask = arith.constant true loc(#loc32)
+    %xmask_6 = arith.constant dense<true> : tensor<512xi1> loc(#loc32)
+    %x0 = arith.constant 4096 : i32 loc(#loc33)
+    %x0_7 = arith.constant 4096 : i32 loc(#loc33)
+    %x0_8 = arith.constant dense<4096> : tensor<512xi32> loc(#loc33)
+    %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<512xi32> loc(#loc33)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc34)
+    %tmp0_10 = tt.addptr %tmp0, %xindex_5 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc34)
+    %tmp0_11 = tt.load %tmp0_10 : tensor<512x!tt.ptr<bf16>> loc(#loc35)
+    %tmp0_12 = arith.extf %tmp0_11 : tensor<512xbf16> to tensor<512xf32> loc(#loc36)
+    %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc37)
+    %tmp1_13 = tt.addptr %tmp1, %x0_9 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc37)
+    %tmp1_14 = tt.load %tmp1_13 evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>> loc(#loc38)
+    %tmp1_15 = arith.extf %tmp1_14 : tensor<512xbf16> to tensor<512xf32> loc(#loc39)
+    %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc40)
+    %tmp2_16 = tt.addptr %tmp2, %xindex_5 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc40)
+    %tmp2_17 = tt.load %tmp2_16 : tensor<512x!tt.ptr<bf16>> loc(#loc41)
+    %tmp2_18 = arith.extf %tmp2_17 : tensor<512xbf16> to tensor<512xf32> loc(#loc42)
+    %tmp3 = arith.mulf %tmp1_15, %tmp2_18 : tensor<512xf32> loc(#loc43)
+    %tmp4 = arith.addf %tmp0_12, %tmp3 : tensor<512xf32> loc(#loc44)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc19)
+    %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc19)
+    %2 = arith.truncf %tmp4 : tensor<512xf32> to tensor<512xbf16> loc(#loc20)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>> loc(#loc20)
+    tt.return loc(#loc21)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":22:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":24:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:30)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:35)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:44)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:30)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:35)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:74)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:35)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:44)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":28:18)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":29:18)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:25)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:36)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:4)
+#loc27 = loc("xnumel"(#loc1))
+#loc28 = loc("xoffset"(#loc2))
+#loc29 = loc("xoffset"(#loc3))
+#loc30 = loc("xindex"(#loc4))
+#loc31 = loc("xindex"(#loc5))
+#loc32 = loc("xmask"(#loc6))
+#loc33 = loc("x0"(#loc7))
+#loc34 = loc("tmp0"(#loc8))
+#loc35 = loc("tmp0"(#loc9))
+#loc36 = loc("tmp0"(#loc10))
+#loc37 = loc("tmp1"(#loc11))
+#loc38 = loc("tmp1"(#loc12))
+#loc39 = loc("tmp1"(#loc13))
+#loc40 = loc("tmp2"(#loc14))
+#loc41 = loc("tmp2"(#loc15))
+#loc42 = loc("tmp2"(#loc16))
+#loc43 = loc("tmp3"(#loc17))
+#loc44 = loc("tmp4"(#loc18))
diff --git a/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.ttgir b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..7c77cf8c43c9fbdf5d6c39b01549c5cde98b7982
--- /dev/null
+++ b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.ttgir
@@ -0,0 +1,74 @@
+#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":18:0)
+#loc21 = loc("in_ptr0"(#loc))
+#loc22 = loc("in_ptr1"(#loc))
+#loc23 = loc("in_ptr2"(#loc))
+#loc24 = loc("out_ptr0"(#loc))
+#loc25 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused_add_mul_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4096> : tensor<512xi32, #blocked> loc(#loc1)
+    %c512_i32 = arith.constant 512 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc26)
+    %xoffset_0 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc27)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc28)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<512xi32, #blocked> loc(#loc29)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<512xi32, #blocked> loc(#loc29)
+    %x0 = arith.remsi %xindex_2, %cst : tensor<512xi32, #blocked> loc(#loc30)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc31)
+    %tmp0_3 = tt.addptr %tmp0, %xindex_2 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc31)
+    %tmp0_4 = tt.load %tmp0_3 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc32)
+    %tmp0_5 = arith.extf %tmp0_4 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc33)
+    %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc34)
+    %tmp1_6 = tt.addptr %tmp1, %x0 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc34)
+    %tmp1_7 = tt.load %tmp1_6 evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc35)
+    %tmp1_8 = arith.extf %tmp1_7 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc36)
+    %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc37)
+    %tmp2_9 = tt.addptr %tmp2, %xindex_2 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc37)
+    %tmp2_10 = tt.load %tmp2_9 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc38)
+    %tmp2_11 = arith.extf %tmp2_10 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc39)
+    %tmp3 = arith.mulf %tmp1_8, %tmp2_11 : tensor<512xf32, #blocked> loc(#loc40)
+    %tmp4 = arith.addf %tmp0_5, %tmp3 : tensor<512xf32, #blocked> loc(#loc41)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc18)
+    %1 = tt.addptr %0, %xindex_2 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc18)
+    %2 = arith.truncf %tmp4 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> loc(#loc19)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc19)
+    tt.return loc(#loc20)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":24:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:30)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:35)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:44)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:30)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:74)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:30)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:35)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:44)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":28:18)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":29:18)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:25)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:36)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:4)
+#loc26 = loc("xoffset"(#loc2))
+#loc27 = loc("xoffset"(#loc3))
+#loc28 = loc("xindex"(#loc4))
+#loc29 = loc("xindex"(#loc5))
+#loc30 = loc("x0"(#loc6))
+#loc31 = loc("tmp0"(#loc7))
+#loc32 = loc("tmp0"(#loc8))
+#loc33 = loc("tmp0"(#loc9))
+#loc34 = loc("tmp1"(#loc10))
+#loc35 = loc("tmp1"(#loc11))
+#loc36 = loc("tmp1"(#loc12))
+#loc37 = loc("tmp2"(#loc13))
+#loc38 = loc("tmp2"(#loc14))
+#loc39 = loc("tmp2"(#loc15))
+#loc40 = loc("tmp3"(#loc16))
+#loc41 = loc("tmp4"(#loc17))
diff --git a/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.ttir b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..b13493bfbfcfabd04700d2852f8f814510f685f1
--- /dev/null
+++ b/triton/NNLWGXPBSTRKOZBT6DRH5UXB7NTBUS2TEHIHMNLX4PT5DXTIKJLA/triton_poi_fused_add_mul_0.ttir
@@ -0,0 +1,73 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":18:0)
+#loc21 = loc("in_ptr0"(#loc))
+#loc22 = loc("in_ptr1"(#loc))
+#loc23 = loc("in_ptr2"(#loc))
+#loc24 = loc("out_ptr0"(#loc))
+#loc25 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_add_mul_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %x0 = arith.constant dense<4096> : tensor<512xi32> loc(#loc26)
+    %c512_i32 = arith.constant 512 : i32 loc(#loc2)
+    %xoffset = tt.get_program_id x : i32 loc(#loc27)
+    %xoffset_0 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc28)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc29)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<512xi32> loc(#loc30)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<512xi32> loc(#loc30)
+    %x0_3 = arith.remsi %xindex_2, %x0 : tensor<512xi32> loc(#loc26)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc31)
+    %tmp0_4 = tt.addptr %tmp0, %xindex_2 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc31)
+    %tmp0_5 = tt.load %tmp0_4 : tensor<512x!tt.ptr<bf16>> loc(#loc32)
+    %tmp0_6 = arith.extf %tmp0_5 : tensor<512xbf16> to tensor<512xf32> loc(#loc33)
+    %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc34)
+    %tmp1_7 = tt.addptr %tmp1, %x0_3 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc34)
+    %tmp1_8 = tt.load %tmp1_7 evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>> loc(#loc35)
+    %tmp1_9 = arith.extf %tmp1_8 : tensor<512xbf16> to tensor<512xf32> loc(#loc36)
+    %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc37)
+    %tmp2_10 = tt.addptr %tmp2, %xindex_2 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc37)
+    %tmp2_11 = tt.load %tmp2_10 : tensor<512x!tt.ptr<bf16>> loc(#loc38)
+    %tmp2_12 = arith.extf %tmp2_11 : tensor<512xbf16> to tensor<512xf32> loc(#loc39)
+    %tmp3 = arith.mulf %tmp1_9, %tmp2_12 : tensor<512xf32> loc(#loc40)
+    %tmp4 = arith.addf %tmp0_6, %tmp3 : tensor<512xf32> loc(#loc41)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc18)
+    %1 = tt.addptr %0, %xindex_2 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc18)
+    %2 = arith.truncf %tmp4 : tensor<512xf32> to tensor<512xbf16> loc(#loc19)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>> loc(#loc19)
+    tt.return loc(#loc20)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":24:19)
+#loc2 = loc(unknown)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":20:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":20:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":21:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":21:23)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:30)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:35)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:44)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:30)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:74)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:30)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:35)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:44)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":28:18)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":29:18)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:25)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:36)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:4)
+#loc26 = loc("x0"(#loc1))
+#loc27 = loc("xoffset"(#loc3))
+#loc28 = loc("xoffset"(#loc4))
+#loc29 = loc("xindex"(#loc5))
+#loc30 = loc("xindex"(#loc6))
+#loc31 = loc("tmp0"(#loc7))
+#loc32 = loc("tmp0"(#loc8))
+#loc33 = loc("tmp0"(#loc9))
+#loc34 = loc("tmp1"(#loc10))
+#loc35 = loc("tmp1"(#loc11))
+#loc36 = loc("tmp1"(#loc12))
+#loc37 = loc("tmp2"(#loc13))
+#loc38 = loc("tmp2"(#loc14))
+#loc39 = loc("tmp2"(#loc15))
+#loc40 = loc("tmp3"(#loc16))
+#loc41 = loc("tmp4"(#loc17))
diff --git a/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..f9dc9eacbb50731836276a9b4751eb3a04d9dc03
--- /dev/null
+++ b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused_add_mul_native_layer_norm_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.source", "triton_red_fused_add_mul_native_layer_norm_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.ttir", "triton_red_fused_add_mul_native_layer_norm_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir", "triton_red_fused_add_mul_native_layer_norm_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.llir", "triton_red_fused_add_mul_native_layer_norm_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.ptx", "triton_red_fused_add_mul_native_layer_norm_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.cubin", "triton_red_fused_add_mul_native_layer_norm_0.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.json"}}
\ No newline at end of file
diff --git a/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.cubin b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..305f1ecb16764c64362fcd51b4c346bfda222f67
Binary files /dev/null and b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.cubin differ
diff --git a/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.json b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..8830bb8f9a0cab2edc0084648e22db1d9b3c86a4
--- /dev/null
+++ b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.json
@@ -0,0 +1 @@
+{"hash": "73d0b8e91c14724517b217e70af3fd3304d1dbbfda2257ca527f1a36f528beab", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 192, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_add_mul_native_layer_norm_0"}
\ No newline at end of file
diff --git a/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.llir b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..d6d36e19ad7396b6ae25fd22889607dd8ddb9254
--- /dev/null
+++ b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.llir
@@ -0,0 +1,601 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external addrspace(3) global [0 x i8], align 16
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused_add_mul_native_layer_norm_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 {
+__nv_rsqrtf.exit:
+  %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8
+  %12 = icmp samesign ult i32 %11, 256, !dbg !9
+  %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %14 = and i32 %13, 511, !dbg !10
+  %15 = and i32 %13, 31, !dbg !10
+  %16 = lshr i32 %14, 5, !dbg !10
+  %17 = shl nuw nsw i32 %13, 3, !dbg !10
+  %18 = and i32 %17, 4088, !dbg !10
+  %19 = shl i32 %11, 12, !dbg !11
+  %20 = or disjoint i32 %18, %19, !dbg !12
+  %21 = sext i32 %20 to i64, !dbg !13
+  %22 = getelementptr bfloat, ptr addrspace(1) %0, i64 %21, !dbg !13
+  %23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !14
+  %24 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %22, i64 %23, i1 %12) #6, !dbg !14
+  %25 = extractvalue { i32, i32, i32, i32 } %24, 0, !dbg !14
+  %26 = bitcast i32 %25 to <2 x bfloat>, !dbg !14
+  %27 = extractvalue { i32, i32, i32, i32 } %24, 1, !dbg !14
+  %28 = bitcast i32 %27 to <2 x bfloat>, !dbg !14
+  %29 = extractvalue { i32, i32, i32, i32 } %24, 2, !dbg !14
+  %30 = bitcast i32 %29 to <2 x bfloat>, !dbg !14
+  %31 = extractvalue { i32, i32, i32, i32 } %24, 3, !dbg !14
+  %32 = bitcast i32 %31 to <2 x bfloat>, !dbg !14
+  %33 = zext nneg i32 %18 to i64, !dbg !15
+  %34 = getelementptr bfloat, ptr addrspace(1) %1, i64 %33, !dbg !15
+  %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !16
+  %36 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %34, i64 %35, i1 true) #6, !dbg !16
+  %37 = extractvalue { i32, i32, i32, i32 } %36, 0, !dbg !16
+  %38 = bitcast i32 %37 to <2 x bfloat>, !dbg !16
+  %39 = extractvalue { i32, i32, i32, i32 } %36, 1, !dbg !16
+  %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !16
+  %41 = extractvalue { i32, i32, i32, i32 } %36, 2, !dbg !16
+  %42 = bitcast i32 %41 to <2 x bfloat>, !dbg !16
+  %43 = extractvalue { i32, i32, i32, i32 } %36, 3, !dbg !16
+  %44 = bitcast i32 %43 to <2 x bfloat>, !dbg !16
+  %45 = getelementptr bfloat, ptr addrspace(1) %2, i64 %21, !dbg !17
+  %46 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !18
+  %47 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %45, i64 %46, i1 %12) #6, !dbg !18
+  %48 = extractvalue { i32, i32, i32, i32 } %47, 0, !dbg !18
+  %49 = bitcast i32 %48 to <2 x bfloat>, !dbg !18
+  %50 = extractvalue { i32, i32, i32, i32 } %47, 1, !dbg !18
+  %51 = bitcast i32 %50 to <2 x bfloat>, !dbg !18
+  %52 = extractvalue { i32, i32, i32, i32 } %47, 2, !dbg !18
+  %53 = bitcast i32 %52 to <2 x bfloat>, !dbg !18
+  %54 = extractvalue { i32, i32, i32, i32 } %47, 3, !dbg !18
+  %55 = bitcast i32 %54 to <2 x bfloat>, !dbg !18
+  %56 = select i1 %12, float 1.000000e+00, float 0.000000e+00, !dbg !19
+  %57 = getelementptr bfloat, ptr addrspace(1) %5, i64 %21, !dbg !20
+  %58 = fpext <2 x bfloat> %26 to <2 x float>, !dbg !21
+  %59 = fpext <2 x bfloat> %38 to <2 x float>, !dbg !22
+  %60 = fpext <2 x bfloat> %49 to <2 x float>, !dbg !23
+  %61 = fmul <2 x float> %59, %60, !dbg !24
+  %62 = fadd <2 x float> %61, %58, !dbg !25
+  %63 = extractelement <2 x float> %62, i64 0, !dbg !26
+  %64 = select i1 %12, float %63, float 0.000000e+00, !dbg !26
+  %65 = extractelement <2 x float> %62, i64 1, !dbg !26
+  %66 = select i1 %12, float %65, float 0.000000e+00, !dbg !26
+  %67 = fptrunc <2 x float> %62 to <2 x bfloat>, !dbg !27
+  %68 = fpext <2 x bfloat> %28 to <2 x float>, !dbg !21
+  %69 = fpext <2 x bfloat> %40 to <2 x float>, !dbg !22
+  %70 = fpext <2 x bfloat> %51 to <2 x float>, !dbg !23
+  %71 = fmul <2 x float> %69, %70, !dbg !24
+  %72 = fadd <2 x float> %71, %68, !dbg !25
+  %73 = extractelement <2 x float> %72, i64 0, !dbg !26
+  %74 = select i1 %12, float %73, float 0.000000e+00, !dbg !26
+  %75 = extractelement <2 x float> %72, i64 1, !dbg !26
+  %76 = select i1 %12, float %75, float 0.000000e+00, !dbg !26
+  %77 = fptrunc <2 x float> %72 to <2 x bfloat>, !dbg !27
+  %78 = fpext <2 x bfloat> %30 to <2 x float>, !dbg !21
+  %79 = fpext <2 x bfloat> %42 to <2 x float>, !dbg !22
+  %80 = fpext <2 x bfloat> %53 to <2 x float>, !dbg !23
+  %81 = fmul <2 x float> %79, %80, !dbg !24
+  %82 = fadd <2 x float> %81, %78, !dbg !25
+  %83 = extractelement <2 x float> %82, i64 0, !dbg !26
+  %84 = select i1 %12, float %83, float 0.000000e+00, !dbg !26
+  %85 = extractelement <2 x float> %82, i64 1, !dbg !26
+  %86 = select i1 %12, float %85, float 0.000000e+00, !dbg !26
+  %87 = fptrunc <2 x float> %82 to <2 x bfloat>, !dbg !27
+  %88 = fpext <2 x bfloat> %32 to <2 x float>, !dbg !21
+  %89 = fpext <2 x bfloat> %44 to <2 x float>, !dbg !22
+  %90 = fpext <2 x bfloat> %55 to <2 x float>, !dbg !23
+  %91 = fmul <2 x float> %89, %90, !dbg !24
+  %92 = fadd <2 x float> %91, %88, !dbg !25
+  %93 = extractelement <2 x float> %92, i64 0, !dbg !26
+  %94 = select i1 %12, float %93, float 0.000000e+00, !dbg !26
+  %95 = extractelement <2 x float> %92, i64 1, !dbg !26
+  %96 = select i1 %12, float %95, float 0.000000e+00, !dbg !26
+  %97 = fptrunc <2 x float> %92 to <2 x bfloat>, !dbg !27
+  %98 = bitcast <2 x bfloat> %67 to i32, !dbg !27
+  %99 = bitcast <2 x bfloat> %77 to i32, !dbg !27
+  %100 = bitcast <2 x bfloat> %87 to i32, !dbg !27
+  %101 = bitcast <2 x bfloat> %97 to i32, !dbg !27
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %98, i32 %99, i32 %100, i32 %101, ptr addrspace(1) %57, i1 %12) #6, !dbg !27
+  %102 = fsub float %66, %64, !dbg !28
+  %103 = select i1 %12, float 2.000000e+00, float 0.000000e+00, !dbg !34
+  %104 = fcmp oeq float %103, 0.000000e+00, !dbg !35
+  %105 = tail call float @llvm.nvvm.div.full(float %56, float %103), !dbg !36
+  %106 = select i1 %104, float 0.000000e+00, float %105, !dbg !37
+  %107 = fmul float %106, %102, !dbg !38
+  %108 = fadd float %64, %107, !dbg !39
+  %109 = fmul float %102, %102, !dbg !40
+  %110 = fmul float %56, %109, !dbg !41
+  %111 = fmul float %106, %110, !dbg !42
+  %112 = fadd float %111, 0.000000e+00, !dbg !43
+  %113 = fsub float %74, %108, !dbg !28
+  %114 = select i1 %12, float 3.000000e+00, float 0.000000e+00, !dbg !34
+  %115 = fcmp oeq float %114, 0.000000e+00, !dbg !35
+  %116 = tail call float @llvm.nvvm.div.full(float %56, float %114), !dbg !36
+  %117 = select i1 %115, float 0.000000e+00, float %116, !dbg !37
+  %118 = fmul float %117, %113, !dbg !38
+  %119 = fadd float %108, %118, !dbg !39
+  %120 = fmul float %113, %113, !dbg !40
+  %121 = fmul float %103, %120, !dbg !41
+  %122 = fmul float %117, %121, !dbg !42
+  %123 = fadd float %112, %122, !dbg !43
+  %124 = fsub float %76, %119, !dbg !28
+  %125 = select i1 %12, float 4.000000e+00, float 0.000000e+00, !dbg !34
+  %126 = fcmp oeq float %125, 0.000000e+00, !dbg !35
+  %127 = tail call float @llvm.nvvm.div.full(float %56, float %125), !dbg !36
+  %128 = select i1 %126, float 0.000000e+00, float %127, !dbg !37
+  %129 = fmul float %128, %124, !dbg !38
+  %130 = fadd float %119, %129, !dbg !39
+  %131 = fmul float %124, %124, !dbg !40
+  %132 = fmul float %114, %131, !dbg !41
+  %133 = fmul float %128, %132, !dbg !42
+  %134 = fadd float %123, %133, !dbg !43
+  %135 = fsub float %84, %130, !dbg !28
+  %136 = select i1 %12, float 5.000000e+00, float 0.000000e+00, !dbg !34
+  %137 = fcmp oeq float %136, 0.000000e+00, !dbg !35
+  %138 = tail call float @llvm.nvvm.div.full(float %56, float %136), !dbg !36
+  %139 = select i1 %137, float 0.000000e+00, float %138, !dbg !37
+  %140 = fmul float %139, %135, !dbg !38
+  %141 = fadd float %130, %140, !dbg !39
+  %142 = fmul float %135, %135, !dbg !40
+  %143 = fmul float %125, %142, !dbg !41
+  %144 = fmul float %139, %143, !dbg !42
+  %145 = fadd float %134, %144, !dbg !43
+  %146 = fsub float %86, %141, !dbg !28
+  %147 = select i1 %12, float 6.000000e+00, float 0.000000e+00, !dbg !34
+  %148 = fcmp oeq float %147, 0.000000e+00, !dbg !35
+  %149 = tail call float @llvm.nvvm.div.full(float %56, float %147), !dbg !36
+  %150 = select i1 %148, float 0.000000e+00, float %149, !dbg !37
+  %151 = fmul float %150, %146, !dbg !38
+  %152 = fadd float %141, %151, !dbg !39
+  %153 = fmul float %146, %146, !dbg !40
+  %154 = fmul float %136, %153, !dbg !41
+  %155 = fmul float %150, %154, !dbg !42
+  %156 = fadd float %145, %155, !dbg !43
+  %157 = fsub float %94, %152, !dbg !28
+  %158 = select i1 %12, float 7.000000e+00, float 0.000000e+00, !dbg !34
+  %159 = fcmp oeq float %158, 0.000000e+00, !dbg !35
+  %160 = tail call float @llvm.nvvm.div.full(float %56, float %158), !dbg !36
+  %161 = select i1 %159, float 0.000000e+00, float %160, !dbg !37
+  %162 = fmul float %161, %157, !dbg !38
+  %163 = fadd float %152, %162, !dbg !39
+  %164 = fmul float %157, %157, !dbg !40
+  %165 = fmul float %147, %164, !dbg !41
+  %166 = fmul float %161, %165, !dbg !42
+  %167 = fadd float %156, %166, !dbg !43
+  %168 = fsub float %96, %163, !dbg !28
+  %169 = select i1 %12, float 8.000000e+00, float 0.000000e+00, !dbg !34
+  %170 = fcmp oeq float %169, 0.000000e+00, !dbg !35
+  %171 = tail call float @llvm.nvvm.div.full(float %56, float %169), !dbg !36
+  %172 = select i1 %170, float 0.000000e+00, float %171, !dbg !37
+  %173 = fmul float %172, %168, !dbg !38
+  %174 = fadd float %163, %173, !dbg !39
+  %175 = fmul float %168, %168, !dbg !40
+  %176 = fmul float %158, %175, !dbg !41
+  %177 = fmul float %172, %176, !dbg !42
+  %178 = fadd float %167, %177, !dbg !43
+  %179 = bitcast float %174 to i32, !dbg !31
+  %180 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %179, i32 16, i32 31), !dbg !31
+  %181 = bitcast i32 %180 to float, !dbg !31
+  %182 = bitcast float %178 to i32, !dbg !31
+  %183 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %182, i32 16, i32 31), !dbg !31
+  %184 = bitcast i32 %183 to float, !dbg !31
+  %185 = bitcast float %169 to i32, !dbg !31
+  %186 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %185, i32 16, i32 31), !dbg !31
+  %187 = bitcast i32 %186 to float, !dbg !31
+  %188 = fsub float %181, %174, !dbg !28
+  %189 = fadd float %169, %187, !dbg !34
+  %190 = fcmp oeq float %189, 0.000000e+00, !dbg !35
+  %191 = tail call float @llvm.nvvm.div.full(float %187, float %189), !dbg !36
+  %192 = select i1 %190, float 0.000000e+00, float %191, !dbg !37
+  %193 = fmul float %192, %188, !dbg !38
+  %194 = fadd float %174, %193, !dbg !39
+  %195 = fadd float %178, %184, !dbg !44
+  %196 = fmul float %188, %188, !dbg !40
+  %197 = fmul float %169, %196, !dbg !41
+  %198 = fmul float %192, %197, !dbg !42
+  %199 = fadd float %195, %198, !dbg !43
+  %200 = bitcast float %194 to i32, !dbg !31
+  %201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 8, i32 31), !dbg !31
+  %202 = bitcast i32 %201 to float, !dbg !31
+  %203 = bitcast float %199 to i32, !dbg !31
+  %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 8, i32 31), !dbg !31
+  %205 = bitcast i32 %204 to float, !dbg !31
+  %206 = bitcast float %189 to i32, !dbg !31
+  %207 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %206, i32 8, i32 31), !dbg !31
+  %208 = bitcast i32 %207 to float, !dbg !31
+  %209 = fsub float %202, %194, !dbg !28
+  %210 = fadd float %189, %208, !dbg !34
+  %211 = fcmp oeq float %210, 0.000000e+00, !dbg !35
+  %212 = tail call float @llvm.nvvm.div.full(float %208, float %210), !dbg !36
+  %213 = select i1 %211, float 0.000000e+00, float %212, !dbg !37
+  %214 = fmul float %213, %209, !dbg !38
+  %215 = fadd float %194, %214, !dbg !39
+  %216 = fadd float %199, %205, !dbg !44
+  %217 = fmul float %209, %209, !dbg !40
+  %218 = fmul float %189, %217, !dbg !41
+  %219 = fmul float %213, %218, !dbg !42
+  %220 = fadd float %216, %219, !dbg !43
+  %221 = bitcast float %215 to i32, !dbg !31
+  %222 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %221, i32 4, i32 31), !dbg !31
+  %223 = bitcast i32 %222 to float, !dbg !31
+  %224 = bitcast float %220 to i32, !dbg !31
+  %225 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %224, i32 4, i32 31), !dbg !31
+  %226 = bitcast i32 %225 to float, !dbg !31
+  %227 = bitcast float %210 to i32, !dbg !31
+  %228 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %227, i32 4, i32 31), !dbg !31
+  %229 = bitcast i32 %228 to float, !dbg !31
+  %230 = fsub float %223, %215, !dbg !28
+  %231 = fadd float %210, %229, !dbg !34
+  %232 = fcmp oeq float %231, 0.000000e+00, !dbg !35
+  %233 = tail call float @llvm.nvvm.div.full(float %229, float %231), !dbg !36
+  %234 = select i1 %232, float 0.000000e+00, float %233, !dbg !37
+  %235 = fmul float %234, %230, !dbg !38
+  %236 = fadd float %215, %235, !dbg !39
+  %237 = fadd float %220, %226, !dbg !44
+  %238 = fmul float %230, %230, !dbg !40
+  %239 = fmul float %210, %238, !dbg !41
+  %240 = fmul float %234, %239, !dbg !42
+  %241 = fadd float %237, %240, !dbg !43
+  %242 = bitcast float %236 to i32, !dbg !31
+  %243 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %242, i32 2, i32 31), !dbg !31
+  %244 = bitcast i32 %243 to float, !dbg !31
+  %245 = bitcast float %241 to i32, !dbg !31
+  %246 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %245, i32 2, i32 31), !dbg !31
+  %247 = bitcast i32 %246 to float, !dbg !31
+  %248 = bitcast float %231 to i32, !dbg !31
+  %249 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %248, i32 2, i32 31), !dbg !31
+  %250 = bitcast i32 %249 to float, !dbg !31
+  %251 = fsub float %244, %236, !dbg !28
+  %252 = fadd float %231, %250, !dbg !34
+  %253 = fcmp oeq float %252, 0.000000e+00, !dbg !35
+  %254 = tail call float @llvm.nvvm.div.full(float %250, float %252), !dbg !36
+  %255 = select i1 %253, float 0.000000e+00, float %254, !dbg !37
+  %256 = fmul float %255, %251, !dbg !38
+  %257 = fadd float %236, %256, !dbg !39
+  %258 = fadd float %241, %247, !dbg !44
+  %259 = fmul float %251, %251, !dbg !40
+  %260 = fmul float %231, %259, !dbg !41
+  %261 = fmul float %255, %260, !dbg !42
+  %262 = fadd float %258, %261, !dbg !43
+  %263 = bitcast float %257 to i32, !dbg !31
+  %264 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %263, i32 1, i32 31), !dbg !31
+  %265 = bitcast i32 %264 to float, !dbg !31
+  %266 = bitcast float %262 to i32, !dbg !31
+  %267 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %266, i32 1, i32 31), !dbg !31
+  %268 = bitcast i32 %267 to float, !dbg !31
+  %269 = bitcast float %252 to i32, !dbg !31
+  %270 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %269, i32 1, i32 31), !dbg !31
+  %271 = bitcast i32 %270 to float, !dbg !31
+  %272 = fsub float %265, %257, !dbg !28
+  %273 = fadd float %252, %271, !dbg !34
+  %274 = fcmp oeq float %273, 0.000000e+00, !dbg !35
+  %275 = tail call float @llvm.nvvm.div.full(float %271, float %273), !dbg !36
+  %276 = select i1 %274, float 0.000000e+00, float %275, !dbg !37
+  %277 = fmul float %276, %272, !dbg !38
+  %278 = fadd float %257, %277, !dbg !39
+  %279 = fadd float %262, %268, !dbg !44
+  %280 = fmul float %272, %272, !dbg !40
+  %281 = fmul float %252, %280, !dbg !41
+  %282 = fmul float %276, %281, !dbg !42
+  %283 = fadd float %279, %282, !dbg !43
+  %284 = icmp eq i32 %15, 0, !dbg !31
+  %285 = getelementptr float, ptr addrspace(3) @global_smem, i32 %16, !dbg !31
+  %286 = bitcast float %278 to <1 x i32>, !dbg !31
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %285, <1 x i32> %286, i1 %284) #6, !dbg !31
+  %287 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %16, !dbg !31
+  %288 = bitcast float %283 to <1 x i32>, !dbg !31
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %287, <1 x i32> %288, i1 %284) #6, !dbg !31
+  %289 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %16, !dbg !31
+  %290 = bitcast float %273 to <1 x i32>, !dbg !31
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %289, <1 x i32> %290, i1 %284) #6, !dbg !31
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !31
+  %291 = icmp samesign ult i32 %14, 16, !dbg !31
+  %292 = getelementptr float, ptr addrspace(3) @global_smem, i32 %14, !dbg !31
+  %293 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %292, i1 %291) #6, !dbg !31
+  %294 = bitcast i32 %293 to float, !dbg !31
+  %295 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %14, !dbg !31
+  %296 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %295, i1 %291) #6, !dbg !31
+  %297 = bitcast i32 %296 to float, !dbg !31
+  %298 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %14, !dbg !31
+  %299 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %298, i1 %291) #6, !dbg !31
+  %300 = bitcast i32 %299 to float, !dbg !31
+  %301 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %293, i32 8, i32 31), !dbg !31
+  %302 = bitcast i32 %301 to float, !dbg !31
+  %303 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %296, i32 8, i32 31), !dbg !31
+  %304 = bitcast i32 %303 to float, !dbg !31
+  %305 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %299, i32 8, i32 31), !dbg !31
+  %306 = bitcast i32 %305 to float, !dbg !31
+  %307 = fsub float %302, %294, !dbg !28
+  %308 = fadd float %300, %306, !dbg !34
+  %309 = fcmp oeq float %308, 0.000000e+00, !dbg !35
+  %310 = tail call float @llvm.nvvm.div.full(float %306, float %308), !dbg !36
+  %311 = select i1 %309, float 0.000000e+00, float %310, !dbg !37
+  %312 = fmul float %307, %311, !dbg !38
+  %313 = fadd float %312, %294, !dbg !39
+  %314 = fadd float %297, %304, !dbg !44
+  %315 = fmul float %307, %307, !dbg !40
+  %316 = fmul float %315, %300, !dbg !41
+  %317 = fmul float %316, %311, !dbg !42
+  %318 = fadd float %314, %317, !dbg !43
+  %319 = bitcast float %313 to i32, !dbg !31
+  %320 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %319, i32 4, i32 31), !dbg !31
+  %321 = bitcast i32 %320 to float, !dbg !31
+  %322 = bitcast float %318 to i32, !dbg !31
+  %323 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %322, i32 4, i32 31), !dbg !31
+  %324 = bitcast i32 %323 to float, !dbg !31
+  %325 = bitcast float %308 to i32, !dbg !31
+  %326 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %325, i32 4, i32 31), !dbg !31
+  %327 = bitcast i32 %326 to float, !dbg !31
+  %328 = fsub float %321, %313, !dbg !28
+  %329 = fadd float %308, %327, !dbg !34
+  %330 = fcmp oeq float %329, 0.000000e+00, !dbg !35
+  %331 = tail call float @llvm.nvvm.div.full(float %327, float %329), !dbg !36
+  %332 = select i1 %330, float 0.000000e+00, float %331, !dbg !37
+  %333 = fmul float %328, %332, !dbg !38
+  %334 = fadd float %313, %333, !dbg !39
+  %335 = fadd float %318, %324, !dbg !44
+  %336 = fmul float %328, %328, !dbg !40
+  %337 = fmul float %308, %336, !dbg !41
+  %338 = fmul float %332, %337, !dbg !42
+  %339 = fadd float %335, %338, !dbg !43
+  %340 = bitcast float %334 to i32, !dbg !31
+  %341 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %340, i32 2, i32 31), !dbg !31
+  %342 = bitcast i32 %341 to float, !dbg !31
+  %343 = bitcast float %339 to i32, !dbg !31
+  %344 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %343, i32 2, i32 31), !dbg !31
+  %345 = bitcast i32 %344 to float, !dbg !31
+  %346 = bitcast float %329 to i32, !dbg !31
+  %347 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %346, i32 2, i32 31), !dbg !31
+  %348 = bitcast i32 %347 to float, !dbg !31
+  %349 = fsub float %342, %334, !dbg !28
+  %350 = fadd float %329, %348, !dbg !34
+  %351 = fcmp oeq float %350, 0.000000e+00, !dbg !35
+  %352 = tail call float @llvm.nvvm.div.full(float %348, float %350), !dbg !36
+  %353 = select i1 %351, float 0.000000e+00, float %352, !dbg !37
+  %354 = fmul float %349, %353, !dbg !38
+  %355 = fadd float %334, %354, !dbg !39
+  %356 = fadd float %339, %345, !dbg !44
+  %357 = fmul float %349, %349, !dbg !40
+  %358 = fmul float %329, %357, !dbg !41
+  %359 = fmul float %353, %358, !dbg !42
+  %360 = fadd float %356, %359, !dbg !43
+  %361 = bitcast float %355 to i32, !dbg !31
+  %362 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %361, i32 1, i32 31), !dbg !31
+  %363 = bitcast i32 %362 to float, !dbg !31
+  %364 = bitcast float %360 to i32, !dbg !31
+  %365 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %364, i32 1, i32 31), !dbg !31
+  %366 = bitcast i32 %365 to float, !dbg !31
+  %367 = bitcast float %350 to i32, !dbg !31
+  %368 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %367, i32 1, i32 31), !dbg !31
+  %369 = bitcast i32 %368 to float, !dbg !31
+  %370 = fsub float %363, %355, !dbg !28
+  %371 = fadd float %350, %369, !dbg !34
+  %372 = fcmp oeq float %371, 0.000000e+00, !dbg !35
+  %373 = tail call float @llvm.nvvm.div.full(float %369, float %371), !dbg !36
+  %374 = select i1 %372, float 0.000000e+00, float %373, !dbg !37
+  %375 = fmul float %370, %374, !dbg !38
+  %376 = fadd float %355, %375, !dbg !39
+  %377 = fadd float %360, %366, !dbg !44
+  %378 = fmul float %370, %370, !dbg !40
+  %379 = fmul float %350, %378, !dbg !41
+  %380 = fmul float %374, %379, !dbg !42
+  %381 = fadd float %377, %380, !dbg !43
+  %382 = and i32 %13, 15, !dbg !31
+  %383 = icmp eq i32 %382, 0, !dbg !31
+  %384 = and i1 %291, %383, !dbg !31
+  %385 = bitcast float %376 to <1 x i32>, !dbg !31
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %292, <1 x i32> %385, i1 %384) #6, !dbg !31
+  %386 = bitcast float %381 to <1 x i32>, !dbg !31
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %295, <1 x i32> %386, i1 %384) #6, !dbg !31
+  %387 = bitcast float %371 to <1 x i32>, !dbg !31
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %298, <1 x i32> %387, i1 %384) #6, !dbg !31
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !31
+  %388 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !31
+  %389 = load float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !31
+  %390 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !45
+  %391 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %57, i64 %390, i1 %12) #6, !dbg !45
+  %392 = getelementptr bfloat, ptr addrspace(1) %3, i64 %33, !dbg !46
+  %393 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !47
+  %394 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %392, i64 %393, i1 true) #6, !dbg !47
+  %395 = getelementptr bfloat, ptr addrspace(1) %4, i64 %33, !dbg !48
+  %396 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !49
+  %397 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %395, i64 %396, i1 true) #6, !dbg !49
+  %398 = tail call float @llvm.nvvm.div.full(float %389, float 4.096000e+03), !dbg !50
+  %399 = fadd float %398, 0x3EB0C6F7A0000000, !dbg !51
+  %400 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %401 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %402 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %403 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %404 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %405 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %406 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %407 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i19 = icmp eq i32 %407, 0, !dbg !52
+  br i1 %.not.i19, label %410, label %408, !dbg !52
+
+408:                                              ; preds = %__nv_rsqrtf.exit
+  %409 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %399), !dbg !52
+  br label %__nv_rsqrtf.exit21, !dbg !52
+
+410:                                              ; preds = %__nv_rsqrtf.exit
+  %411 = tail call float @llvm.nvvm.rsqrt.approx.f(float %399), !dbg !52
+  br label %__nv_rsqrtf.exit21, !dbg !52
+
+__nv_rsqrtf.exit21:                               ; preds = %408, %410
+  %.0.i20 = phi float [ %409, %408 ], [ %411, %410 ], !dbg !52
+  %412 = extractvalue { i32, i32, i32, i32 } %391, 3, !dbg !45
+  %413 = bitcast i32 %412 to <2 x bfloat>, !dbg !45
+  %414 = extractvalue { i32, i32, i32, i32 } %391, 2, !dbg !45
+  %415 = bitcast i32 %414 to <2 x bfloat>, !dbg !45
+  %416 = extractvalue { i32, i32, i32, i32 } %391, 1, !dbg !45
+  %417 = bitcast i32 %416 to <2 x bfloat>, !dbg !45
+  %418 = extractvalue { i32, i32, i32, i32 } %391, 0, !dbg !45
+  %419 = bitcast i32 %418 to <2 x bfloat>, !dbg !45
+  %420 = extractvalue { i32, i32, i32, i32 } %397, 3, !dbg !49
+  %421 = bitcast i32 %420 to <2 x bfloat>, !dbg !49
+  %422 = extractvalue { i32, i32, i32, i32 } %397, 2, !dbg !49
+  %423 = bitcast i32 %422 to <2 x bfloat>, !dbg !49
+  %424 = extractvalue { i32, i32, i32, i32 } %397, 1, !dbg !49
+  %425 = bitcast i32 %424 to <2 x bfloat>, !dbg !49
+  %426 = extractvalue { i32, i32, i32, i32 } %397, 0, !dbg !49
+  %427 = bitcast i32 %426 to <2 x bfloat>, !dbg !49
+  %428 = extractvalue { i32, i32, i32, i32 } %394, 3, !dbg !47
+  %429 = bitcast i32 %428 to <2 x bfloat>, !dbg !47
+  %430 = extractvalue { i32, i32, i32, i32 } %394, 2, !dbg !47
+  %431 = bitcast i32 %430 to <2 x bfloat>, !dbg !47
+  %432 = extractvalue { i32, i32, i32, i32 } %394, 1, !dbg !47
+  %433 = bitcast i32 %432 to <2 x bfloat>, !dbg !47
+  %434 = extractvalue { i32, i32, i32, i32 } %394, 0, !dbg !47
+  %435 = bitcast i32 %434 to <2 x bfloat>, !dbg !47
+  %436 = getelementptr bfloat, ptr addrspace(1) %6, i64 %21, !dbg !53
+  %437 = fpext <2 x bfloat> %419 to <2 x float>, !dbg !54
+  %438 = insertelement <2 x float> poison, float %388, i64 0, !dbg !55
+  %439 = shufflevector <2 x float> %438, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !55
+  %440 = fsub <2 x float> %437, %439, !dbg !55
+  %441 = fpext <2 x bfloat> %427 to <2 x float>, !dbg !56
+  %442 = fpext <2 x bfloat> %435 to <2 x float>, !dbg !57
+  %443 = insertelement <2 x float> poison, float %.0.i20, i64 0, !dbg !58
+  %444 = shufflevector <2 x float> %443, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !58
+  %445 = fmul <2 x float> %440, %444, !dbg !58
+  %446 = fadd <2 x float> %442, splat (float 1.000000e+00), !dbg !59
+  %447 = fmul <2 x float> %446, %445, !dbg !60
+  %448 = fadd <2 x float> %447, %441, !dbg !61
+  %449 = fptrunc <2 x float> %448 to <2 x bfloat>, !dbg !62
+  %450 = fpext <2 x bfloat> %417 to <2 x float>, !dbg !54
+  %451 = fsub <2 x float> %450, %439, !dbg !55
+  %452 = fpext <2 x bfloat> %425 to <2 x float>, !dbg !56
+  %453 = fpext <2 x bfloat> %433 to <2 x float>, !dbg !57
+  %454 = fmul <2 x float> %451, %444, !dbg !58
+  %455 = fadd <2 x float> %453, splat (float 1.000000e+00), !dbg !59
+  %456 = fmul <2 x float> %455, %454, !dbg !60
+  %457 = fadd <2 x float> %456, %452, !dbg !61
+  %458 = fptrunc <2 x float> %457 to <2 x bfloat>, !dbg !62
+  %459 = fpext <2 x bfloat> %415 to <2 x float>, !dbg !54
+  %460 = fsub <2 x float> %459, %439, !dbg !55
+  %461 = fpext <2 x bfloat> %423 to <2 x float>, !dbg !56
+  %462 = fpext <2 x bfloat> %431 to <2 x float>, !dbg !57
+  %463 = fmul <2 x float> %460, %444, !dbg !58
+  %464 = fadd <2 x float> %462, splat (float 1.000000e+00), !dbg !59
+  %465 = fmul <2 x float> %464, %463, !dbg !60
+  %466 = fadd <2 x float> %465, %461, !dbg !61
+  %467 = fptrunc <2 x float> %466 to <2 x bfloat>, !dbg !62
+  %468 = fpext <2 x bfloat> %413 to <2 x float>, !dbg !54
+  %469 = fsub <2 x float> %468, %439, !dbg !55
+  %470 = fpext <2 x bfloat> %421 to <2 x float>, !dbg !56
+  %471 = fpext <2 x bfloat> %429 to <2 x float>, !dbg !57
+  %472 = fmul <2 x float> %469, %444, !dbg !58
+  %473 = fadd <2 x float> %471, splat (float 1.000000e+00), !dbg !59
+  %474 = fmul <2 x float> %473, %472, !dbg !60
+  %475 = fadd <2 x float> %474, %470, !dbg !61
+  %476 = fptrunc <2 x float> %475 to <2 x bfloat>, !dbg !62
+  %477 = bitcast <2 x bfloat> %449 to i32, !dbg !62
+  %478 = bitcast <2 x bfloat> %458 to i32, !dbg !62
+  %479 = bitcast <2 x bfloat> %467 to i32, !dbg !62
+  %480 = bitcast <2 x bfloat> %476 to i32, !dbg !62
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %477, i32 %478, i32 %479, i32 %480, ptr addrspace(1) %436, i1 %12) #6, !dbg !62
+  ret void, !dbg !63
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #2
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #2
+
+attributes #0 = { nounwind "nvvm.reqntid"="512" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #4 = { convergent nocallback nounwind }
+attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!5 = distinct !DISubprogram(name: "triton_red_fused_add_mul_native_layer_norm_0", linkageName: "triton_red_fused_add_mul_native_layer_norm_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 23, column: 28, scope: !5)
+!9 = !DILocation(line: 25, column: 21, scope: !5)
+!10 = !DILocation(line: 26, column: 37, scope: !5)
+!11 = !DILocation(line: 38, column: 46, scope: !5)
+!12 = !DILocation(line: 38, column: 41, scope: !5)
+!13 = !DILocation(line: 38, column: 34, scope: !5)
+!14 = !DILocation(line: 38, column: 51, scope: !5)
+!15 = !DILocation(line: 39, column: 34, scope: !5)
+!16 = !DILocation(line: 39, column: 41, scope: !5)
+!17 = !DILocation(line: 40, column: 34, scope: !5)
+!18 = !DILocation(line: 40, column: 51, scope: !5)
+!19 = !DILocation(line: 50, column: 66, scope: !5)
+!20 = !DILocation(line: 51, column: 29, scope: !5)
+!21 = !DILocation(line: 38, column: 113, scope: !5)
+!22 = !DILocation(line: 39, column: 94, scope: !5)
+!23 = !DILocation(line: 40, column: 113, scope: !5)
+!24 = !DILocation(line: 41, column: 22, scope: !5)
+!25 = !DILocation(line: 42, column: 22, scope: !5)
+!26 = !DILocation(line: 48, column: 62, scope: !5)
+!27 = !DILocation(line: 51, column: 52, scope: !5)
+!28 = !DILocation(line: 231, column: 21, scope: !29, inlinedAt: !31)
+!29 = distinct !DILexicalBlockFile(scope: !5, file: !30, discriminator: 0)
+!30 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime")
+!31 = !DILocation(line: 243, column: 46, scope: !29, inlinedAt: !32)
+!32 = !DILocation(line: 52, column: 80, scope: !33)
+!33 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0)
+!34 = !DILocation(line: 232, column: 28, scope: !29, inlinedAt: !31)
+!35 = !DILocation(line: 233, column: 39, scope: !29, inlinedAt: !31)
+!36 = !DILocation(line: 233, column: 60, scope: !29, inlinedAt: !31)
+!37 = !DILocation(line: 233, column: 49, scope: !29, inlinedAt: !31)
+!38 = !DILocation(line: 235, column: 25, scope: !29, inlinedAt: !31)
+!39 = !DILocation(line: 235, column: 17, scope: !29, inlinedAt: !31)
+!40 = !DILocation(line: 236, column: 30, scope: !29, inlinedAt: !31)
+!41 = !DILocation(line: 236, column: 38, scope: !29, inlinedAt: !31)
+!42 = !DILocation(line: 236, column: 49, scope: !29, inlinedAt: !31)
+!43 = !DILocation(line: 236, column: 22, scope: !29, inlinedAt: !31)
+!44 = !DILocation(line: 236, column: 15, scope: !29, inlinedAt: !31)
+!45 = !DILocation(line: 62, column: 53, scope: !5)
+!46 = !DILocation(line: 63, column: 35, scope: !5)
+!47 = !DILocation(line: 63, column: 42, scope: !5)
+!48 = !DILocation(line: 64, column: 35, scope: !5)
+!49 = !DILocation(line: 64, column: 42, scope: !5)
+!50 = !DILocation(line: 68, column: 25, scope: !5)
+!51 = !DILocation(line: 70, column: 24, scope: !5)
+!52 = !DILocation(line: 71, column: 32, scope: !5)
+!53 = !DILocation(line: 78, column: 29, scope: !5)
+!54 = !DILocation(line: 62, column: 115, scope: !5)
+!55 = !DILocation(line: 66, column: 24, scope: !5)
+!56 = !DILocation(line: 64, column: 95, scope: !5)
+!57 = !DILocation(line: 63, column: 95, scope: !5)
+!58 = !DILocation(line: 72, column: 24, scope: !5)
+!59 = !DILocation(line: 75, column: 24, scope: !5)
+!60 = !DILocation(line: 76, column: 24, scope: !5)
+!61 = !DILocation(line: 77, column: 24, scope: !5)
+!62 = !DILocation(line: 78, column: 53, scope: !5)
+!63 = !DILocation(line: 56, column: 4, scope: !5)
diff --git a/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.ptx b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..b9b7da2e3299043e74f389ae7bfc1dd265132584
--- /dev/null
+++ b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.ptx
@@ -0,0 +1,1129 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused_add_mul_native_layer_norm_0 // -- Begin function triton_red_fused_add_mul_native_layer_norm_0
+.extern .shared .align 16 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
+                                        // @triton_red_fused_add_mul_native_layer_norm_0
+.visible .entry triton_red_fused_add_mul_native_layer_norm_0(
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_1,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_4,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_5,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_6,
+	.param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_7,
+	.param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_8,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_9,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_10
+)
+.reqntid 512
+{
+	.reg .pred 	%p<23>;
+	.reg .b16 	%rs<49>;
+	.reg .b32 	%r<323>;
+	.reg .b64 	%rd<23>;
+	.loc	1 18 0                          // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:18:0
+
+// %bb.0:                               // %__nv_rsqrtf.exit
+	ld.param.b64 	%rd14, [triton_red_fused_add_mul_native_layer_norm_0_param_0];
+	ld.param.b64 	%rd15, [triton_red_fused_add_mul_native_layer_norm_0_param_1];
+$L__tmp0:
+	.loc	1 23 28                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:23:28
+	mov.u32 	%r49, %ctaid.x;
+	.loc	1 25 21                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:25:21
+	setp.lt.u32 	%p1, %r49, 256;
+	ld.param.b64 	%rd16, [triton_red_fused_add_mul_native_layer_norm_0_param_2];
+	ld.param.b64 	%rd17, [triton_red_fused_add_mul_native_layer_norm_0_param_3];
+	.loc	1 26 37                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:26:37
+	mov.u32 	%r50, %tid.x;
+	and.b32 	%r51, %r50, 511;
+	ld.param.b64 	%rd18, [triton_red_fused_add_mul_native_layer_norm_0_param_4];
+	and.b32 	%r52, %r50, 31;
+	ld.param.b64 	%rd19, [triton_red_fused_add_mul_native_layer_norm_0_param_5];
+	ld.param.b64 	%rd20, [triton_red_fused_add_mul_native_layer_norm_0_param_6];
+	shl.b32 	%r53, %r50, 3;
+	and.b32 	%r54, %r53, 4088;
+	.loc	1 38 46                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:46
+	shl.b32 	%r55, %r49, 12;
+	.loc	1 38 41                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:41
+	or.b32 	%r56, %r54, %r55;
+	.loc	1 38 34                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:34
+	mul.wide.s32 	%rd21, %r56, 2;
+	add.s64 	%rd1, %rd14, %rd21;
+	.loc	1 38 51                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:51
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r5, 0;
+	// begin inline asm
+	mov.u32 %r1, %r5;
+	mov.u32 %r2, %r5;
+	mov.u32 %r3, %r5;
+	mov.u32 %r4, %r5;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	.loc	1 39 34                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:34
+	mul.wide.u32 	%rd22, %r54, 2;
+	add.s64 	%rd3, %rd15, %rd22;
+	.loc	1 39 41                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:41
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0;
+	// end inline asm
+	mov.pred 	%p2, -1;
+	// begin inline asm
+	mov.u32 %r6, %r5;
+	mov.u32 %r7, %r5;
+	mov.u32 %r8, %r5;
+	mov.u32 %r9, %r5;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd3 + 0 ], %rd4;
+	// end inline asm
+	.loc	1 40 34                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:34
+	add.s64 	%rd5, %rd16, %rd21;
+	.loc	1 40 51                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:51
+	// begin inline asm
+	mov.u64 %rd6, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd6, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r10, %r5;
+	mov.u32 %r11, %r5;
+	mov.u32 %r12, %r5;
+	mov.u32 %r13, %r5;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd5 + 0 ], %rd6;
+	// end inline asm
+	.loc	1 50 66                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:50:66
+	selp.f32 	%r57, 0f3F800000, 0f00000000, %p1;
+	.loc	1 51 29                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:51:29
+	add.s64 	%rd7, %rd19, %rd21;
+	.loc	1 38 113                        // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:113
+	mov.b32 	{%rs1, %rs2}, %r1;
+	cvt.f32.bf16 	%r58, %rs1;
+	cvt.f32.bf16 	%r59, %rs2;
+	.loc	1 39 94                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:94
+	mov.b32 	{%rs3, %rs4}, %r6;
+	cvt.f32.bf16 	%r60, %rs3;
+	cvt.f32.bf16 	%r61, %rs4;
+	.loc	1 40 113                        // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:113
+	mov.b32 	{%rs5, %rs6}, %r10;
+	cvt.f32.bf16 	%r62, %rs5;
+	cvt.f32.bf16 	%r63, %rs6;
+	.loc	1 42 22                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:42:22
+	fma.rn.f32 	%r64, %r61, %r63, %r59;
+	fma.rn.f32 	%r65, %r60, %r62, %r58;
+	.loc	1 48 62                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:48:62
+	selp.f32 	%r66, %r65, 0f00000000, %p1;
+	selp.f32 	%r67, %r64, 0f00000000, %p1;
+	.loc	1 51 52                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:51:52
+	cvt.rn.bf16x2.f32 	%r14, %r64, %r65;
+	.loc	1 38 113                        // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:113
+	mov.b32 	{%rs7, %rs8}, %r2;
+	cvt.f32.bf16 	%r68, %rs7;
+	cvt.f32.bf16 	%r69, %rs8;
+	.loc	1 39 94                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:94
+	mov.b32 	{%rs9, %rs10}, %r7;
+	cvt.f32.bf16 	%r70, %rs9;
+	cvt.f32.bf16 	%r71, %rs10;
+	.loc	1 40 113                        // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:113
+	mov.b32 	{%rs11, %rs12}, %r11;
+	cvt.f32.bf16 	%r72, %rs11;
+	cvt.f32.bf16 	%r73, %rs12;
+	.loc	1 42 22                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:42:22
+	fma.rn.f32 	%r74, %r71, %r73, %r69;
+	fma.rn.f32 	%r75, %r70, %r72, %r68;
+	.loc	1 48 62                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:48:62
+	selp.f32 	%r76, %r75, 0f00000000, %p1;
+	selp.f32 	%r77, %r74, 0f00000000, %p1;
+	.loc	1 51 52                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:51:52
+	cvt.rn.bf16x2.f32 	%r15, %r74, %r75;
+	.loc	1 38 113                        // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:113
+	mov.b32 	{%rs13, %rs14}, %r3;
+	cvt.f32.bf16 	%r78, %rs13;
+	cvt.f32.bf16 	%r79, %rs14;
+	.loc	1 39 94                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:94
+	mov.b32 	{%rs15, %rs16}, %r8;
+	cvt.f32.bf16 	%r80, %rs15;
+	cvt.f32.bf16 	%r81, %rs16;
+	.loc	1 40 113                        // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:113
+	mov.b32 	{%rs17, %rs18}, %r12;
+	cvt.f32.bf16 	%r82, %rs17;
+	cvt.f32.bf16 	%r83, %rs18;
+	.loc	1 42 22                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:42:22
+	fma.rn.f32 	%r84, %r81, %r83, %r79;
+	fma.rn.f32 	%r85, %r80, %r82, %r78;
+	.loc	1 48 62                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:48:62
+	selp.f32 	%r86, %r85, 0f00000000, %p1;
+	selp.f32 	%r87, %r84, 0f00000000, %p1;
+	.loc	1 51 52                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:51:52
+	cvt.rn.bf16x2.f32 	%r16, %r84, %r85;
+	.loc	1 38 113                        // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:38:113
+	mov.b32 	{%rs19, %rs20}, %r4;
+	cvt.f32.bf16 	%r88, %rs19;
+	cvt.f32.bf16 	%r89, %rs20;
+	.loc	1 39 94                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:39:94
+	mov.b32 	{%rs21, %rs22}, %r9;
+	cvt.f32.bf16 	%r90, %rs21;
+	cvt.f32.bf16 	%r91, %rs22;
+	.loc	1 40 113                        // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:40:113
+	mov.b32 	{%rs23, %rs24}, %r13;
+	cvt.f32.bf16 	%r92, %rs23;
+	cvt.f32.bf16 	%r93, %rs24;
+	.loc	1 42 22                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:42:22
+	fma.rn.f32 	%r94, %r91, %r93, %r89;
+	fma.rn.f32 	%r95, %r90, %r92, %r88;
+	.loc	1 48 62                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:48:62
+	selp.f32 	%r96, %r95, 0f00000000, %p1;
+	selp.f32 	%r97, %r94, 0f00000000, %p1;
+	.loc	1 51 52                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:51:52
+	cvt.rn.bf16x2.f32 	%r17, %r94, %r95;
+	// begin inline asm
+	@%p1 st.global.v4.b32 [ %rd7 + 0 ], { %r14, %r15, %r16, %r17 };
+	// end inline asm
+$L__tmp1:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r98, %r67, %r66;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r99, 0f40000000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p6, %r99, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r100, %r57, %r99;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r101, 0f00000000, %r100, %p6;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r102, %r101, %r98, %r66;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r103, %r98, %r98;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r104, %r57, %r103;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r105, %r101, %r104, 0f00000000;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r106, %r76, %r102;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r107, 0f40400000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p7, %r107, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r108, %r57, %r107;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r109, 0f00000000, %r108, %p7;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r110, %r109, %r106, %r102;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r111, %r106, %r106;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r112, %r99, %r111;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r113, %r109, %r112, %r105;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r114, %r77, %r110;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r115, 0f40800000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p8, %r115, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r116, %r57, %r115;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r117, 0f00000000, %r116, %p8;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r118, %r117, %r114, %r110;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r119, %r114, %r114;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r120, %r107, %r119;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r121, %r117, %r120, %r113;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r122, %r86, %r118;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r123, 0f40A00000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p9, %r123, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r124, %r57, %r123;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r125, 0f00000000, %r124, %p9;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r126, %r125, %r122, %r118;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r127, %r122, %r122;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r128, %r115, %r127;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r129, %r125, %r128, %r121;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r130, %r87, %r126;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r131, 0f40C00000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p10, %r131, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r132, %r57, %r131;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r133, 0f00000000, %r132, %p10;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r134, %r133, %r130, %r126;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r135, %r130, %r130;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r136, %r123, %r135;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r137, %r133, %r136, %r129;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r138, %r96, %r134;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r139, 0f40E00000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p11, %r139, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r140, %r57, %r139;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r141, 0f00000000, %r140, %p11;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r142, %r141, %r138, %r134;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r143, %r138, %r138;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r144, %r131, %r143;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r145, %r141, %r144, %r137;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r146, %r97, %r142;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r147, 0f41000000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p12, %r147, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r148, %r57, %r147;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r149, 0f00000000, %r148, %p12;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r150, %r149, %r146, %r142;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r151, %r146, %r146;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r152, %r139, %r151;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r153, %r149, %r152, %r145;
+$L__tmp2:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ]
+	shfl.sync.bfly.b32 	%r154, %r150, 16, 31, -1;
+	shfl.sync.bfly.b32 	%r155, %r153, 16, 31, -1;
+	shfl.sync.bfly.b32 	%r156, %r147, 16, 31, -1;
+$L__tmp3:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r157, %r154, %r150;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r158, %r147, %r156;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p13, %r158, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r159, %r156, %r158;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r160, 0f00000000, %r159, %p13;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r161, %r160, %r157, %r150;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r162, %r153, %r155;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r163, %r157, %r157;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r164, %r147, %r163;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r165, %r160, %r164, %r162;
+$L__tmp4:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ]
+	shfl.sync.bfly.b32 	%r166, %r161, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r167, %r165, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r168, %r158, 8, 31, -1;
+$L__tmp5:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r169, %r166, %r161;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r170, %r158, %r168;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p14, %r170, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r171, %r168, %r170;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r172, 0f00000000, %r171, %p14;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r173, %r172, %r169, %r161;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r174, %r165, %r167;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r175, %r169, %r169;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r176, %r158, %r175;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r177, %r172, %r176, %r174;
+$L__tmp6:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ]
+	shfl.sync.bfly.b32 	%r178, %r173, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r179, %r177, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r180, %r170, 4, 31, -1;
+$L__tmp7:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r181, %r178, %r173;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r182, %r170, %r180;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p15, %r182, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r183, %r180, %r182;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r184, 0f00000000, %r183, %p15;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r185, %r184, %r181, %r173;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r186, %r177, %r179;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r187, %r181, %r181;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r188, %r170, %r187;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r189, %r184, %r188, %r186;
+$L__tmp8:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ]
+	shfl.sync.bfly.b32 	%r190, %r185, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r191, %r189, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r192, %r182, 2, 31, -1;
+$L__tmp9:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r193, %r190, %r185;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r194, %r182, %r192;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p16, %r194, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r195, %r192, %r194;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r196, 0f00000000, %r195, %p16;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r197, %r196, %r193, %r185;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r198, %r189, %r191;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r199, %r193, %r193;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r200, %r182, %r199;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r201, %r196, %r200, %r198;
+$L__tmp10:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ]
+	shfl.sync.bfly.b32 	%r202, %r197, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r203, %r201, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r204, %r194, 1, 31, -1;
+$L__tmp11:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r205, %r202, %r197;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r23, %r194, %r204;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p17, %r23, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r206, %r204, %r23;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r207, 0f00000000, %r206, %p17;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r19, %r207, %r205, %r197;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r208, %r201, %r203;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r209, %r205, %r205;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r210, %r194, %r209;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r21, %r207, %r210, %r208;
+$L__tmp12:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ]
+	setp.eq.b32 	%p3, %r52, 0;
+	shr.u32 	%r211, %r50, 3;
+	and.b32 	%r212, %r211, 60;
+	mov.b32 	%r213, global_smem;
+	add.s32 	%r18, %r213, %r212;
+	// begin inline asm
+	@%p3 st.shared.b32 [ %r18 + 0 ], %r19;
+	// end inline asm
+	add.s32 	%r20, %r18, 64;
+	// begin inline asm
+	@%p3 st.shared.b32 [ %r20 + 0 ], %r21;
+	// end inline asm
+	add.s32 	%r22, %r18, 128;
+	// begin inline asm
+	@%p3 st.shared.b32 [ %r22 + 0 ], %r23;
+	// end inline asm
+	bar.sync 	0;
+	setp.lt.u32 	%p4, %r51, 16;
+	shl.b32 	%r214, %r51, 2;
+	add.s32 	%r25, %r213, %r214;
+	// begin inline asm
+	@%p4 ld.shared.b32 %r24, [ %r25 + 0 ];
+	// end inline asm
+	add.s32 	%r27, %r25, 64;
+	// begin inline asm
+	@%p4 ld.shared.b32 %r26, [ %r27 + 0 ];
+	// end inline asm
+	add.s32 	%r29, %r25, 128;
+	// begin inline asm
+	@%p4 ld.shared.b32 %r28, [ %r29 + 0 ];
+	// end inline asm
+	shfl.sync.bfly.b32 	%r215, %r24, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r216, %r26, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r217, %r28, 8, 31, -1;
+$L__tmp13:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r218, %r215, %r24;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r219, %r28, %r217;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p18, %r219, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r220, %r217, %r219;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r221, 0f00000000, %r220, %p18;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r222, %r218, %r221, %r24;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r223, %r26, %r216;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r224, %r218, %r218;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r225, %r224, %r28;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r226, %r225, %r221, %r223;
+$L__tmp14:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ]
+	shfl.sync.bfly.b32 	%r227, %r222, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r228, %r226, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r229, %r219, 4, 31, -1;
+$L__tmp15:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r230, %r227, %r222;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r231, %r219, %r229;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p19, %r231, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r232, %r229, %r231;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r233, 0f00000000, %r232, %p19;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r234, %r230, %r233, %r222;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r235, %r226, %r228;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r236, %r230, %r230;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r237, %r219, %r236;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r238, %r233, %r237, %r235;
+$L__tmp16:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ]
+	shfl.sync.bfly.b32 	%r239, %r234, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r240, %r238, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r241, %r231, 2, 31, -1;
+$L__tmp17:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r242, %r239, %r234;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r243, %r231, %r241;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p20, %r243, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r244, %r241, %r243;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r245, 0f00000000, %r244, %p20;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r246, %r242, %r245, %r234;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r247, %r238, %r240;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r248, %r242, %r242;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r249, %r231, %r248;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r250, %r245, %r249, %r247;
+$L__tmp18:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ]
+	shfl.sync.bfly.b32 	%r251, %r246, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r252, %r250, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r253, %r243, 1, 31, -1;
+$L__tmp19:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	sub.f32 	%r254, %r251, %r246;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r32, %r243, %r253;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	setp.eq.f32 	%p21, %r32, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	div.full.f32 	%r255, %r253, %r32;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	selp.f32 	%r256, 0f00000000, %r255, %p21;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r30, %r254, %r256, %r246;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	add.f32 	%r257, %r250, %r252;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r258, %r254, %r254;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	mul.f32 	%r259, %r243, %r258;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ] ]
+	fma.rn.f32 	%r31, %r256, %r259, %r257;
+$L__tmp20:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:52:80 ]
+	and.b32 	%r260, %r50, 15;
+	setp.eq.b32 	%p22, %r260, 0;
+	and.pred 	%p5, %p4, %p22;
+	// begin inline asm
+	@%p5 st.shared.b32 [ %r25 + 0 ], %r30;
+	// end inline asm
+	// begin inline asm
+	@%p5 st.shared.b32 [ %r27 + 0 ], %r31;
+	// end inline asm
+	// begin inline asm
+	@%p5 st.shared.b32 [ %r29 + 0 ], %r32;
+	// end inline asm
+	bar.sync 	0;
+	ld.shared.b32 	%r261, [global_smem];
+	ld.shared.b32 	%r262, [global_smem+64];
+$L__tmp21:
+	.loc	1 62 53                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:62:53
+	// begin inline asm
+	mov.u64 %rd8, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd8, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r33, %r5;
+	mov.u32 %r34, %r5;
+	mov.u32 %r35, %r5;
+	mov.u32 %r36, %r5;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd7 + 0 ], %rd8;
+	// end inline asm
+	.loc	1 63 35                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:35
+	add.s64 	%rd9, %rd17, %rd22;
+	.loc	1 63 42                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:42
+	// begin inline asm
+	mov.u64 %rd10, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd10, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r37, %r5;
+	mov.u32 %r38, %r5;
+	mov.u32 %r39, %r5;
+	mov.u32 %r40, %r5;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r37, %r38, %r39, %r40 }, [ %rd9 + 0 ], %rd10;
+	// end inline asm
+	.loc	1 64 35                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:35
+	add.s64 	%rd11, %rd18, %rd22;
+	.loc	1 64 42                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:42
+	// begin inline asm
+	mov.u64 %rd12, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd12, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r41, %r5;
+	mov.u32 %r42, %r5;
+	mov.u32 %r43, %r5;
+	mov.u32 %r44, %r5;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r41, %r42, %r43, %r44 }, [ %rd11 + 0 ], %rd12;
+	// end inline asm
+	mov.b32 	%r263, 0f45800000;
+	.loc	1 68 25                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:68:25
+	div.full.f32 	%r264, %r262, %r263;
+	.loc	1 70 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:70:24
+	add.f32 	%r265, %r264, 0f358637BD;
+	.loc	1 71 32                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:71:32
+	rsqrt.approx.ftz.f32 	%r266, %r265;
+	.loc	1 78 29                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:78:29
+	add.s64 	%rd13, %rd20, %rd21;
+	.loc	1 62 115                        // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:62:115
+	mov.b32 	{%rs25, %rs26}, %r33;
+	cvt.f32.bf16 	%r267, %rs26;
+	cvt.f32.bf16 	%r268, %rs25;
+	.loc	1 66 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:66:24
+	sub.f32 	%r269, %r268, %r261;
+	sub.f32 	%r270, %r267, %r261;
+	.loc	1 64 95                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:95
+	mov.b32 	{%rs27, %rs28}, %r41;
+	cvt.f32.bf16 	%r271, %rs28;
+	cvt.f32.bf16 	%r272, %rs27;
+	.loc	1 63 95                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:95
+	mov.b32 	{%rs29, %rs30}, %r37;
+	cvt.f32.bf16 	%r273, %rs29;
+	cvt.f32.bf16 	%r274, %rs30;
+	.loc	1 72 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:72:24
+	mul.f32 	%r275, %r270, %r266;
+	mul.f32 	%r276, %r269, %r266;
+	.loc	1 75 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:75:24
+	add.f32 	%r277, %r274, 0f3F800000;
+	add.f32 	%r278, %r273, 0f3F800000;
+	.loc	1 77 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:77:24
+	fma.rn.f32 	%r279, %r278, %r276, %r272;
+	fma.rn.f32 	%r280, %r277, %r275, %r271;
+	.loc	1 78 53                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:78:53
+	cvt.rn.bf16x2.f32 	%r45, %r280, %r279;
+	.loc	1 62 115                        // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:62:115
+	mov.b32 	{%rs31, %rs32}, %r34;
+	cvt.f32.bf16 	%r281, %rs32;
+	cvt.f32.bf16 	%r282, %rs31;
+	.loc	1 66 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:66:24
+	sub.f32 	%r283, %r282, %r261;
+	sub.f32 	%r284, %r281, %r261;
+	.loc	1 64 95                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:95
+	mov.b32 	{%rs33, %rs34}, %r42;
+	cvt.f32.bf16 	%r285, %rs34;
+	cvt.f32.bf16 	%r286, %rs33;
+	.loc	1 63 95                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:95
+	mov.b32 	{%rs35, %rs36}, %r38;
+	cvt.f32.bf16 	%r287, %rs35;
+	cvt.f32.bf16 	%r288, %rs36;
+	.loc	1 72 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:72:24
+	mul.f32 	%r289, %r284, %r266;
+	mul.f32 	%r290, %r283, %r266;
+	.loc	1 75 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:75:24
+	add.f32 	%r291, %r288, 0f3F800000;
+	add.f32 	%r292, %r287, 0f3F800000;
+	.loc	1 77 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:77:24
+	fma.rn.f32 	%r293, %r292, %r290, %r286;
+	fma.rn.f32 	%r294, %r291, %r289, %r285;
+	.loc	1 78 53                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:78:53
+	cvt.rn.bf16x2.f32 	%r46, %r294, %r293;
+	.loc	1 62 115                        // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:62:115
+	mov.b32 	{%rs37, %rs38}, %r35;
+	cvt.f32.bf16 	%r295, %rs38;
+	cvt.f32.bf16 	%r296, %rs37;
+	.loc	1 66 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:66:24
+	sub.f32 	%r297, %r296, %r261;
+	sub.f32 	%r298, %r295, %r261;
+	.loc	1 64 95                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:95
+	mov.b32 	{%rs39, %rs40}, %r43;
+	cvt.f32.bf16 	%r299, %rs40;
+	cvt.f32.bf16 	%r300, %rs39;
+	.loc	1 63 95                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:95
+	mov.b32 	{%rs41, %rs42}, %r39;
+	cvt.f32.bf16 	%r301, %rs41;
+	cvt.f32.bf16 	%r302, %rs42;
+	.loc	1 72 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:72:24
+	mul.f32 	%r303, %r298, %r266;
+	mul.f32 	%r304, %r297, %r266;
+	.loc	1 75 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:75:24
+	add.f32 	%r305, %r302, 0f3F800000;
+	add.f32 	%r306, %r301, 0f3F800000;
+	.loc	1 77 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:77:24
+	fma.rn.f32 	%r307, %r306, %r304, %r300;
+	fma.rn.f32 	%r308, %r305, %r303, %r299;
+	.loc	1 78 53                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:78:53
+	cvt.rn.bf16x2.f32 	%r47, %r308, %r307;
+	.loc	1 62 115                        // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:62:115
+	mov.b32 	{%rs43, %rs44}, %r36;
+	cvt.f32.bf16 	%r309, %rs44;
+	cvt.f32.bf16 	%r310, %rs43;
+	.loc	1 66 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:66:24
+	sub.f32 	%r311, %r310, %r261;
+	sub.f32 	%r312, %r309, %r261;
+	.loc	1 64 95                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:64:95
+	mov.b32 	{%rs45, %rs46}, %r44;
+	cvt.f32.bf16 	%r313, %rs46;
+	cvt.f32.bf16 	%r314, %rs45;
+	.loc	1 63 95                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:63:95
+	mov.b32 	{%rs47, %rs48}, %r40;
+	cvt.f32.bf16 	%r315, %rs47;
+	cvt.f32.bf16 	%r316, %rs48;
+	.loc	1 72 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:72:24
+	mul.f32 	%r317, %r312, %r266;
+	mul.f32 	%r318, %r311, %r266;
+	.loc	1 75 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:75:24
+	add.f32 	%r319, %r316, 0f3F800000;
+	add.f32 	%r320, %r315, 0f3F800000;
+	.loc	1 77 24                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:77:24
+	fma.rn.f32 	%r321, %r320, %r318, %r314;
+	fma.rn.f32 	%r322, %r319, %r317, %r313;
+	.loc	1 78 53                         // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:78:53
+	cvt.rn.bf16x2.f32 	%r48, %r322, %r321;
+	// begin inline asm
+	@%p1 st.global.v4.b32 [ %rd13 + 0 ], { %r45, %r46, %r47, %r48 };
+	// end inline asm
+	.loc	1 56 4                          // ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py:56:4
+	ret;
+$L__tmp22:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 343                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x150 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 97
+.b8 51
+.b8 109
+.b8 101
+.b8 110
+.b8 108
+.b8 102
+.b8 117
+.b8 108
+.b8 100
+.b8 116
+.b8 104
+.b8 103
+.b8 109
+.b8 110
+.b8 99
+.b8 102
+.b8 112
+.b8 106
+.b8 107
+.b8 52
+.b8 53
+.b8 50
+.b8 120
+.b8 107
+.b8 114
+.b8 111
+.b8 115
+.b8 55
+.b8 105
+.b8 100
+.b8 114
+.b8 109
+.b8 105
+.b8 108
+.b8 54
+.b8 112
+.b8 99
+.b8 111
+.b8 101
+.b8 105
+.b8 103
+.b8 114
+.b8 97
+.b8 121
+.b8 109
+.b8 99
+.b8 103
+.b8 52
+.b8 101
+.b8 54
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 97
+.b8 51
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x2f DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 97
+.b8 100
+.b8 100
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 110
+.b8 97
+.b8 116
+.b8 105
+.b8 118
+.b8 101
+.b8 95
+.b8 108
+.b8 97
+.b8 121
+.b8 101
+.b8 114
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x113:0x47 DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x128:0x31 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp21                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 52                                  // DW_AT_call_line
+.b8 80                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x140:0x18 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp20                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 243                                 // DW_AT_call_line
+.b8 46                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.source b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..96dc836a52c39d65411d4fbc041c8eda06e50f38
--- /dev/null
+++ b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.source
@@ -0,0 +1,486 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":18:0)
+#loc88 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":216:0)
+#loc101 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":133:0)
+#loc105 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":242:0)
+#loc107 = loc(unknown)
+#loc110 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":230:0)
+#loc125 = loc("in_ptr0"(#loc))
+#loc126 = loc("in_ptr1"(#loc))
+#loc127 = loc("in_ptr2"(#loc))
+#loc128 = loc("in_ptr3"(#loc))
+#loc129 = loc("in_ptr4"(#loc))
+#loc130 = loc("out_ptr0"(#loc))
+#loc131 = loc("out_ptr3"(#loc))
+#loc132 = loc("xnumel"(#loc))
+#loc133 = loc("r0_numel"(#loc))
+#loc201 = loc("value"(#loc88))
+#loc202 = loc("mean"(#loc88))
+#loc203 = loc("m2"(#loc88))
+#loc204 = loc("weight"(#loc88))
+#loc205 = loc("first_iteration"(#loc88))
+#loc215 = loc("input"(#loc101))
+#loc216 = loc("mean"(#loc105))
+#loc217 = loc("m2"(#loc105))
+#loc218 = loc("weight"(#loc105))
+#loc219 = loc("mean_1"(#loc110))
+#loc220 = loc("m2_1"(#loc110))
+#loc221 = loc("weight_1"(#loc110))
+#loc222 = loc("mean_2"(#loc110))
+#loc223 = loc("m2_2"(#loc110))
+#loc224 = loc("weight_2"(#loc110))
+#loc231 = loc("new_mean"(#loc201))
+module {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 256 : i32 loc(#loc134)
+    %r0_numel_1 = arith.constant 4096 : i32 loc(#loc135)
+    %xoffset = tt.get_program_id x : i32 loc(#loc136)
+    %xoffset_2 = arith.constant 1 : i32 loc(#loc137)
+    %xoffset_3 = arith.constant 1 : i32 loc(#loc137)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc137)
+    %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc138)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc139)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc140)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc140)
+    %xmask = arith.constant dense<256> : tensor<1x1xi32> loc(#loc141)
+    %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc141)
+    %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32> loc(#loc142)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32> -> tensor<1x4096xi32> loc(#loc143)
+    %tmp7_mean = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc144)
+    %tmp7_m2 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc145)
+    %tmp7_weight = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc146)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc14)
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc14)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14)
+    %2 = arith.bitcast %c4096_i32 : i32 to i32 loc(#loc14)
+    %3 = ub.poison : i32 loc(#loc14)
+    %tmp7_weight_10:3 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%tmp7_mean_13 = %tmp7_mean, %tmp7_m2_14 = %tmp7_m2, %tmp7_weight_15 = %tmp7_weight) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4096xi32> loc(#loc148)
+      %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x4096xi32> loc(#loc148)
+      %r0_mask = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc149)
+      %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x4096xi32> loc(#loc149)
+      %tmp0 = arith.constant 4096 : i32 loc(#loc150)
+      %tmp0_18 = arith.constant 4096 : i32 loc(#loc150)
+      %tmp0_19 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc150)
+      %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc150)
+      %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc151)
+      %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x4096xi32> loc(#loc151)
+      %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc152)
+      %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc152)
+      %tmp0_25 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc153)
+      %tmp0_26 = arith.andi %r0_mask_17, %tmp0_25 : tensor<1x4096xi1> loc(#loc153)
+      %tmp0_27 = arith.constant 0.000000e+00 : f32 loc(#loc154)
+      %tmp0_28 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc154)
+      %tmp0_29 = arith.truncf %tmp0_28 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc154)
+      %tmp0_30 = tt.load %tmp0_24, %tmp0_26, %tmp0_29 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>> loc(#loc154)
+      %tmp0_31 = arith.extf %tmp0_30 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc155)
+      %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc156)
+      %tmp1_32 = tt.addptr %tmp1, %r0_index_16 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc156)
+      %tmp1_33 = arith.constant 0.000000e+00 : f32 loc(#loc157)
+      %tmp1_34 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc157)
+      %tmp1_35 = arith.truncf %tmp1_34 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc157)
+      %tmp1_36 = tt.load %tmp1_32, %r0_mask_17, %tmp1_35 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc157)
+      %tmp1_37 = arith.extf %tmp1_36 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc158)
+      %tmp2 = arith.constant 4096 : i32 loc(#loc159)
+      %tmp2_38 = arith.constant 4096 : i32 loc(#loc159)
+      %tmp2_39 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc159)
+      %tmp2_40 = arith.muli %tmp2_39, %xindex_7 : tensor<1x1xi32> loc(#loc159)
+      %tmp2_41 = tt.broadcast %tmp2_40 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc160)
+      %tmp2_42 = arith.addi %r0_index_16, %tmp2_41 : tensor<1x4096xi32> loc(#loc160)
+      %tmp2_43 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc161)
+      %tmp2_44 = tt.addptr %tmp2_43, %tmp2_42 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc161)
+      %tmp2_45 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc162)
+      %tmp2_46 = arith.andi %r0_mask_17, %tmp2_45 : tensor<1x4096xi1> loc(#loc162)
+      %tmp2_47 = arith.constant 0.000000e+00 : f32 loc(#loc163)
+      %tmp2_48 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc163)
+      %tmp2_49 = arith.truncf %tmp2_48 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc163)
+      %tmp2_50 = tt.load %tmp2_44, %tmp2_46, %tmp2_49 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>> loc(#loc163)
+      %tmp2_51 = arith.extf %tmp2_50 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc164)
+      %tmp3 = arith.mulf %tmp1_37, %tmp2_51 : tensor<1x4096xf32> loc(#loc165)
+      %tmp4 = arith.addf %tmp0_31, %tmp3 : tensor<1x4096xf32> loc(#loc166)
+      %c0_i32_52 = arith.constant 0 : i32 loc(#loc34)
+      %9 = arith.cmpi eq, %r0_offset, %c0_i32_52 : i32 loc(#loc34)
+      %10:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_u1__(%tmp4, %tmp7_mean_13, %tmp7_m2_14, %tmp7_weight_15, %9) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>, i1) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) loc(#loc35)
+      %tmp7_mean_53 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc167)
+      %tmp7_mean_54 = arith.andi %r0_mask_17, %tmp7_mean_53 : tensor<1x4096xi1> loc(#loc167)
+      %tmp7_mean_55 = arith.select %tmp7_mean_54, %10#0, %tmp7_mean_13 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc168)
+      %tmp7_m2_56 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc169)
+      %tmp7_m2_57 = arith.andi %r0_mask_17, %tmp7_m2_56 : tensor<1x4096xi1> loc(#loc169)
+      %tmp7_m2_58 = arith.select %tmp7_m2_57, %10#1, %tmp7_m2_14 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc170)
+      %tmp7_weight_59 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc171)
+      %tmp7_weight_60 = arith.andi %r0_mask_17, %tmp7_weight_59 : tensor<1x4096xi1> loc(#loc171)
+      %tmp7_weight_61 = arith.select %tmp7_weight_60, %10#2, %tmp7_weight_15 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc172)
+      %c4096_i32_62 = arith.constant 4096 : i32 loc(#loc42)
+      %c4096_i32_63 = arith.constant 4096 : i32 loc(#loc42)
+      %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc42)
+      %11 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc42)
+      %12 = tt.broadcast %11 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc43)
+      %13 = arith.addi %r0_index_16, %12 : tensor<1x4096xi32> loc(#loc43)
+      %14 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc44)
+      %15 = tt.addptr %14, %13 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc44)
+      %16 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc45)
+      %17 = arith.andi %r0_mask_17, %16 : tensor<1x4096xi1> loc(#loc45)
+      %18 = arith.truncf %tmp4 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc46)
+      tt.store %15, %18, %17 : tensor<1x4096x!tt.ptr<bf16>> loc(#loc46)
+      scf.yield %tmp7_mean_55, %tmp7_m2_58, %tmp7_weight_61 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc47)
+    } loc(#loc237)
+    %4:3 = tt.call @"torch._inductor.runtime.triton_helpers.welford__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S__(3,)cconstexpr_1_"(%tmp7_weight_10#0, %tmp7_weight_10#1, %tmp7_weight_10#2) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc48)
+    %tmp7 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc173)
+    %tmp11 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc174)
+    %tmp12 = tt.expand_dims %4#2 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc175)
+    %c0_i32_11 = arith.constant 0 : i32 loc(#loc52)
+    %c4096_i32_12 = arith.constant 4096 : i32 loc(#loc52)
+    %5 = arith.bitcast %c0_i32_11 : i32 to i32 loc(#loc52)
+    %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc52)
+    %7 = arith.bitcast %c4096_i32_12 : i32 to i32 loc(#loc52)
+    %8 = ub.poison : i32 loc(#loc52)
+    scf.for %r0_offset = %5 to %6 step %7  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4096xi32> loc(#loc176)
+      %r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x4096xi32> loc(#loc176)
+      %r0_mask = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc177)
+      %r0_mask_14 = arith.cmpi slt, %r0_index_13, %r0_mask : tensor<1x4096xi32> loc(#loc177)
+      %tmp13 = arith.constant 4096 : i32 loc(#loc178)
+      %tmp13_15 = arith.constant 4096 : i32 loc(#loc178)
+      %tmp13_16 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc178)
+      %tmp13_17 = arith.muli %tmp13_16, %xindex_7 : tensor<1x1xi32> loc(#loc178)
+      %tmp13_18 = tt.broadcast %tmp13_17 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc179)
+      %tmp13_19 = arith.addi %r0_index_13, %tmp13_18 : tensor<1x4096xi32> loc(#loc179)
+      %tmp13_20 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc180)
+      %tmp13_21 = tt.addptr %tmp13_20, %tmp13_19 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc180)
+      %tmp13_22 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc181)
+      %tmp13_23 = arith.andi %r0_mask_14, %tmp13_22 : tensor<1x4096xi1> loc(#loc181)
+      %tmp13_24 = arith.constant 0.000000e+00 : f32 loc(#loc182)
+      %tmp13_25 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc182)
+      %tmp13_26 = arith.truncf %tmp13_25 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc182)
+      %tmp13_27 = tt.load %tmp13_21, %tmp13_23, %tmp13_26 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>> loc(#loc182)
+      %tmp13_28 = arith.extf %tmp13_27 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc183)
+      %tmp23 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc184)
+      %tmp23_29 = tt.addptr %tmp23, %r0_index_13 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc184)
+      %tmp23_30 = arith.constant 0.000000e+00 : f32 loc(#loc185)
+      %tmp23_31 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc185)
+      %tmp23_32 = arith.truncf %tmp23_31 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc185)
+      %tmp23_33 = tt.load %tmp23_29, %r0_mask_14, %tmp23_32 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc185)
+      %tmp23_34 = arith.extf %tmp23_33 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc186)
+      %tmp27 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc187)
+      %tmp27_35 = tt.addptr %tmp27, %r0_index_13 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc187)
+      %tmp27_36 = arith.constant 0.000000e+00 : f32 loc(#loc188)
+      %tmp27_37 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc188)
+      %tmp27_38 = arith.truncf %tmp27_37 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc188)
+      %tmp27_39 = tt.load %tmp27_35, %r0_mask_14, %tmp27_38 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc188)
+      %tmp27_40 = arith.extf %tmp27_39 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc189)
+      %tmp15 = tt.broadcast %tmp7 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc190)
+      %tmp15_41 = arith.subf %tmp13_28, %tmp15 : tensor<1x4096xf32> loc(#loc190)
+      %tmp16 = arith.constant 4.096000e+03 : f32 loc(#loc191)
+      %tmp17 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc192)
+      %tmp17_42 = arith.divf %tmp11, %tmp17 : tensor<1x1xf32> loc(#loc192)
+      %tmp18 = arith.constant 9.99999997E-7 : f32 loc(#loc193)
+      %tmp19 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc194)
+      %tmp19_43 = arith.addf %tmp17_42, %tmp19 : tensor<1x1xf32> loc(#loc194)
+      %tmp20 = tt.extern_elementwise %tmp19_43 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc195)
+      %tmp21 = tt.broadcast %tmp20 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc196)
+      %tmp21_44 = arith.mulf %tmp15_41, %tmp21 : tensor<1x4096xf32> loc(#loc196)
+      %tmp24 = arith.constant 1.000000e+00 : f32 loc(#loc197)
+      %tmp25 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc198)
+      %tmp25_45 = arith.addf %tmp23_34, %tmp25 : tensor<1x4096xf32> loc(#loc198)
+      %tmp26 = arith.mulf %tmp21_44, %tmp25_45 : tensor<1x4096xf32> loc(#loc199)
+      %tmp28 = arith.addf %tmp26, %tmp27_40 : tensor<1x4096xf32> loc(#loc200)
+      %c4096_i32_46 = arith.constant 4096 : i32 loc(#loc78)
+      %c4096_i32_47 = arith.constant 4096 : i32 loc(#loc78)
+      %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc78)
+      %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc78)
+      %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc79)
+      %11 = arith.addi %r0_index_13, %10 : tensor<1x4096xi32> loc(#loc79)
+      %12 = tt.splat %out_ptr3 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc80)
+      %13 = tt.addptr %12, %11 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc80)
+      %14 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc81)
+      %15 = arith.andi %r0_mask_14, %14 : tensor<1x4096xi1> loc(#loc81)
+      %16 = arith.truncf %tmp28 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc82)
+      tt.store %13, %16, %15 : tensor<1x4096x!tt.ptr<bf16>> loc(#loc82)
+    } loc(#loc52)
+    tt.return loc(#loc83)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() -> tensor<1x4096xf32> attributes {noinline = false} {
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc85)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc85)
+    tt.return %cst_0 : tensor<1x4096xf32> loc(#loc86)
+  ^bb1:  // no predecessors
+    %0 = ub.poison : tensor<1x4096xf32> loc(#loc87)
+    tt.return %0 : tensor<1x4096xf32> loc(#loc87)
+  } loc(#loc84)
+  tt.func private @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_u1__(%new_mean: tensor<1x4096xf32> loc("new_mean"(#loc201)), %mean: tensor<1x4096xf32> loc("mean"(#loc88)), %m2: tensor<1x4096xf32> loc("m2"(#loc88)), %weight: tensor<1x4096xf32> loc("weight"(#loc88)), %first_iteration: i1 loc("first_iteration"(#loc88))) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) attributes {noinline = false} {
+    %0:3 = scf.if %first_iteration -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) {
+      %new_weight = arith.constant 1.000000e+00 : f32 loc(#loc206)
+      %new_weight_0 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc232)
+      %new_m2 = tt.call @triton.language.standard.zeros_like__fp32S1_4096S__(%m2) : (tensor<1x4096xf32>) -> tensor<1x4096xf32> loc(#loc233)
+      scf.yield %new_m2, %new_mean, %new_weight_0 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc233)
+    } else {
+      %delta = arith.subf %new_mean, %mean : tensor<1x4096xf32> loc(#loc208)
+      %new_weight = arith.constant 1 : i32 loc(#loc209)
+      %new_weight_0 = arith.constant 1.000000e+00 : f32 loc(#loc209)
+      %new_weight_1 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc209)
+      %new_weight_2 = arith.addf %weight, %new_weight_1 : tensor<1x4096xf32> loc(#loc234)
+      %new_mean_3 = arith.divf %delta, %new_weight_2 : tensor<1x4096xf32> loc(#loc210)
+      %new_mean_4 = arith.addf %mean, %new_mean_3 : tensor<1x4096xf32> loc(#loc235)
+      %new_m2 = arith.subf %new_mean, %new_mean_4 : tensor<1x4096xf32> loc(#loc212)
+      %new_m2_5 = arith.mulf %delta, %new_m2 : tensor<1x4096xf32> loc(#loc213)
+      %new_m2_6 = arith.addf %m2, %new_m2_5 : tensor<1x4096xf32> loc(#loc236)
+      scf.yield %new_m2_6, %new_mean_4, %new_weight_2 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc214)
+    } loc(#loc89)
+    tt.return %0#1, %0#0, %0#2 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc99)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1x4096xf32> loc(#loc100)
+    %2 = ub.poison : tensor<1x4096xf32> loc(#loc100)
+    %3 = ub.poison : tensor<1x4096xf32> loc(#loc100)
+    tt.return %1, %2, %3 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc100)
+  } loc(#loc88)
+  tt.func private @triton.language.standard.zeros_like__fp32S1_4096S__(%input: tensor<1x4096xf32> loc("input"(#loc101))) -> tensor<1x4096xf32> attributes {noinline = false} {
+    %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc102)
+    tt.return %0 : tensor<1x4096xf32> loc(#loc103)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1x4096xf32> loc(#loc104)
+    tt.return %1 : tensor<1x4096xf32> loc(#loc104)
+  } loc(#loc101)
+  tt.func private @"torch._inductor.runtime.triton_helpers.welford__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S__(3,)cconstexpr_1_"(%mean: tensor<1x4096xf32> loc("mean"(#loc105)), %m2: tensor<1x4096xf32> loc("m2"(#loc105)), %weight: tensor<1x4096xf32> loc("weight"(#loc105))) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} {
+    %0:3 = "tt.reduce"(%mean, %m2, %weight) <{axis = 1 : i32}> ({
+    ^bb0(%arg3: f32 loc(unknown), %arg4: f32 loc(unknown), %arg5: f32 loc(unknown), %arg6: f32 loc(unknown), %arg7: f32 loc(unknown), %arg8: f32 loc(unknown)):
+      %4:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (f32, f32, f32, f32, f32, f32) -> (f32, f32, f32) loc(#loc106)
+      tt.reduce.return %4#0, %4#1, %4#2 : f32, f32, f32 loc(#loc106)
+    }) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc106)
+    tt.return %0#0, %0#1, %0#2 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc108)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1xf32> loc(#loc109)
+    %2 = ub.poison : tensor<1xf32> loc(#loc109)
+    %3 = ub.poison : tensor<1xf32> loc(#loc109)
+    tt.return %1, %2, %3 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc109)
+  } loc(#loc105)
+  tt.func private @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%mean_1: f32 loc("mean_1"(#loc110)), %m2_1: f32 loc("m2_1"(#loc110)), %weight_1: f32 loc("weight_1"(#loc110)), %mean_2: f32 loc("mean_2"(#loc110)), %m2_2: f32 loc("m2_2"(#loc110)), %weight_2: f32 loc("weight_2"(#loc110))) -> (f32, f32, f32) attributes {noinline = false} {
+    %delta = arith.subf %mean_2, %mean_1 : f32 loc(#loc225)
+    %new_weight = arith.addf %weight_1, %weight_2 : f32 loc(#loc226)
+    %w2_over_w = arith.constant 0.000000e+00 : f32 loc(#loc227)
+    %w2_over_w_0 = arith.cmpf oeq, %new_weight, %w2_over_w : f32 loc(#loc227)
+    %w2_over_w_1 = arith.divf %weight_2, %new_weight : f32 loc(#loc228)
+    %w2_over_w_2 = arith.constant 0.000000e+00 : f32 loc(#loc229)
+    %w2_over_w_3 = arith.constant 0.000000e+00 : f32 loc(#loc229)
+    %w2_over_w_4 = arith.select %w2_over_w_0, %w2_over_w_3, %w2_over_w_1 : f32 loc(#loc229)
+    %0 = arith.mulf %delta, %w2_over_w_4 : f32 loc(#loc116)
+    %1 = arith.addf %mean_1, %0 : f32 loc(#loc117)
+    %2 = arith.addf %m2_1, %m2_2 : f32 loc(#loc118)
+    %3 = arith.mulf %delta, %delta : f32 loc(#loc119)
+    %4 = arith.mulf %3, %weight_1 : f32 loc(#loc120)
+    %5 = arith.mulf %4, %w2_over_w_4 : f32 loc(#loc121)
+    %6 = arith.addf %2, %5 : f32 loc(#loc122)
+    tt.return %1, %6, %new_weight : f32, f32, f32 loc(#loc123)
+  ^bb1:  // no predecessors
+    %7 = ub.poison : f32 loc(#loc124)
+    %8 = ub.poison : f32 loc(#loc124)
+    %9 = ub.poison : f32 loc(#loc124)
+    tt.return %7, %8, %9 : f32, f32, f32 loc(#loc124)
+  } loc(#loc110)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":25:21)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":29:45)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":30:43)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":31:47)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:46)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:34)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:61)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:51)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:113)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:34)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:41)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:94)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:46)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:41)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:34)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:61)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:51)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:113)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":41:22)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":42:22)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":46:62)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":46:51)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":48:39)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":48:62)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":49:37)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":49:58)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":50:41)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":50:66)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:41)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:36)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:29)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:62)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:52)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:8)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":52:80)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":53:16)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":54:17)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":55:18)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":56:43)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":57:31)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":58:29)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:48)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:43)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:36)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:63)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:53)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:115)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:35)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:42)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:95)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:35)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:42)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:95)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":66:24)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":67:16)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":68:25)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":69:16)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":70:24)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":71:32)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":72:24)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":74:16)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":75:24)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":76:24)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":77:24)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:41)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:36)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:29)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:63)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:53)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":56:4)
+#loc84 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":120:0)
+#loc85 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:31)
+#loc86 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:11)
+#loc87 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:4)
+#loc89 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7)
+#loc90 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":218:46)
+#loc91 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31)
+#loc92 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24)
+#loc93 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30)
+#loc94 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34)
+#loc95 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26)
+#loc96 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39)
+#loc97 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31)
+#loc98 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22)
+#loc99 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:11)
+#loc100 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:4)
+#loc102 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:30)
+#loc103 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:11)
+#loc104 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:4)
+#loc106 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc108 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:11)
+#loc109 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:4)
+#loc111 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc112 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc113 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc114 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc115 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc116 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc117 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc118 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc119 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc120 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc121 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc122 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc123 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:11)
+#loc124 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:4)
+#loc134 = loc("xnumel"(#loc1))
+#loc135 = loc("r0_numel"(#loc2))
+#loc136 = loc("xoffset"(#loc3))
+#loc137 = loc("xoffset"(#loc4))
+#loc138 = loc("xindex"(#loc5))
+#loc139 = loc("xindex"(#loc6))
+#loc140 = loc("xindex"(#loc7))
+#loc141 = loc("xmask"(#loc8))
+#loc142 = loc("r0_base"(#loc9))
+#loc143 = loc("r0_base"(#loc10))
+#loc144 = loc("tmp7_mean"(#loc11))
+#loc145 = loc("tmp7_m2"(#loc12))
+#loc146 = loc("tmp7_weight"(#loc13))
+#loc147 = loc("tmp7_mean"(#loc14))
+#loc148 = loc("r0_index"(#loc15))
+#loc149 = loc("r0_mask"(#loc16))
+#loc150 = loc("tmp0"(#loc17))
+#loc151 = loc("tmp0"(#loc18))
+#loc152 = loc("tmp0"(#loc19))
+#loc153 = loc("tmp0"(#loc20))
+#loc154 = loc("tmp0"(#loc21))
+#loc155 = loc("tmp0"(#loc22))
+#loc156 = loc("tmp1"(#loc23))
+#loc157 = loc("tmp1"(#loc24))
+#loc158 = loc("tmp1"(#loc25))
+#loc159 = loc("tmp2"(#loc26))
+#loc160 = loc("tmp2"(#loc27))
+#loc161 = loc("tmp2"(#loc28))
+#loc162 = loc("tmp2"(#loc29))
+#loc163 = loc("tmp2"(#loc30))
+#loc164 = loc("tmp2"(#loc31))
+#loc165 = loc("tmp3"(#loc32))
+#loc166 = loc("tmp4"(#loc33))
+#loc167 = loc("tmp7_mean"(#loc36))
+#loc168 = loc("tmp7_mean"(#loc37))
+#loc169 = loc("tmp7_m2"(#loc38))
+#loc170 = loc("tmp7_m2"(#loc39))
+#loc171 = loc("tmp7_weight"(#loc40))
+#loc172 = loc("tmp7_weight"(#loc41))
+#loc173 = loc("tmp7"(#loc49))
+#loc174 = loc("tmp11"(#loc50))
+#loc175 = loc("tmp12"(#loc51))
+#loc176 = loc("r0_index"(#loc53))
+#loc177 = loc("r0_mask"(#loc54))
+#loc178 = loc("tmp13"(#loc55))
+#loc179 = loc("tmp13"(#loc56))
+#loc180 = loc("tmp13"(#loc57))
+#loc181 = loc("tmp13"(#loc58))
+#loc182 = loc("tmp13"(#loc59))
+#loc183 = loc("tmp13"(#loc60))
+#loc184 = loc("tmp23"(#loc61))
+#loc185 = loc("tmp23"(#loc62))
+#loc186 = loc("tmp23"(#loc63))
+#loc187 = loc("tmp27"(#loc64))
+#loc188 = loc("tmp27"(#loc65))
+#loc189 = loc("tmp27"(#loc66))
+#loc190 = loc("tmp15"(#loc67))
+#loc191 = loc("tmp16"(#loc68))
+#loc192 = loc("tmp17"(#loc69))
+#loc193 = loc("tmp18"(#loc70))
+#loc194 = loc("tmp19"(#loc71))
+#loc195 = loc("tmp20"(#loc72))
+#loc196 = loc("tmp21"(#loc73))
+#loc197 = loc("tmp24"(#loc74))
+#loc198 = loc("tmp25"(#loc75))
+#loc199 = loc("tmp26"(#loc76))
+#loc200 = loc("tmp28"(#loc77))
+#loc206 = loc("new_weight"(#loc90))
+#loc207 = loc("new_m2"(#loc91))
+#loc208 = loc("delta"(#loc92))
+#loc209 = loc("new_weight"(#loc93))
+#loc210 = loc("new_mean"(#loc94))
+#loc211 = loc("new_mean"(#loc95))
+#loc212 = loc("new_m2"(#loc96))
+#loc213 = loc("new_m2"(#loc97))
+#loc214 = loc("new_m2"(#loc98))
+#loc225 = loc("delta"(#loc111))
+#loc226 = loc("new_weight"(#loc112))
+#loc227 = loc("w2_over_w"(#loc113))
+#loc228 = loc("w2_over_w"(#loc114))
+#loc229 = loc("w2_over_w"(#loc115))
+#loc230 = loc("tmp7_m2"(#loc147))
+#loc232 = loc("new_weight"(#loc206))
+#loc233 = loc("new_m2"(#loc207))
+#loc234 = loc("new_weight"(#loc209))
+#loc235 = loc("new_mean"(#loc211))
+#loc236 = loc("new_m2"(#loc214))
+#loc237 = loc("tmp7_weight"(#loc230))
diff --git a/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..70f0de11d71b911cb835e273e5d7d12e26cbb0e5
--- /dev/null
+++ b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir
@@ -0,0 +1,214 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":18:0)
+#loc1 = loc(unknown)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":52:80)
+#loc59 = loc("in_ptr0"(#loc))
+#loc60 = loc("in_ptr1"(#loc))
+#loc61 = loc("in_ptr2"(#loc))
+#loc62 = loc("in_ptr3"(#loc))
+#loc63 = loc("in_ptr4"(#loc))
+#loc64 = loc("out_ptr0"(#loc))
+#loc65 = loc("out_ptr3"(#loc))
+#loc66 = loc("xnumel"(#loc))
+#loc67 = loc("r0_numel"(#loc))
+#loc89 = loc(callsite(#loc1 at #loc25))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4096> : tensor<1x4096xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xbf16, #blocked> loc(#loc1)
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
+    %c256_i32 = arith.constant 256 : i32 loc(#loc1)
+    %cst_1 = arith.constant 0.000000e+00 : f32 loc(#loc1)
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32, #blocked> loc(#loc1)
+    %cst_5 = arith.constant dense<4.096000e+03> : tensor<1x1xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc68)
+    %xmask = arith.cmpi slt, %xoffset, %c256_i32 : i32 loc(#loc69)
+    %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc70)
+    %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4096xi32, #blocked> loc(#loc70)
+    %r0_mask = arith.cmpi slt, %r0_base_6, %cst : tensor<1x4096xi32, #blocked> loc(#loc71)
+    %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc72)
+    %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x4096xi32, #blocked> loc(#loc113)
+    %tmp0_8 = arith.addi %r0_base_6, %tmp0_7 : tensor<1x4096xi32, #blocked> loc(#loc73)
+    %tmp0_9 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc74)
+    %tmp0_10 = tt.addptr %tmp0_9, %tmp0_8 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc74)
+    %tmp0_11 = tt.splat %xmask : i1 -> tensor<1x4096xi1, #blocked> loc(#loc114)
+    %tmp0_12 = arith.andi %r0_mask, %tmp0_11 : tensor<1x4096xi1, #blocked> loc(#loc75)
+    %tmp0_13 = tt.load %tmp0_10, %tmp0_12, %cst_0 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc76)
+    %tmp0_14 = arith.extf %tmp0_13 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc77)
+    %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc78)
+    %tmp1_15 = tt.addptr %tmp1, %r0_base_6 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc78)
+    %tmp1_16 = tt.load %tmp1_15, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc79)
+    %tmp1_17 = arith.extf %tmp1_16 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc80)
+    %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc81)
+    %tmp2_18 = tt.addptr %tmp2, %tmp0_8 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc81)
+    %tmp2_19 = tt.load %tmp2_18, %tmp0_12, %cst_0 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc82)
+    %tmp2_20 = arith.extf %tmp2_19 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc83)
+    %tmp3 = arith.mulf %tmp1_17, %tmp2_20 : tensor<1x4096xf32, #blocked> loc(#loc84)
+    %tmp4 = arith.addf %tmp0_14, %tmp3 : tensor<1x4096xf32, #blocked> loc(#loc85)
+    %tmp7_mean = arith.select %tmp0_12, %tmp4, %cst_2 : tensor<1x4096xi1, #blocked>, tensor<1x4096xf32, #blocked> loc(#loc86)
+    %tmp7_weight = arith.select %tmp0_12, %cst_3, %cst_2 : tensor<1x4096xi1, #blocked>, tensor<1x4096xf32, #blocked> loc(#loc87)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc22)
+    %1 = tt.addptr %0, %tmp0_8 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc22)
+    %2 = arith.truncf %tmp4 : tensor<1x4096xf32, #blocked> to tensor<1x4096xbf16, #blocked> loc(#loc23)
+    tt.store %1, %2, %tmp0_12 : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc23)
+    %3:3 = "tt.reduce"(%tmp7_mean, %cst_2, %tmp7_weight) <{axis = 1 : i32}> ({
+    ^bb0(%arg9: f32 loc(callsite(#loc1 at #loc25)), %arg10: f32 loc(callsite(#loc1 at #loc25)), %arg11: f32 loc(callsite(#loc1 at #loc25)), %arg12: f32 loc(callsite(#loc1 at #loc25)), %arg13: f32 loc(callsite(#loc1 at #loc25)), %arg14: f32 loc(callsite(#loc1 at #loc25))):
+      %delta = arith.subf %arg12, %arg9 : f32 loc(#loc115)
+      %new_weight = arith.addf %arg11, %arg14 : f32 loc(#loc116)
+      %w2_over_w = arith.cmpf oeq, %new_weight, %cst_1 : f32 loc(#loc117)
+      %w2_over_w_30 = arith.divf %arg14, %new_weight : f32 loc(#loc118)
+      %w2_over_w_31 = arith.select %w2_over_w, %cst_1, %w2_over_w_30 : f32 loc(#loc119)
+      %7 = arith.mulf %delta, %w2_over_w_31 : f32 loc(#loc120)
+      %8 = arith.addf %arg9, %7 : f32 loc(#loc121)
+      %9 = arith.addf %arg10, %arg13 : f32 loc(#loc122)
+      %10 = arith.mulf %delta, %delta : f32 loc(#loc123)
+      %11 = arith.mulf %10, %arg11 : f32 loc(#loc124)
+      %12 = arith.mulf %11, %w2_over_w_31 : f32 loc(#loc125)
+      %13 = arith.addf %9, %12 : f32 loc(#loc126)
+      tt.reduce.return %8, %13, %new_weight : f32, f32, f32 loc(#loc88)
+    }) : (tensor<1x4096xf32, #blocked>, tensor<1x4096xf32, #blocked>, tensor<1x4096xf32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc88)
+    %tmp7 = tt.expand_dims %3#0 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc95)
+    %tmp11 = tt.expand_dims %3#1 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc96)
+    %tmp13 = tt.load %1, %tmp0_12, %cst_0 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc97)
+    %tmp13_21 = arith.extf %tmp13 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc98)
+    %tmp23 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc99)
+    %tmp23_22 = tt.addptr %tmp23, %r0_base_6 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc99)
+    %tmp23_23 = tt.load %tmp23_22, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc100)
+    %tmp23_24 = arith.extf %tmp23_23 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc101)
+    %tmp27 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc102)
+    %tmp27_25 = tt.addptr %tmp27, %r0_base_6 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc102)
+    %tmp27_26 = tt.load %tmp27_25, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc103)
+    %tmp27_27 = arith.extf %tmp27_26 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc104)
+    %tmp15 = tt.broadcast %tmp7 : tensor<1x1xf32, #blocked> -> tensor<1x4096xf32, #blocked> loc(#loc105)
+    %tmp15_28 = arith.subf %tmp13_21, %tmp15 : tensor<1x4096xf32, #blocked> loc(#loc105)
+    %tmp17 = arith.divf %tmp11, %cst_5 : tensor<1x1xf32, #blocked> loc(#loc106)
+    %tmp19 = arith.addf %tmp17, %cst_4 : tensor<1x1xf32, #blocked> loc(#loc107)
+    %tmp20 = tt.extern_elementwise %tmp19 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked>) -> tensor<1x1xf32, #blocked> loc(#loc108)
+    %tmp21 = tt.broadcast %tmp20 : tensor<1x1xf32, #blocked> -> tensor<1x4096xf32, #blocked> loc(#loc109)
+    %tmp21_29 = arith.mulf %tmp15_28, %tmp21 : tensor<1x4096xf32, #blocked> loc(#loc109)
+    %tmp25 = arith.addf %tmp23_24, %cst_3 : tensor<1x4096xf32, #blocked> loc(#loc110)
+    %tmp26 = arith.mulf %tmp21_29, %tmp25 : tensor<1x4096xf32, #blocked> loc(#loc111)
+    %tmp28 = arith.addf %tmp26, %tmp27_27 : tensor<1x4096xf32, #blocked> loc(#loc112)
+    %4 = tt.splat %out_ptr3 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc56)
+    %5 = tt.addptr %4, %tmp0_8 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc56)
+    %6 = arith.truncf %tmp28 : tensor<1x4096xf32, #blocked> to tensor<1x4096xbf16, #blocked> loc(#loc57)
+    tt.store %5, %6, %tmp0_12 : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc57)
+    tt.return loc(#loc58)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":25:21)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":26:37)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":34:29)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:46)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:41)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:34)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:61)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:51)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:113)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:34)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:41)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:94)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:34)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:51)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:113)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":41:22)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":42:22)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":48:62)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":50:66)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:29)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:52)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc28 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc29 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc30 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc35 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc37 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":53:16)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":54:17)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:53)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:115)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:35)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:42)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:95)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:35)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:42)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:95)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":66:24)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":68:25)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":70:24)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":71:32)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":72:24)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":75:24)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":76:24)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":77:24)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:29)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:53)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":56:4)
+#loc68 = loc("xoffset"(#loc2))
+#loc69 = loc("xmask"(#loc3))
+#loc70 = loc("r0_base"(#loc4))
+#loc71 = loc("r0_mask"(#loc5))
+#loc72 = loc("tmp0"(#loc6))
+#loc73 = loc("tmp0"(#loc7))
+#loc74 = loc("tmp0"(#loc8))
+#loc75 = loc("tmp0"(#loc9))
+#loc76 = loc("tmp0"(#loc10))
+#loc77 = loc("tmp0"(#loc11))
+#loc78 = loc("tmp1"(#loc12))
+#loc79 = loc("tmp1"(#loc13))
+#loc80 = loc("tmp1"(#loc14))
+#loc81 = loc("tmp2"(#loc15))
+#loc82 = loc("tmp2"(#loc16))
+#loc83 = loc("tmp2"(#loc17))
+#loc84 = loc("tmp3"(#loc18))
+#loc85 = loc("tmp4"(#loc19))
+#loc86 = loc("tmp7_mean"(#loc20))
+#loc87 = loc("tmp7_weight"(#loc21))
+#loc88 = loc(callsite(#loc24 at #loc25))
+#loc90 = loc("delta"(#loc26))
+#loc91 = loc("new_weight"(#loc27))
+#loc92 = loc("w2_over_w"(#loc28))
+#loc93 = loc("w2_over_w"(#loc29))
+#loc94 = loc("w2_over_w"(#loc30))
+#loc95 = loc("tmp7"(#loc38))
+#loc96 = loc("tmp11"(#loc39))
+#loc97 = loc("tmp13"(#loc40))
+#loc98 = loc("tmp13"(#loc41))
+#loc99 = loc("tmp23"(#loc42))
+#loc100 = loc("tmp23"(#loc43))
+#loc101 = loc("tmp23"(#loc44))
+#loc102 = loc("tmp27"(#loc45))
+#loc103 = loc("tmp27"(#loc46))
+#loc104 = loc("tmp27"(#loc47))
+#loc105 = loc("tmp15"(#loc48))
+#loc106 = loc("tmp17"(#loc49))
+#loc107 = loc("tmp19"(#loc50))
+#loc108 = loc("tmp20"(#loc51))
+#loc109 = loc("tmp21"(#loc52))
+#loc110 = loc("tmp25"(#loc53))
+#loc111 = loc("tmp26"(#loc54))
+#loc112 = loc("tmp28"(#loc55))
+#loc113 = loc(fused[#loc73, #loc72])
+#loc114 = loc(fused[#loc75, #loc69])
+#loc115 = loc(callsite(#loc90 at #loc88))
+#loc116 = loc(callsite(#loc91 at #loc88))
+#loc117 = loc(callsite(#loc92 at #loc88))
+#loc118 = loc(callsite(#loc93 at #loc88))
+#loc119 = loc(callsite(#loc94 at #loc88))
+#loc120 = loc(callsite(#loc31 at #loc88))
+#loc121 = loc(callsite(#loc32 at #loc88))
+#loc122 = loc(callsite(#loc33 at #loc88))
+#loc123 = loc(callsite(#loc34 at #loc88))
+#loc124 = loc(callsite(#loc35 at #loc88))
+#loc125 = loc(callsite(#loc36 at #loc88))
+#loc126 = loc(callsite(#loc37 at #loc88))
diff --git a/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.ttir b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..78ff99218f52da14d1eb8bff343a9096cf0e656f
--- /dev/null
+++ b/triton/OPILR2I4CRZEKF5SC7TQV475GMCNDW573IRFPSSSP4NDN5JIX2VQ/triton_red_fused_add_mul_native_layer_norm_0.ttir
@@ -0,0 +1,215 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":18:0)
+#loc1 = loc(unknown)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":52:80)
+#loc60 = loc("in_ptr0"(#loc))
+#loc61 = loc("in_ptr1"(#loc))
+#loc62 = loc("in_ptr2"(#loc))
+#loc63 = loc("in_ptr3"(#loc))
+#loc64 = loc("in_ptr4"(#loc))
+#loc65 = loc("out_ptr0"(#loc))
+#loc66 = loc("out_ptr3"(#loc))
+#loc67 = loc("xnumel"(#loc))
+#loc68 = loc("r0_numel"(#loc))
+#loc70 = loc(callsite(#loc1 at #loc3))
+module {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
+    %xmask = arith.constant 256 : i32 loc(#loc69)
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc70)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc1)
+    %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x4096xbf16> loc(#loc1)
+    %cst_2 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc1)
+    %cst_3 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc1)
+    %cst_5 = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc71)
+    %xmask_6 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc69)
+    %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32> loc(#loc72)
+    %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32> -> tensor<1x4096xi32> loc(#loc73)
+    %r0_mask = arith.cmpi slt, %r0_base_7, %cst_5 : tensor<1x4096xi32> loc(#loc74)
+    %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc75)
+    %tmp0_8 = tt.splat %tmp0 : i32 -> tensor<1x4096xi32> loc(#loc115)
+    %tmp0_9 = arith.addi %r0_base_7, %tmp0_8 : tensor<1x4096xi32> loc(#loc76)
+    %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc77)
+    %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc77)
+    %tmp0_12 = tt.splat %xmask_6 : i1 -> tensor<1x4096xi1> loc(#loc116)
+    %tmp0_13 = arith.andi %r0_mask, %tmp0_12 : tensor<1x4096xi1> loc(#loc78)
+    %tmp0_14 = tt.load %tmp0_11, %tmp0_13, %cst_1 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>> loc(#loc79)
+    %tmp0_15 = arith.extf %tmp0_14 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc80)
+    %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc81)
+    %tmp1_16 = tt.addptr %tmp1, %r0_base_7 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc81)
+    %tmp1_17 = tt.load %tmp1_16, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc82)
+    %tmp1_18 = arith.extf %tmp1_17 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc83)
+    %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc84)
+    %tmp2_19 = tt.addptr %tmp2, %tmp0_9 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc84)
+    %tmp2_20 = tt.load %tmp2_19, %tmp0_13, %cst_1 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>> loc(#loc85)
+    %tmp2_21 = arith.extf %tmp2_20 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc86)
+    %tmp3 = arith.mulf %tmp1_18, %tmp2_21 : tensor<1x4096xf32> loc(#loc87)
+    %tmp4 = arith.addf %tmp0_15, %tmp3 : tensor<1x4096xf32> loc(#loc88)
+    %tmp7_mean = arith.select %tmp0_13, %tmp4, %cst_0 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc89)
+    %tmp7_weight = arith.select %tmp0_13, %cst_2, %cst_0 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc90)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc24)
+    %1 = tt.addptr %0, %tmp0_9 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc24)
+    %2 = arith.truncf %tmp4 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc25)
+    tt.store %1, %2, %tmp0_13 : tensor<1x4096x!tt.ptr<bf16>> loc(#loc25)
+    %3:3 = "tt.reduce"(%tmp7_mean, %cst_0, %tmp7_weight) <{axis = 1 : i32}> ({
+    ^bb0(%arg9: f32 loc(callsite(#loc1 at #loc3)), %arg10: f32 loc(callsite(#loc1 at #loc3)), %arg11: f32 loc(callsite(#loc1 at #loc3)), %arg12: f32 loc(callsite(#loc1 at #loc3)), %arg13: f32 loc(callsite(#loc1 at #loc3)), %arg14: f32 loc(callsite(#loc1 at #loc3))):
+      %delta = arith.subf %arg12, %arg9 : f32 loc(#loc117)
+      %new_weight = arith.addf %arg11, %arg14 : f32 loc(#loc118)
+      %w2_over_w = arith.cmpf oeq, %new_weight, %cst : f32 loc(#loc119)
+      %w2_over_w_31 = arith.divf %arg14, %new_weight : f32 loc(#loc120)
+      %w2_over_w_32 = arith.select %w2_over_w, %cst, %w2_over_w_31 : f32 loc(#loc121)
+      %7 = arith.mulf %delta, %w2_over_w_32 : f32 loc(#loc122)
+      %8 = arith.addf %arg9, %7 : f32 loc(#loc123)
+      %9 = arith.addf %arg10, %arg13 : f32 loc(#loc124)
+      %10 = arith.mulf %delta, %delta : f32 loc(#loc125)
+      %11 = arith.mulf %10, %arg11 : f32 loc(#loc126)
+      %12 = arith.mulf %11, %w2_over_w_32 : f32 loc(#loc127)
+      %13 = arith.addf %9, %12 : f32 loc(#loc128)
+      tt.reduce.return %8, %13, %new_weight : f32, f32, f32 loc(#loc91)
+    }) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc91)
+    %tmp7 = tt.expand_dims %3#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc97)
+    %tmp11 = tt.expand_dims %3#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc98)
+    %tmp13 = tt.load %1, %tmp0_13, %cst_1 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>> loc(#loc99)
+    %tmp13_22 = arith.extf %tmp13 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc100)
+    %tmp23 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc101)
+    %tmp23_23 = tt.addptr %tmp23, %r0_base_7 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc101)
+    %tmp23_24 = tt.load %tmp23_23, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc102)
+    %tmp23_25 = arith.extf %tmp23_24 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc103)
+    %tmp27 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc104)
+    %tmp27_26 = tt.addptr %tmp27, %r0_base_7 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc104)
+    %tmp27_27 = tt.load %tmp27_26, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc105)
+    %tmp27_28 = arith.extf %tmp27_27 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc106)
+    %tmp15 = tt.broadcast %tmp7 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc107)
+    %tmp15_29 = arith.subf %tmp13_22, %tmp15 : tensor<1x4096xf32> loc(#loc107)
+    %tmp17 = arith.divf %tmp11, %cst_4 : tensor<1x1xf32> loc(#loc108)
+    %tmp19 = arith.addf %tmp17, %cst_3 : tensor<1x1xf32> loc(#loc109)
+    %tmp20 = tt.extern_elementwise %tmp19 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc110)
+    %tmp21 = tt.broadcast %tmp20 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc111)
+    %tmp21_30 = arith.mulf %tmp15_29, %tmp21 : tensor<1x4096xf32> loc(#loc111)
+    %tmp25 = arith.addf %tmp23_25, %cst_2 : tensor<1x4096xf32> loc(#loc112)
+    %tmp26 = arith.mulf %tmp21_30, %tmp25 : tensor<1x4096xf32> loc(#loc113)
+    %tmp28 = arith.addf %tmp26, %tmp27_28 : tensor<1x4096xf32> loc(#loc114)
+    %4 = tt.splat %out_ptr3 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc57)
+    %5 = tt.addptr %4, %tmp0_9 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc57)
+    %6 = arith.truncf %tmp28 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc58)
+    tt.store %5, %6, %tmp0_13 : tensor<1x4096x!tt.ptr<bf16>> loc(#loc58)
+    tt.return loc(#loc59)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":25:21)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":23:28)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":26:27)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":34:29)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:41)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:34)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:61)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:51)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":38:113)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:34)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:41)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":39:94)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:34)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:51)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":40:113)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":41:22)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":42:22)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":48:62)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":50:66)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:29)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":51:52)
+#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc28 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc29 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc30 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc35 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc37 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc38 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":53:16)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":54:17)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:53)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":62:115)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:35)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:42)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":63:95)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:35)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:42)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":64:95)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":66:24)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":68:25)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":70:24)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":71:32)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":72:24)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":75:24)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":76:24)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":77:24)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:29)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":78:53)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/a3/ca3menlfuldthgmncfpjk452xkros7idrmil6pcoeigraymcg4e6.py":56:4)
+#loc69 = loc("xmask"(#loc2))
+#loc71 = loc("xoffset"(#loc4))
+#loc72 = loc("r0_base"(#loc5))
+#loc73 = loc("r0_base"(#loc6))
+#loc74 = loc("r0_mask"(#loc7))
+#loc75 = loc("tmp0"(#loc8))
+#loc76 = loc("tmp0"(#loc9))
+#loc77 = loc("tmp0"(#loc10))
+#loc78 = loc("tmp0"(#loc11))
+#loc79 = loc("tmp0"(#loc12))
+#loc80 = loc("tmp0"(#loc13))
+#loc81 = loc("tmp1"(#loc14))
+#loc82 = loc("tmp1"(#loc15))
+#loc83 = loc("tmp1"(#loc16))
+#loc84 = loc("tmp2"(#loc17))
+#loc85 = loc("tmp2"(#loc18))
+#loc86 = loc("tmp2"(#loc19))
+#loc87 = loc("tmp3"(#loc20))
+#loc88 = loc("tmp4"(#loc21))
+#loc89 = loc("tmp7_mean"(#loc22))
+#loc90 = loc("tmp7_weight"(#loc23))
+#loc91 = loc(callsite(#loc26 at #loc3))
+#loc92 = loc("delta"(#loc27))
+#loc93 = loc("new_weight"(#loc28))
+#loc94 = loc("w2_over_w"(#loc29))
+#loc95 = loc("w2_over_w"(#loc30))
+#loc96 = loc("w2_over_w"(#loc31))
+#loc97 = loc("tmp7"(#loc39))
+#loc98 = loc("tmp11"(#loc40))
+#loc99 = loc("tmp13"(#loc41))
+#loc100 = loc("tmp13"(#loc42))
+#loc101 = loc("tmp23"(#loc43))
+#loc102 = loc("tmp23"(#loc44))
+#loc103 = loc("tmp23"(#loc45))
+#loc104 = loc("tmp27"(#loc46))
+#loc105 = loc("tmp27"(#loc47))
+#loc106 = loc("tmp27"(#loc48))
+#loc107 = loc("tmp15"(#loc49))
+#loc108 = loc("tmp17"(#loc50))
+#loc109 = loc("tmp19"(#loc51))
+#loc110 = loc("tmp20"(#loc52))
+#loc111 = loc("tmp21"(#loc53))
+#loc112 = loc("tmp25"(#loc54))
+#loc113 = loc("tmp26"(#loc55))
+#loc114 = loc("tmp28"(#loc56))
+#loc115 = loc(fused[#loc76, #loc75])
+#loc116 = loc(fused[#loc78, #loc69])
+#loc117 = loc(callsite(#loc92 at #loc91))
+#loc118 = loc(callsite(#loc93 at #loc91))
+#loc119 = loc(callsite(#loc94 at #loc91))
+#loc120 = loc(callsite(#loc95 at #loc91))
+#loc121 = loc(callsite(#loc96 at #loc91))
+#loc122 = loc(callsite(#loc32 at #loc91))
+#loc123 = loc(callsite(#loc33 at #loc91))
+#loc124 = loc(callsite(#loc34 at #loc91))
+#loc125 = loc(callsite(#loc35 at #loc91))
+#loc126 = loc(callsite(#loc36 at #loc91))
+#loc127 = loc(callsite(#loc37 at #loc91))
+#loc128 = loc(callsite(#loc38 at #loc91))
diff --git a/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/__grp__triton_poi_fused_cat_view_4.json b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/__grp__triton_poi_fused_cat_view_4.json
new file mode 100644
index 0000000000000000000000000000000000000000..535fea929e6f108ff12a2b7229f6032435f2d312
--- /dev/null
+++ b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/__grp__triton_poi_fused_cat_view_4.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused_cat_view_4.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.source", "triton_poi_fused_cat_view_4.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.ttir", "triton_poi_fused_cat_view_4.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.ttgir", "triton_poi_fused_cat_view_4.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.llir", "triton_poi_fused_cat_view_4.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.ptx", "triton_poi_fused_cat_view_4.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.cubin", "triton_poi_fused_cat_view_4.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.json"}}
\ No newline at end of file
diff --git a/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.cubin b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..be3aa0a43b09568d5bcbcc9b3a8b0e4c7481840f
Binary files /dev/null and b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.cubin differ
diff --git a/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.json b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d554bf0116ae62fb21c0786ef83bb947682d328
--- /dev/null
+++ b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.json
@@ -0,0 +1 @@
+{"hash": "7fa39ebc053374648fb4e98226c5501eba23260cf80ea4c76849e04e26c0a273", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_cat_view_4"}
\ No newline at end of file
diff --git a/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.llir b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.llir
new file mode 100644
index 0000000000000000000000000000000000000000..09b1ccdff4ed4f4792a9c1c5857d337b2e3f1f2a
--- /dev/null
+++ b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.llir
@@ -0,0 +1,78 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused_cat_view_4(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 {
+  %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %8 = shl i32 %7, 9, !dbg !8
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %10 = shl nuw nsw i32 %9, 1, !dbg !9
+  %11 = and i32 %10, 510, !dbg !9
+  %12 = or disjoint i32 %11, %8, !dbg !10
+  %13 = sdiv i32 %12, 4096, !dbg !11
+  %14 = icmp slt i32 %12, 1048576, !dbg !12
+  %15 = shl i32 %13, 13, !dbg !13
+  %16 = add i32 %15, %12, !dbg !13
+  %17 = sext i32 %16 to i64, !dbg !14
+  %18 = getelementptr bfloat, ptr addrspace(1) %0, i64 %17, !dbg !14
+  %19 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$3 ld.global.b32 { $0 }, [ $2 + 0 ];", "=r,r,l,b"(i32 0, ptr addrspace(1) %18, i1 %14) #2, !dbg !15
+  %20 = bitcast i32 %19 to <2 x bfloat>, !dbg !15
+  %21 = extractelement <2 x bfloat> %20, i64 0, !dbg !15
+  %22 = extractelement <2 x bfloat> %20, i64 1, !dbg !15
+  %23 = icmp sgt i32 %12, 1048575, !dbg !16
+  %24 = add i32 %16, -3145728, !dbg !17
+  %25 = sext i32 %24 to i64, !dbg !18
+  %26 = getelementptr bfloat, ptr addrspace(1) %1, i64 %25, !dbg !18
+  %27 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$3 ld.global.b32 { $0 }, [ $2 + 0 ];", "=r,r,l,b"(i32 0, ptr addrspace(1) %26, i1 %23) #2, !dbg !19
+  %28 = bitcast i32 %27 to <2 x bfloat>, !dbg !19
+  %29 = extractelement <2 x bfloat> %28, i64 0, !dbg !19
+  %30 = extractelement <2 x bfloat> %28, i64 1, !dbg !19
+  %.v = select i1 %14, bfloat %21, bfloat %29, !dbg !20
+  %.v1 = select i1 %14, bfloat %22, bfloat %30, !dbg !20
+  %31 = sext i32 %12 to i64, !dbg !21
+  %32 = getelementptr bfloat, ptr addrspace(1) %2, i64 %31, !dbg !21
+  %33 = insertelement <2 x bfloat> poison, bfloat %.v, i64 0, !dbg !22
+  %34 = insertelement <2 x bfloat> %33, bfloat %.v1, i64 1, !dbg !22
+  %35 = bitcast <2 x bfloat> %34 to i32, !dbg !22
+  tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %35, ptr addrspace(1) %32) #2, !dbg !22
+  ret void, !dbg !23
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+attributes #0 = { nounwind "nvvm.reqntid"="256" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused_cat_view_4", linkageName: "triton_poi_fused_cat_view_4", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 20, column: 28, scope: !4)
+!8 = !DILocation(line: 20, column: 33, scope: !4)
+!9 = !DILocation(line: 21, column: 36, scope: !4)
+!10 = !DILocation(line: 21, column: 23, scope: !4)
+!11 = !DILocation(line: 23, column: 19, scope: !4)
+!12 = !DILocation(line: 30, column: 18, scope: !4)
+!13 = !DILocation(line: 31, column: 35, scope: !4)
+!14 = !DILocation(line: 31, column: 30, scope: !4)
+!15 = !DILocation(line: 31, column: 48, scope: !4)
+!16 = !DILocation(line: 32, column: 19, scope: !4)
+!17 = !DILocation(line: 35, column: 35, scope: !4)
+!18 = !DILocation(line: 35, column: 30, scope: !4)
+!19 = !DILocation(line: 35, column: 57, scope: !4)
+!20 = !DILocation(line: 36, column: 33, scope: !4)
+!21 = !DILocation(line: 37, column: 25, scope: !4)
+!22 = !DILocation(line: 37, column: 37, scope: !4)
+!23 = !DILocation(line: 37, column: 4, scope: !4)
diff --git a/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.ptx b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..c299bd739e634461ae81517b38d84f70a3cbc859
--- /dev/null
+++ b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.ptx
@@ -0,0 +1,333 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused_cat_view_4 // -- Begin function triton_poi_fused_cat_view_4
+                                        // @triton_poi_fused_cat_view_4
+.visible .entry triton_poi_fused_cat_view_4(
+	.param .u64 .ptr .global .align 1 triton_poi_fused_cat_view_4_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_cat_view_4_param_1,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_cat_view_4_param_2,
+	.param .u32 triton_poi_fused_cat_view_4_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_cat_view_4_param_4,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_cat_view_4_param_5
+)
+.reqntid 256
+{
+	.reg .pred 	%p<3>;
+	.reg .b16 	%rs<7>;
+	.reg .b32 	%r<18>;
+	.reg .b64 	%rd<7>;
+	.loc	1 18 0                          // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd4, [triton_poi_fused_cat_view_4_param_0];
+	ld.param.b64 	%rd5, [triton_poi_fused_cat_view_4_param_1];
+$L__tmp0:
+	.loc	1 20 28                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:20:28
+	mov.u32 	%r5, %ctaid.x;
+	.loc	1 20 33                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:20:33
+	shl.b32 	%r6, %r5, 9;
+	ld.param.b64 	%rd6, [triton_poi_fused_cat_view_4_param_2];
+	.loc	1 21 36                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:21:36
+	mov.u32 	%r7, %tid.x;
+	shl.b32 	%r8, %r7, 1;
+	and.b32 	%r9, %r8, 510;
+	.loc	1 21 23                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:21:23
+	or.b32 	%r10, %r9, %r6;
+	.loc	1 23 19                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:23:19
+	bfe.s32 	%r11, %r5, 22, 1;
+	shr.u32 	%r12, %r11, 20;
+	add.s32 	%r13, %r10, %r12;
+	.loc	1 30 18                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:30:18
+	setp.lt.s32 	%p1, %r10, 1048576;
+	.loc	1 31 35                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:31:35
+	shl.b32 	%r14, %r13, 1;
+	and.b32 	%r15, %r14, -8192;
+	add.s32 	%r16, %r15, %r10;
+	.loc	1 31 30                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:31:30
+	mad.wide.s32 	%rd1, %r16, 2, %rd4;
+	mov.b32 	%r2, 0;
+	.loc	1 31 48                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:31:48
+	// begin inline asm
+	mov.u32 %r1, %r2;
+	@%p1 ld.global.b32 { %r1 }, [ %rd1 + 0 ];
+	// end inline asm
+	mov.b32 	{%rs1, %rs2}, %r1;
+	.loc	1 32 19                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:32:19
+	setp.gt.s32 	%p2, %r10, 1048575;
+	.loc	1 35 35                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:35:35
+	add.s32 	%r17, %r16, -3145728;
+	.loc	1 35 30                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:35:30
+	mad.wide.s32 	%rd2, %r17, 2, %rd5;
+	.loc	1 35 57                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:35:57
+	// begin inline asm
+	mov.u32 %r3, %r2;
+	@%p2 ld.global.b32 { %r3 }, [ %rd2 + 0 ];
+	// end inline asm
+	mov.b32 	{%rs3, %rs4}, %r3;
+	.loc	1 36 33                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:36:33
+	selp.b16 	%rs5, %rs1, %rs3, %p1;
+	selp.b16 	%rs6, %rs2, %rs4, %p1;
+	.loc	1 37 25                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:37:25
+	mad.wide.s32 	%rd3, %r10, 2, %rd6;
+	.loc	1 37 37                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:37:37
+	mov.b32 	%r4, {%rs5, %rs6};
+	// begin inline asm
+	st.global.b32 [ %rd3 + 0 ], { %r4 };
+	// end inline asm
+	.loc	1 37 4                          // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:37:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 108
+.b8 112
+.b8 102
+.b8 52
+.b8 108
+.b8 111
+.b8 111
+.b8 104
+.b8 102
+.b8 115
+.b8 103
+.b8 119
+.b8 113
+.b8 104
+.b8 50
+.b8 103
+.b8 105
+.b8 50
+.b8 120
+.b8 111
+.b8 118
+.b8 111
+.b8 100
+.b8 112
+.b8 109
+.b8 55
+.b8 104
+.b8 122
+.b8 118
+.b8 53
+.b8 117
+.b8 50
+.b8 114
+.b8 118
+.b8 110
+.b8 103
+.b8 98
+.b8 55
+.b8 99
+.b8 104
+.b8 106
+.b8 103
+.b8 121
+.b8 119
+.b8 120
+.b8 53
+.b8 53
+.b8 103
+.b8 116
+.b8 117
+.b8 100
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 108
+.b8 112
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.source b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.source
new file mode 100644
index 0000000000000000000000000000000000000000..4042d84ea3a4509075a3ebcc3579da142486ce37
--- /dev/null
+++ b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.source
@@ -0,0 +1,136 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":18:0)
+#loc31 = loc("in_ptr0"(#loc))
+#loc32 = loc("in_ptr1"(#loc))
+#loc33 = loc("out_ptr0"(#loc))
+#loc34 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_cat_view_4(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 9437184 : i32 loc(#loc35)
+    %xoffset = tt.get_program_id x : i32 loc(#loc36)
+    %xoffset_1 = arith.constant 512 : i32 loc(#loc37)
+    %xoffset_2 = arith.constant 512 : i32 loc(#loc37)
+    %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc37)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc38)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc39)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc39)
+    %xmask = arith.constant true loc(#loc40)
+    %xmask_6 = arith.constant dense<true> : tensor<512xi1> loc(#loc40)
+    %x1 = arith.constant 4096 : i32 loc(#loc41)
+    %x1_7 = arith.constant 4096 : i32 loc(#loc41)
+    %x1_8 = arith.constant dense<4096> : tensor<512xi32> loc(#loc41)
+    %x1_9 = arith.divsi %xindex_5, %x1_8 : tensor<512xi32> loc(#loc41)
+    %x0 = arith.constant 4096 : i32 loc(#loc42)
+    %x0_10 = arith.constant 4096 : i32 loc(#loc42)
+    %x0_11 = arith.constant dense<4096> : tensor<512xi32> loc(#loc42)
+    %x0_12 = arith.remsi %xindex_5, %x0_11 : tensor<512xi32> loc(#loc42)
+    %tmp1 = arith.constant 0 : i64 loc(#loc43)
+    %tmp1_13 = arith.constant dense<0> : tensor<1xi64> loc(#loc43)
+    %tmp2 = arith.extsi %x1_9 : tensor<512xi32> to tensor<512xi64> loc(#loc44)
+    %tmp2_14 = arith.constant dense<0> : tensor<512xi64> loc(#loc44)
+    %tmp2_15 = arith.cmpi sge, %tmp2, %tmp2_14 : tensor<512xi64> loc(#loc44)
+    %tmp3 = arith.constant 256 : i64 loc(#loc45)
+    %tmp3_16 = arith.constant dense<256> : tensor<1xi64> loc(#loc45)
+    %tmp4 = arith.extsi %x1_9 : tensor<512xi32> to tensor<512xi64> loc(#loc46)
+    %tmp4_17 = arith.constant dense<256> : tensor<512xi64> loc(#loc46)
+    %tmp4_18 = arith.cmpi slt, %tmp4, %tmp4_17 : tensor<512xi64> loc(#loc46)
+    %tmp5 = arith.constant 12288 : i32 loc(#loc47)
+    %tmp5_19 = arith.constant 12288 : i32 loc(#loc47)
+    %tmp5_20 = arith.constant dense<12288> : tensor<512xi32> loc(#loc47)
+    %tmp5_21 = arith.muli %tmp5_20, %x1_9 : tensor<512xi32> loc(#loc47)
+    %tmp5_22 = arith.addi %x0_12, %tmp5_21 : tensor<512xi32> loc(#loc48)
+    %tmp5_23 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc49)
+    %tmp5_24 = tt.addptr %tmp5_23, %tmp5_22 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc49)
+    %tmp5_25 = arith.constant 0.000000e+00 : f32 loc(#loc50)
+    %tmp5_26 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc50)
+    %tmp5_27 = arith.truncf %tmp5_26 : tensor<512xf32> to tensor<512xbf16> loc(#loc50)
+    %tmp5_28 = tt.load %tmp5_24, %tmp4_18, %tmp5_27 : tensor<512x!tt.ptr<bf16>> loc(#loc50)
+    %tmp5_29 = arith.extf %tmp5_28 : tensor<512xbf16> to tensor<512xf32> loc(#loc51)
+    %tmp6 = arith.extsi %x1_9 : tensor<512xi32> to tensor<512xi64> loc(#loc52)
+    %tmp6_30 = arith.constant dense<256> : tensor<512xi64> loc(#loc52)
+    %tmp6_31 = arith.cmpi sge, %tmp6, %tmp6_30 : tensor<512xi64> loc(#loc52)
+    %tmp7 = arith.constant 2304 : i64 loc(#loc53)
+    %tmp7_32 = arith.constant dense<2304> : tensor<1xi64> loc(#loc53)
+    %tmp8 = arith.extsi %x1_9 : tensor<512xi32> to tensor<512xi64> loc(#loc54)
+    %tmp8_33 = arith.constant dense<2304> : tensor<512xi64> loc(#loc54)
+    %tmp8_34 = arith.cmpi slt, %tmp8, %tmp8_33 : tensor<512xi64> loc(#loc54)
+    %tmp9 = arith.constant -256 : i32 loc(#loc55)
+    %tmp9_35 = arith.constant -256 : i32 loc(#loc55)
+    %tmp9_36 = arith.constant dense<-256> : tensor<512xi32> loc(#loc55)
+    %tmp9_37 = arith.addi %tmp9_36, %x1_9 : tensor<512xi32> loc(#loc55)
+    %tmp9_38 = arith.constant 12288 : i32 loc(#loc56)
+    %tmp9_39 = arith.constant 12288 : i32 loc(#loc56)
+    %tmp9_40 = arith.constant dense<12288> : tensor<512xi32> loc(#loc56)
+    %tmp9_41 = arith.muli %tmp9_40, %tmp9_37 : tensor<512xi32> loc(#loc56)
+    %tmp9_42 = arith.addi %x0_12, %tmp9_41 : tensor<512xi32> loc(#loc57)
+    %tmp9_43 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc58)
+    %tmp9_44 = tt.addptr %tmp9_43, %tmp9_42 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc58)
+    %tmp9_45 = arith.constant 0.000000e+00 : f32 loc(#loc59)
+    %tmp9_46 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc59)
+    %tmp9_47 = arith.truncf %tmp9_46 : tensor<512xf32> to tensor<512xbf16> loc(#loc59)
+    %tmp9_48 = tt.load %tmp9_44, %tmp6_31, %tmp9_47 : tensor<512x!tt.ptr<bf16>> loc(#loc59)
+    %tmp9_49 = arith.extf %tmp9_48 : tensor<512xbf16> to tensor<512xf32> loc(#loc60)
+    %tmp10 = arith.select %tmp4_18, %tmp5_29, %tmp9_49 : tensor<512xi1>, tensor<512xf32> loc(#loc61)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc28)
+    %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc28)
+    %2 = arith.truncf %tmp10 : tensor<512xf32> to tensor<512xbf16> loc(#loc29)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>> loc(#loc29)
+    tt.return loc(#loc30)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":22:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":23:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":24:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":27:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":28:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":29:29)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":30:18)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:42)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:35)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:30)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:48)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:68)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":32:19)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":33:30)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":34:18)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:51)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:42)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:35)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:30)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:57)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:77)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":36:33)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:25)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:37)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:4)
+#loc35 = loc("xnumel"(#loc1))
+#loc36 = loc("xoffset"(#loc2))
+#loc37 = loc("xoffset"(#loc3))
+#loc38 = loc("xindex"(#loc4))
+#loc39 = loc("xindex"(#loc5))
+#loc40 = loc("xmask"(#loc6))
+#loc41 = loc("x1"(#loc7))
+#loc42 = loc("x0"(#loc8))
+#loc43 = loc("tmp1"(#loc9))
+#loc44 = loc("tmp2"(#loc10))
+#loc45 = loc("tmp3"(#loc11))
+#loc46 = loc("tmp4"(#loc12))
+#loc47 = loc("tmp5"(#loc13))
+#loc48 = loc("tmp5"(#loc14))
+#loc49 = loc("tmp5"(#loc15))
+#loc50 = loc("tmp5"(#loc16))
+#loc51 = loc("tmp5"(#loc17))
+#loc52 = loc("tmp6"(#loc18))
+#loc53 = loc("tmp7"(#loc19))
+#loc54 = loc("tmp8"(#loc20))
+#loc55 = loc("tmp9"(#loc21))
+#loc56 = loc("tmp9"(#loc22))
+#loc57 = loc("tmp9"(#loc23))
+#loc58 = loc("tmp9"(#loc24))
+#loc59 = loc("tmp9"(#loc25))
+#loc60 = loc("tmp9"(#loc26))
+#loc61 = loc("tmp10"(#loc27))
diff --git a/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.ttgir b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..a81c383be57b5e9541991b93ba25615a10898d49
--- /dev/null
+++ b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.ttgir
@@ -0,0 +1,89 @@
+#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":18:0)
+#loc25 = loc("in_ptr0"(#loc))
+#loc26 = loc("in_ptr1"(#loc))
+#loc27 = loc("out_ptr0"(#loc))
+#loc28 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused_cat_view_4(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4096> : tensor<512xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<256> : tensor<512xi64, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<12288> : tensor<512xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<-256> : tensor<512xi32, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<512xbf16, #blocked> loc(#loc1)
+    %c512_i32 = arith.constant 512 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc29)
+    %xoffset_4 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc30)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc31)
+    %xindex_5 = tt.splat %xoffset_4 : i32 -> tensor<512xi32, #blocked> loc(#loc32)
+    %xindex_6 = arith.addi %xindex_5, %xindex : tensor<512xi32, #blocked> loc(#loc32)
+    %x1 = arith.divsi %xindex_6, %cst : tensor<512xi32, #blocked> loc(#loc33)
+    %x0 = arith.remsi %xindex_6, %cst : tensor<512xi32, #blocked> loc(#loc34)
+    %tmp4 = arith.extsi %x1 : tensor<512xi32, #blocked> to tensor<512xi64, #blocked> loc(#loc35)
+    %tmp4_7 = arith.cmpi slt, %tmp4, %cst_0 : tensor<512xi64, #blocked> loc(#loc35)
+    %tmp5 = arith.muli %x1, %cst_1 : tensor<512xi32, #blocked> loc(#loc36)
+    %tmp5_8 = arith.addi %x0, %tmp5 : tensor<512xi32, #blocked> loc(#loc37)
+    %tmp5_9 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc38)
+    %tmp5_10 = tt.addptr %tmp5_9, %tmp5_8 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc38)
+    %tmp5_11 = tt.load %tmp5_10, %tmp4_7, %cst_3 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc39)
+    %tmp5_12 = arith.extf %tmp5_11 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc40)
+    %tmp6 = arith.cmpi sge, %tmp4, %cst_0 : tensor<512xi64, #blocked> loc(#loc41)
+    %tmp9 = arith.addi %x1, %cst_2 : tensor<512xi32, #blocked> loc(#loc42)
+    %tmp9_13 = arith.muli %tmp9, %cst_1 : tensor<512xi32, #blocked> loc(#loc43)
+    %tmp9_14 = arith.addi %x0, %tmp9_13 : tensor<512xi32, #blocked> loc(#loc44)
+    %tmp9_15 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc45)
+    %tmp9_16 = tt.addptr %tmp9_15, %tmp9_14 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc45)
+    %tmp9_17 = tt.load %tmp9_16, %tmp6, %cst_3 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc46)
+    %tmp9_18 = arith.extf %tmp9_17 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc47)
+    %tmp10 = arith.select %tmp4_7, %tmp5_12, %tmp9_18 : tensor<512xi1, #blocked>, tensor<512xf32, #blocked> loc(#loc48)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc22)
+    %1 = tt.addptr %0, %xindex_6 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc22)
+    %2 = arith.truncf %tmp10 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> loc(#loc23)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc23)
+    tt.return loc(#loc24)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":23:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":24:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":30:18)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:42)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:35)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:30)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:48)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:68)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":32:19)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:51)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:42)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:35)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:30)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:57)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:77)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":36:33)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:25)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:37)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:4)
+#loc29 = loc("xoffset"(#loc2))
+#loc30 = loc("xoffset"(#loc3))
+#loc31 = loc("xindex"(#loc4))
+#loc32 = loc("xindex"(#loc5))
+#loc33 = loc("x1"(#loc6))
+#loc34 = loc("x0"(#loc7))
+#loc35 = loc("tmp4"(#loc8))
+#loc36 = loc("tmp5"(#loc9))
+#loc37 = loc("tmp5"(#loc10))
+#loc38 = loc("tmp5"(#loc11))
+#loc39 = loc("tmp5"(#loc12))
+#loc40 = loc("tmp5"(#loc13))
+#loc41 = loc("tmp6"(#loc14))
+#loc42 = loc("tmp9"(#loc15))
+#loc43 = loc("tmp9"(#loc16))
+#loc44 = loc("tmp9"(#loc17))
+#loc45 = loc("tmp9"(#loc18))
+#loc46 = loc("tmp9"(#loc19))
+#loc47 = loc("tmp9"(#loc20))
+#loc48 = loc("tmp10"(#loc21))
diff --git a/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.ttir b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..629683a28b165c750083b69450949845a7625d43
--- /dev/null
+++ b/triton/P6RZ5PAFGN2GJD5U5GBCNRKQD25CGJQM7AHKJR3IJHQE4JWAUJZQ/triton_poi_fused_cat_view_4.ttir
@@ -0,0 +1,88 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":18:0)
+#loc25 = loc("in_ptr0"(#loc))
+#loc26 = loc("in_ptr1"(#loc))
+#loc27 = loc("out_ptr0"(#loc))
+#loc28 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_cat_view_4(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<512xbf16> loc(#loc1)
+    %tmp9 = arith.constant dense<-256> : tensor<512xi32> loc(#loc29)
+    %cst_0 = arith.constant dense<12288> : tensor<512xi32> loc(#loc1)
+    %cst_1 = arith.constant dense<256> : tensor<512xi64> loc(#loc1)
+    %cst_2 = arith.constant dense<4096> : tensor<512xi32> loc(#loc1)
+    %c512_i32 = arith.constant 512 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc30)
+    %xoffset_3 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc31)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc32)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc33)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc33)
+    %x1 = arith.divsi %xindex_5, %cst_2 : tensor<512xi32> loc(#loc34)
+    %x0 = arith.remsi %xindex_5, %cst_2 : tensor<512xi32> loc(#loc35)
+    %tmp4 = arith.extsi %x1 : tensor<512xi32> to tensor<512xi64> loc(#loc36)
+    %tmp4_6 = arith.cmpi slt, %tmp4, %cst_1 : tensor<512xi64> loc(#loc36)
+    %tmp5 = arith.muli %x1, %cst_0 : tensor<512xi32> loc(#loc37)
+    %tmp5_7 = arith.addi %x0, %tmp5 : tensor<512xi32> loc(#loc38)
+    %tmp5_8 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc39)
+    %tmp5_9 = tt.addptr %tmp5_8, %tmp5_7 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc39)
+    %tmp5_10 = tt.load %tmp5_9, %tmp4_6, %cst : tensor<512x!tt.ptr<bf16>> loc(#loc40)
+    %tmp5_11 = arith.extf %tmp5_10 : tensor<512xbf16> to tensor<512xf32> loc(#loc41)
+    %tmp6 = arith.cmpi sge, %tmp4, %cst_1 : tensor<512xi64> loc(#loc42)
+    %tmp9_12 = arith.addi %x1, %tmp9 : tensor<512xi32> loc(#loc29)
+    %tmp9_13 = arith.muli %tmp9_12, %cst_0 : tensor<512xi32> loc(#loc43)
+    %tmp9_14 = arith.addi %x0, %tmp9_13 : tensor<512xi32> loc(#loc44)
+    %tmp9_15 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc45)
+    %tmp9_16 = tt.addptr %tmp9_15, %tmp9_14 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc45)
+    %tmp9_17 = tt.load %tmp9_16, %tmp6, %cst : tensor<512x!tt.ptr<bf16>> loc(#loc46)
+    %tmp9_18 = arith.extf %tmp9_17 : tensor<512xbf16> to tensor<512xf32> loc(#loc47)
+    %tmp10 = arith.select %tmp4_6, %tmp5_11, %tmp9_18 : tensor<512xi1>, tensor<512xf32> loc(#loc48)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc22)
+    %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc22)
+    %2 = arith.truncf %tmp10 : tensor<512xf32> to tensor<512xbf16> loc(#loc23)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>> loc(#loc23)
+    tt.return loc(#loc24)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:51)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":20:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":20:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":21:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":21:23)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":23:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":24:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":30:18)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:42)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:30)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:48)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:68)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":32:19)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:42)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:35)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:30)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:57)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:77)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":36:33)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:25)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:37)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:4)
+#loc29 = loc("tmp9"(#loc2))
+#loc30 = loc("xoffset"(#loc3))
+#loc31 = loc("xoffset"(#loc4))
+#loc32 = loc("xindex"(#loc5))
+#loc33 = loc("xindex"(#loc6))
+#loc34 = loc("x1"(#loc7))
+#loc35 = loc("x0"(#loc8))
+#loc36 = loc("tmp4"(#loc9))
+#loc37 = loc("tmp5"(#loc10))
+#loc38 = loc("tmp5"(#loc11))
+#loc39 = loc("tmp5"(#loc12))
+#loc40 = loc("tmp5"(#loc13))
+#loc41 = loc("tmp5"(#loc14))
+#loc42 = loc("tmp6"(#loc15))
+#loc43 = loc("tmp9"(#loc16))
+#loc44 = loc("tmp9"(#loc17))
+#loc45 = loc("tmp9"(#loc18))
+#loc46 = loc("tmp9"(#loc19))
+#loc47 = loc("tmp9"(#loc20))
+#loc48 = loc("tmp10"(#loc21))
diff --git a/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/__grp__triton_poi_fused_cat_mul_silu_split_view_0.json b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/__grp__triton_poi_fused_cat_mul_silu_split_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..ed8c1a2d44f72bcc12c94509c45a6836d2ff86bf
--- /dev/null
+++ b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/__grp__triton_poi_fused_cat_mul_silu_split_view_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused_cat_mul_silu_split_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.source", "triton_poi_fused_cat_mul_silu_split_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.ttir", "triton_poi_fused_cat_mul_silu_split_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.ttgir", "triton_poi_fused_cat_mul_silu_split_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.llir", "triton_poi_fused_cat_mul_silu_split_view_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.ptx", "triton_poi_fused_cat_mul_silu_split_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.cubin", "triton_poi_fused_cat_mul_silu_split_view_0.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.json"}}
\ No newline at end of file
diff --git a/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.cubin b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..ddde3eee4ae81fa5d8530a0ba2158aacbc3d80b0
Binary files /dev/null and b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.cubin differ
diff --git a/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.json b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d98d1988c3b378918cdcdff2bfd2e33f1b2c1d9
--- /dev/null
+++ b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.json
@@ -0,0 +1 @@
+{"hash": "7bdbc95616d50aaabed40fce2720f0625cdd901e7dfe546d0f608c07842e9b59", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_cat_mul_silu_split_view_0"}
\ No newline at end of file
diff --git a/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.llir b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..8432e6158276ce4227f285f7a6b1da71b752fff3
--- /dev/null
+++ b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.llir
@@ -0,0 +1,130 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused_cat_mul_silu_split_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 {
+  %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %8 = shl i32 %7, 9, !dbg !8
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %10 = shl nuw nsw i32 %9, 1, !dbg !9
+  %11 = and i32 %10, 510, !dbg !9
+  %12 = or disjoint i32 %11, %8, !dbg !10
+  %13 = sdiv i32 %12, 16384, !dbg !11
+  %14 = mul i32 %13, 16384, !dbg !12
+  %.decomposed = sub i32 %12, %14, !dbg !12
+  %15 = icmp slt i32 %.decomposed, 4096, !dbg !13
+  %16 = shl nsw i32 %13, 12, !dbg !14
+  %17 = add nsw i32 %16, %.decomposed, !dbg !15
+  %18 = sext i32 %17 to i64, !dbg !16
+  %19 = getelementptr bfloat, ptr addrspace(1) %0, i64 %18, !dbg !16
+  %20 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !17
+  %21 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %19, i64 %20, i1 %15) #3, !dbg !17
+  %22 = bitcast i32 %21 to <2 x bfloat>, !dbg !17
+  %23 = icmp sgt i32 %.decomposed, 4095, !dbg !18
+  %24 = mul i32 %13, 36864, !dbg !19
+  %25 = add nsw i32 %.decomposed, -4096, !dbg !20
+  %26 = add i32 %24, %25, !dbg !21
+  %27 = sext i32 %26 to i64, !dbg !22
+  %28 = getelementptr bfloat, ptr addrspace(1) %1, i64 %27, !dbg !22
+  %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !23
+  %30 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %28, i64 %29, i1 %23) #3, !dbg !23
+  %31 = bitcast i32 %30 to <2 x bfloat>, !dbg !23
+  %32 = add i32 %26, 12288, !dbg !24
+  %33 = sext i32 %32 to i64, !dbg !25
+  %34 = getelementptr bfloat, ptr addrspace(1) %1, i64 %33, !dbg !25
+  %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !26
+  %36 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %34, i64 %35, i1 %23) #3, !dbg !26
+  %37 = bitcast i32 %36 to <2 x bfloat>, !dbg !26
+  %38 = sext i32 %12 to i64, !dbg !27
+  %39 = getelementptr bfloat, ptr addrspace(1) %2, i64 %38, !dbg !27
+  %40 = fpext <2 x bfloat> %31 to <2 x float>, !dbg !28
+  %41 = extractelement <2 x float> %40, i64 0, !dbg !29
+  %42 = fsub float 0.000000e+00, %41, !dbg !29
+  %43 = extractelement <2 x float> %40, i64 1, !dbg !29
+  %44 = fsub float 0.000000e+00, %43, !dbg !29
+  %45 = fmul float %42, 0x3FF7154760000000, !dbg !34
+  %46 = tail call float @llvm.nvvm.ex2.approx.f(float %45), !dbg !34
+  %47 = fmul float %44, 0x3FF7154760000000, !dbg !34
+  %48 = tail call float @llvm.nvvm.ex2.approx.f(float %47), !dbg !34
+  %49 = fadd float %46, 1.000000e+00, !dbg !35
+  %50 = fadd float %48, 1.000000e+00, !dbg !35
+  %51 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %49), !dbg !36
+  %52 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %50), !dbg !36
+  %53 = insertelement <2 x float> poison, float %51, i64 0, !dbg !37
+  %54 = insertelement <2 x float> %53, float %52, i64 1, !dbg !37
+  %55 = fmul <2 x float> %54, %40, !dbg !37
+  %56 = fpext <2 x bfloat> %37 to <2 x float>, !dbg !38
+  %57 = fmul <2 x float> %55, %56, !dbg !39
+  %58 = fptrunc <2 x float> %57 to <2 x bfloat>, !dbg !40
+  %59 = insertelement <2 x i1> poison, i1 %15, i64 0, !dbg !41
+  %60 = shufflevector <2 x i1> %59, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !41
+  %61 = select <2 x i1> %60, <2 x bfloat> %22, <2 x bfloat> %58, !dbg !41
+  %62 = bitcast <2 x bfloat> %61 to i32, !dbg !40
+  tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %62, ptr addrspace(1) %39) #3, !dbg !40
+  ret void, !dbg !42
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.ex2.approx.f(float) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #2
+
+attributes #0 = { nounwind "nvvm.reqntid"="256" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #3 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused_cat_mul_silu_split_view_0", linkageName: "triton_poi_fused_cat_mul_silu_split_view_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 20, column: 28, scope: !4)
+!8 = !DILocation(line: 20, column: 33, scope: !4)
+!9 = !DILocation(line: 21, column: 36, scope: !4)
+!10 = !DILocation(line: 21, column: 23, scope: !4)
+!11 = !DILocation(line: 24, column: 19, scope: !4)
+!12 = !DILocation(line: 23, column: 19, scope: !4)
+!13 = !DILocation(line: 30, column: 18, scope: !4)
+!14 = !DILocation(line: 31, column: 35, scope: !4)
+!15 = !DILocation(line: 31, column: 41, scope: !4)
+!16 = !DILocation(line: 31, column: 30, scope: !4)
+!17 = !DILocation(line: 31, column: 47, scope: !4)
+!18 = !DILocation(line: 32, column: 19, scope: !4)
+!19 = !DILocation(line: 35, column: 36, scope: !4)
+!20 = !DILocation(line: 35, column: 52, scope: !4)
+!21 = !DILocation(line: 35, column: 42, scope: !4)
+!22 = !DILocation(line: 35, column: 30, scope: !4)
+!23 = !DILocation(line: 35, column: 58, scope: !4)
+!24 = !DILocation(line: 40, column: 51, scope: !4)
+!25 = !DILocation(line: 40, column: 31, scope: !4)
+!26 = !DILocation(line: 40, column: 67, scope: !4)
+!27 = !DILocation(line: 45, column: 25, scope: !4)
+!28 = !DILocation(line: 35, column: 108, scope: !4)
+!29 = !DILocation(line: 50, column: 30, scope: !30, inlinedAt: !32)
+!30 = distinct !DILexicalBlockFile(scope: !4, file: !31, discriminator: 0)
+!31 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!32 = !DILocation(line: 37, column: 23, scope: !33)
+!33 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0)
+!34 = !DILocation(line: 50, column: 29, scope: !30, inlinedAt: !32)
+!35 = !DILocation(line: 50, column: 20, scope: !30, inlinedAt: !32)
+!36 = !DILocation(line: 50, column: 16, scope: !30, inlinedAt: !32)
+!37 = !DILocation(line: 38, column: 20, scope: !4)
+!38 = !DILocation(line: 40, column: 117, scope: !4)
+!39 = !DILocation(line: 41, column: 20, scope: !4)
+!40 = !DILocation(line: 45, column: 37, scope: !4)
+!41 = !DILocation(line: 44, column: 33, scope: !4)
+!42 = !DILocation(line: 45, column: 4, scope: !4)
diff --git a/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.ptx b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..3af8173d951718ac5a86e10ab68c1a35ae420b42
--- /dev/null
+++ b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.ptx
@@ -0,0 +1,490 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused_cat_mul_silu_split_view_0 // -- Begin function triton_poi_fused_cat_mul_silu_split_view_0
+                                        // @triton_poi_fused_cat_mul_silu_split_view_0
+.visible .entry triton_poi_fused_cat_mul_silu_split_view_0(
+	.param .u64 .ptr .global .align 1 triton_poi_fused_cat_mul_silu_split_view_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_cat_mul_silu_split_view_0_param_1,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_cat_mul_silu_split_view_0_param_2,
+	.param .u32 triton_poi_fused_cat_mul_silu_split_view_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_cat_mul_silu_split_view_0_param_4,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_cat_mul_silu_split_view_0_param_5
+)
+.reqntid 256
+{
+	.reg .pred 	%p<3>;
+	.reg .b16 	%rs<11>;
+	.reg .b32 	%r<43>;
+	.reg .b64 	%rd<11>;
+	.loc	1 18 0                          // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd8, [triton_poi_fused_cat_mul_silu_split_view_0_param_0];
+	ld.param.b64 	%rd9, [triton_poi_fused_cat_mul_silu_split_view_0_param_1];
+$L__tmp0:
+	.loc	1 20 28                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:20:28
+	mov.u32 	%r6, %ctaid.x;
+	.loc	1 20 33                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:20:33
+	shl.b32 	%r7, %r6, 9;
+	ld.param.b64 	%rd10, [triton_poi_fused_cat_mul_silu_split_view_0_param_2];
+	.loc	1 21 36                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:21:36
+	mov.u32 	%r8, %tid.x;
+	shl.b32 	%r9, %r8, 1;
+	and.b32 	%r10, %r9, 510;
+	.loc	1 21 23                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:21:23
+	or.b32 	%r11, %r10, %r7;
+	.loc	1 24 19                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:24:19
+	bfe.s32 	%r12, %r6, 22, 1;
+	shr.u32 	%r13, %r12, 18;
+	add.s32 	%r14, %r11, %r13;
+	shr.s32 	%r15, %r14, 14;
+	.loc	1 23 19                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:23:19
+	and.b32 	%r16, %r14, -16384;
+	sub.s32 	%r17, %r11, %r16;
+	.loc	1 30 18                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:30:18
+	setp.lt.s32 	%p1, %r17, 4096;
+	.loc	1 31 35                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:31:35
+	shl.b32 	%r18, %r15, 12;
+	.loc	1 31 41                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:31:41
+	add.s32 	%r19, %r18, %r17;
+	.loc	1 31 30                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:31:30
+	mad.wide.s32 	%rd1, %r19, 2, %rd8;
+	.loc	1 31 47                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:31:47
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r2, 0;
+	// begin inline asm
+	mov.u32 %r1, %r2;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.b32 { %r1 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	.loc	1 32 19                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:32:19
+	setp.gt.s32 	%p2, %r17, 4095;
+	.loc	1 35 52                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:52
+	mad.lo.s32 	%r20, %r15, 36864, %r17;
+	.loc	1 35 42                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:42
+	add.s32 	%r21, %r20, -4096;
+	.loc	1 35 30                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:30
+	mad.wide.s32 	%rd3, %r21, 2, %rd9;
+	.loc	1 35 58                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:58
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r3, %r2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r3 }, [ %rd3 + 0 ], %rd4;
+	// end inline asm
+	.loc	1 40 51                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:40:51
+	add.s32 	%r22, %r20, 8192;
+	.loc	1 40 31                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:40:31
+	mad.wide.s32 	%rd5, %r22, 2, %rd9;
+	.loc	1 40 67                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:40:67
+	// begin inline asm
+	mov.u64 %rd6, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r4, %r2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r4 }, [ %rd5 + 0 ], %rd6;
+	// end inline asm
+	.loc	1 45 25                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:45:25
+	mad.wide.s32 	%rd7, %r11, 2, %rd10;
+	.loc	1 35 108                        // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:35:108
+	mov.b32 	{%rs1, %rs2}, %r3;
+	cvt.f32.bf16 	%r23, %rs2;
+	cvt.f32.bf16 	%r24, %rs1;
+	mov.b32 	%r25, 0f00000000;
+$L__tmp1:
+	.loc	2 50 30                         // standard.py:50:30 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ]
+	sub.f32 	%r26, %r25, %r24;
+	sub.f32 	%r27, %r25, %r23;
+	.loc	2 50 29                         // standard.py:50:29 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ]
+	mul.f32 	%r28, %r26, 0f3FB8AA3B;
+	ex2.approx.f32 	%r29, %r28;
+	mul.f32 	%r30, %r27, 0f3FB8AA3B;
+	ex2.approx.f32 	%r31, %r30;
+	.loc	2 50 20                         // standard.py:50:20 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ]
+	add.f32 	%r32, %r29, 0f3F800000;
+	add.f32 	%r33, %r31, 0f3F800000;
+	mov.b32 	%r34, 0f3F800000;
+	.loc	2 50 16                         // standard.py:50:16 @[ c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:37:23 ]
+	div.full.f32 	%r35, %r34, %r32;
+	div.full.f32 	%r36, %r34, %r33;
+$L__tmp2:
+	.loc	1 38 20                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:38:20
+	mul.f32 	%r37, %r35, %r24;
+	mul.f32 	%r38, %r36, %r23;
+	.loc	1 40 117                        // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:40:117
+	mov.b32 	{%rs3, %rs4}, %r4;
+	cvt.f32.bf16 	%r39, %rs3;
+	cvt.f32.bf16 	%r40, %rs4;
+	.loc	1 41 20                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:41:20
+	mul.f32 	%r41, %r38, %r40;
+	mul.f32 	%r42, %r37, %r39;
+	.loc	1 45 37                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:45:37
+	cvt.rn.bf16.f32 	%rs5, %r42;
+	cvt.rn.bf16.f32 	%rs6, %r41;
+	.loc	1 44 33                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:44:33
+	mov.b32 	{%rs7, %rs8}, %r1;
+	selp.b16 	%rs9, %rs8, %rs6, %p1;
+	selp.b16 	%rs10, %rs7, %rs5, %p1;
+	mov.b32 	%r5, {%rs10, %rs9};
+	.loc	1 45 37                         // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:45:37
+	// begin inline asm
+	st.global.b32 [ %rd7 + 0 ], { %r5 };
+	// end inline asm
+	.loc	1 45 4                          // c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py:45:4
+	ret;
+$L__tmp3:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 316                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x135 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 51
+.b8 105
+.b8 109
+.b8 121
+.b8 102
+.b8 105
+.b8 98
+.b8 99
+.b8 113
+.b8 51
+.b8 122
+.b8 119
+.b8 114
+.b8 99
+.b8 53
+.b8 103
+.b8 118
+.b8 102
+.b8 115
+.b8 99
+.b8 118
+.b8 112
+.b8 115
+.b8 97
+.b8 120
+.b8 100
+.b8 122
+.b8 106
+.b8 105
+.b8 106
+.b8 121
+.b8 109
+.b8 114
+.b8 110
+.b8 116
+.b8 50
+.b8 108
+.b8 102
+.b8 97
+.b8 104
+.b8 116
+.b8 114
+.b8 106
+.b8 109
+.b8 114
+.b8 98
+.b8 116
+.b8 108
+.b8 109
+.b8 104
+.b8 101
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 51
+.b8 105
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x2d DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 112
+.b8 111
+.b8 105
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 116
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 115
+.b8 105
+.b8 108
+.b8 117
+.b8 95
+.b8 115
+.b8 112
+.b8 108
+.b8 105
+.b8 116
+.b8 95
+.b8 118
+.b8 105
+.b8 101
+.b8 119
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x111:0x2e DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x126:0x18 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp2                           // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 23                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.source b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..5630dd3cd3cb1e3f423a83a7709bef62f41aca7c
--- /dev/null
+++ b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.source
@@ -0,0 +1,212 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":18:0)
+#loc43 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":49:0)
+#loc50 = loc("in_ptr0"(#loc))
+#loc51 = loc("in_ptr1"(#loc))
+#loc52 = loc("out_ptr0"(#loc))
+#loc53 = loc("xnumel"(#loc))
+#loc93 = loc("x"(#loc43))
+module {
+  tt.func public @triton_poi_fused_cat_mul_silu_split_view_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 37748736 : i32 loc(#loc54)
+    %xoffset = tt.get_program_id x : i32 loc(#loc55)
+    %xoffset_1 = arith.constant 512 : i32 loc(#loc56)
+    %xoffset_2 = arith.constant 512 : i32 loc(#loc56)
+    %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc56)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc57)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc58)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc58)
+    %xmask = arith.constant true loc(#loc59)
+    %xmask_6 = arith.constant dense<true> : tensor<512xi1> loc(#loc59)
+    %x0 = arith.constant 16384 : i32 loc(#loc60)
+    %x0_7 = arith.constant 16384 : i32 loc(#loc60)
+    %x0_8 = arith.constant dense<16384> : tensor<512xi32> loc(#loc60)
+    %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<512xi32> loc(#loc60)
+    %x1 = arith.constant 16384 : i32 loc(#loc61)
+    %x1_10 = arith.constant 16384 : i32 loc(#loc61)
+    %x1_11 = arith.constant dense<16384> : tensor<512xi32> loc(#loc61)
+    %x1_12 = arith.divsi %xindex_5, %x1_11 : tensor<512xi32> loc(#loc61)
+    %tmp1 = arith.constant 0 : i64 loc(#loc62)
+    %tmp1_13 = arith.constant dense<0> : tensor<1xi64> loc(#loc62)
+    %tmp2 = arith.extsi %x0_9 : tensor<512xi32> to tensor<512xi64> loc(#loc63)
+    %tmp2_14 = arith.constant dense<0> : tensor<512xi64> loc(#loc63)
+    %tmp2_15 = arith.cmpi sge, %tmp2, %tmp2_14 : tensor<512xi64> loc(#loc63)
+    %tmp3 = arith.constant 4096 : i64 loc(#loc64)
+    %tmp3_16 = arith.constant dense<4096> : tensor<1xi64> loc(#loc64)
+    %tmp4 = arith.extsi %x0_9 : tensor<512xi32> to tensor<512xi64> loc(#loc65)
+    %tmp4_17 = arith.constant dense<4096> : tensor<512xi64> loc(#loc65)
+    %tmp4_18 = arith.cmpi slt, %tmp4, %tmp4_17 : tensor<512xi64> loc(#loc65)
+    %tmp5 = arith.constant 4096 : i32 loc(#loc66)
+    %tmp5_19 = arith.constant 4096 : i32 loc(#loc66)
+    %tmp5_20 = arith.constant dense<4096> : tensor<512xi32> loc(#loc66)
+    %tmp5_21 = arith.muli %tmp5_20, %x1_12 : tensor<512xi32> loc(#loc66)
+    %tmp5_22 = arith.addi %tmp5_21, %x0_9 : tensor<512xi32> loc(#loc67)
+    %tmp5_23 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc68)
+    %tmp5_24 = tt.addptr %tmp5_23, %tmp5_22 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc68)
+    %tmp5_25 = arith.constant 0.000000e+00 : f32 loc(#loc69)
+    %tmp5_26 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc69)
+    %tmp5_27 = arith.truncf %tmp5_26 : tensor<512xf32> to tensor<512xbf16> loc(#loc69)
+    %tmp5_28 = tt.load %tmp5_24, %tmp4_18, %tmp5_27 evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>> loc(#loc69)
+    %tmp5_29 = arith.extf %tmp5_28 : tensor<512xbf16> to tensor<512xf32> loc(#loc70)
+    %tmp6 = arith.extsi %x0_9 : tensor<512xi32> to tensor<512xi64> loc(#loc71)
+    %tmp6_30 = arith.constant dense<4096> : tensor<512xi64> loc(#loc71)
+    %tmp6_31 = arith.cmpi sge, %tmp6, %tmp6_30 : tensor<512xi64> loc(#loc71)
+    %tmp7 = arith.constant 16384 : i64 loc(#loc72)
+    %tmp7_32 = arith.constant dense<16384> : tensor<1xi64> loc(#loc72)
+    %tmp8 = arith.extsi %x0_9 : tensor<512xi32> to tensor<512xi64> loc(#loc73)
+    %tmp8_33 = arith.constant dense<16384> : tensor<512xi64> loc(#loc73)
+    %tmp8_34 = arith.cmpi slt, %tmp8, %tmp8_33 : tensor<512xi64> loc(#loc73)
+    %tmp9 = arith.constant 36864 : i32 loc(#loc74)
+    %tmp9_35 = arith.constant 36864 : i32 loc(#loc74)
+    %tmp9_36 = arith.constant dense<36864> : tensor<512xi32> loc(#loc74)
+    %tmp9_37 = arith.muli %tmp9_36, %x1_12 : tensor<512xi32> loc(#loc74)
+    %tmp9_38 = arith.constant -4096 : i32 loc(#loc75)
+    %tmp9_39 = arith.constant -4096 : i32 loc(#loc75)
+    %tmp9_40 = arith.constant dense<-4096> : tensor<512xi32> loc(#loc75)
+    %tmp9_41 = arith.addi %tmp9_40, %x0_9 : tensor<512xi32> loc(#loc75)
+    %tmp9_42 = arith.addi %tmp9_37, %tmp9_41 : tensor<512xi32> loc(#loc76)
+    %tmp9_43 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc77)
+    %tmp9_44 = tt.addptr %tmp9_43, %tmp9_42 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc77)
+    %tmp9_45 = arith.constant 0.000000e+00 : f32 loc(#loc78)
+    %tmp9_46 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc78)
+    %tmp9_47 = arith.truncf %tmp9_46 : tensor<512xf32> to tensor<512xbf16> loc(#loc78)
+    %tmp9_48 = tt.load %tmp9_44, %tmp6_31, %tmp9_47 evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>> loc(#loc78)
+    %tmp9_49 = arith.extf %tmp9_48 : tensor<512xbf16> to tensor<512xf32> loc(#loc79)
+    %tmp11 = tt.call @triton.language.standard.sigmoid__fp32S512S__(%tmp9_49) : (tensor<512xf32>) -> tensor<512xf32> loc(#loc80)
+    %tmp12 = arith.mulf %tmp9_49, %tmp11 : tensor<512xf32> loc(#loc81)
+    %tmp14 = arith.constant 36864 : i32 loc(#loc82)
+    %tmp14_50 = arith.constant 36864 : i32 loc(#loc82)
+    %tmp14_51 = arith.constant dense<36864> : tensor<512xi32> loc(#loc82)
+    %tmp14_52 = arith.muli %tmp14_51, %x1_12 : tensor<512xi32> loc(#loc82)
+    %tmp14_53 = arith.constant 12288 : i32 loc(#loc83)
+    %tmp14_54 = arith.constant 12288 : i32 loc(#loc83)
+    %tmp14_55 = arith.constant dense<12288> : tensor<512xi32> loc(#loc83)
+    %tmp14_56 = arith.addi %tmp14_55, %tmp14_52 : tensor<512xi32> loc(#loc83)
+    %tmp14_57 = arith.constant -4096 : i32 loc(#loc84)
+    %tmp14_58 = arith.constant -4096 : i32 loc(#loc84)
+    %tmp14_59 = arith.constant dense<-4096> : tensor<512xi32> loc(#loc84)
+    %tmp14_60 = arith.addi %tmp14_59, %x0_9 : tensor<512xi32> loc(#loc84)
+    %tmp14_61 = arith.addi %tmp14_56, %tmp14_60 : tensor<512xi32> loc(#loc85)
+    %tmp14_62 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc86)
+    %tmp14_63 = tt.addptr %tmp14_62, %tmp14_61 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc86)
+    %tmp14_64 = arith.constant 0.000000e+00 : f32 loc(#loc87)
+    %tmp14_65 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc87)
+    %tmp14_66 = arith.truncf %tmp14_65 : tensor<512xf32> to tensor<512xbf16> loc(#loc87)
+    %tmp14_67 = tt.load %tmp14_63, %tmp6_31, %tmp14_66 evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>> loc(#loc87)
+    %tmp14_68 = arith.extf %tmp14_67 : tensor<512xbf16> to tensor<512xf32> loc(#loc88)
+    %tmp15 = arith.mulf %tmp12, %tmp14_68 : tensor<512xf32> loc(#loc89)
+    %tmp16 = arith.constant 0.000000e+00 : f32 loc(#loc90)
+    %tmp16_69 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc90)
+    %tmp17 = arith.select %tmp6_31, %tmp15, %tmp16_69 : tensor<512xi1>, tensor<512xf32> loc(#loc91)
+    %tmp18 = arith.select %tmp4_18, %tmp5_29, %tmp17 : tensor<512xi1>, tensor<512xf32> loc(#loc92)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc40)
+    %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc40)
+    %2 = arith.truncf %tmp18 : tensor<512xf32> to tensor<512xbf16> loc(#loc41)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>> loc(#loc41)
+    tt.return loc(#loc42)
+  } loc(#loc)
+  tt.func private @triton.language.standard.sigmoid__fp32S512S__(%x: tensor<512xf32> loc("x"(#loc43))) -> tensor<512xf32> attributes {noinline = false} {
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc44)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc44)
+    %0 = arith.subf %cst_0, %x : tensor<512xf32> loc(#loc44)
+    %1 = math.exp %0 : tensor<512xf32> loc(#loc45)
+    %c1_i32 = arith.constant 1 : i32 loc(#loc46)
+    %cst_1 = arith.constant 1.000000e+00 : f32 loc(#loc46)
+    %cst_2 = arith.constant dense<1.000000e+00> : tensor<512xf32> loc(#loc46)
+    %2 = arith.addf %cst_2, %1 : tensor<512xf32> loc(#loc46)
+    %c1_i32_3 = arith.constant 1 : i32 loc(#loc47)
+    %cst_4 = arith.constant 1.000000e+00 : f32 loc(#loc47)
+    %cst_5 = arith.constant dense<1.000000e+00> : tensor<512xf32> loc(#loc47)
+    %3 = arith.divf %cst_5, %2 : tensor<512xf32> loc(#loc47)
+    tt.return %3 : tensor<512xf32> loc(#loc48)
+  ^bb1:  // no predecessors
+    %4 = ub.poison : tensor<512xf32> loc(#loc49)
+    tt.return %4 : tensor<512xf32> loc(#loc49)
+  } loc(#loc43)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":22:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":23:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":24:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":27:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":28:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":29:30)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":30:18)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:35)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:41)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:30)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:47)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:97)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":32:19)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":33:31)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":34:18)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:36)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:52)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:42)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:30)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:58)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:108)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":37:23)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":38:20)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:45)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:39)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:61)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:51)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:31)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:67)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:117)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":41:20)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":42:38)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":43:34)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":44:33)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:25)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:37)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:4)
+#loc44 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30)
+#loc45 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29)
+#loc46 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20)
+#loc47 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16)
+#loc48 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:11)
+#loc49 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:4)
+#loc54 = loc("xnumel"(#loc1))
+#loc55 = loc("xoffset"(#loc2))
+#loc56 = loc("xoffset"(#loc3))
+#loc57 = loc("xindex"(#loc4))
+#loc58 = loc("xindex"(#loc5))
+#loc59 = loc("xmask"(#loc6))
+#loc60 = loc("x0"(#loc7))
+#loc61 = loc("x1"(#loc8))
+#loc62 = loc("tmp1"(#loc9))
+#loc63 = loc("tmp2"(#loc10))
+#loc64 = loc("tmp3"(#loc11))
+#loc65 = loc("tmp4"(#loc12))
+#loc66 = loc("tmp5"(#loc13))
+#loc67 = loc("tmp5"(#loc14))
+#loc68 = loc("tmp5"(#loc15))
+#loc69 = loc("tmp5"(#loc16))
+#loc70 = loc("tmp5"(#loc17))
+#loc71 = loc("tmp6"(#loc18))
+#loc72 = loc("tmp7"(#loc19))
+#loc73 = loc("tmp8"(#loc20))
+#loc74 = loc("tmp9"(#loc21))
+#loc75 = loc("tmp9"(#loc22))
+#loc76 = loc("tmp9"(#loc23))
+#loc77 = loc("tmp9"(#loc24))
+#loc78 = loc("tmp9"(#loc25))
+#loc79 = loc("tmp9"(#loc26))
+#loc80 = loc("tmp11"(#loc27))
+#loc81 = loc("tmp12"(#loc28))
+#loc82 = loc("tmp14"(#loc29))
+#loc83 = loc("tmp14"(#loc30))
+#loc84 = loc("tmp14"(#loc31))
+#loc85 = loc("tmp14"(#loc32))
+#loc86 = loc("tmp14"(#loc33))
+#loc87 = loc("tmp14"(#loc34))
+#loc88 = loc("tmp14"(#loc35))
+#loc89 = loc("tmp15"(#loc36))
+#loc90 = loc("tmp16"(#loc37))
+#loc91 = loc("tmp17"(#loc38))
+#loc92 = loc("tmp18"(#loc39))
diff --git a/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.ttgir b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..2e44a0a90fa098dc93b4ab679a3fa89613988448
--- /dev/null
+++ b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.ttgir
@@ -0,0 +1,131 @@
+#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":18:0)
+#loc38 = loc("in_ptr0"(#loc))
+#loc39 = loc("in_ptr1"(#loc))
+#loc40 = loc("out_ptr0"(#loc))
+#loc41 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused_cat_mul_silu_split_view_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<12288> : tensor<512xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<-4096> : tensor<512xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<36864> : tensor<512xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<4096> : tensor<512xi32, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<4096> : tensor<512xi64, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<16384> : tensor<512xi32, #blocked> loc(#loc1)
+    %c512_i32 = arith.constant 512 : i32 loc(#loc1)
+    %cst_5 = arith.constant dense<0.000000e+00> : tensor<512xbf16, #blocked> loc(#loc1)
+    %cst_6 = arith.constant dense<1.000000e+00> : tensor<512xf32, #blocked> loc(#loc1)
+    %cst_7 = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc42)
+    %xoffset_8 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc43)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc44)
+    %xindex_9 = tt.splat %xoffset_8 : i32 -> tensor<512xi32, #blocked> loc(#loc45)
+    %xindex_10 = arith.addi %xindex_9, %xindex : tensor<512xi32, #blocked> loc(#loc45)
+    %x0 = arith.remsi %xindex_10, %cst_4 : tensor<512xi32, #blocked> loc(#loc46)
+    %x1 = arith.divsi %xindex_10, %cst_4 : tensor<512xi32, #blocked> loc(#loc47)
+    %tmp4 = arith.extsi %x0 : tensor<512xi32, #blocked> to tensor<512xi64, #blocked> loc(#loc48)
+    %tmp4_11 = arith.cmpi slt, %tmp4, %cst_3 : tensor<512xi64, #blocked> loc(#loc48)
+    %tmp5 = arith.muli %x1, %cst_2 : tensor<512xi32, #blocked> loc(#loc49)
+    %tmp5_12 = arith.addi %tmp5, %x0 : tensor<512xi32, #blocked> loc(#loc50)
+    %tmp5_13 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc51)
+    %tmp5_14 = tt.addptr %tmp5_13, %tmp5_12 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc51)
+    %tmp5_15 = tt.load %tmp5_14, %tmp4_11, %cst_5 evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc52)
+    %tmp5_16 = arith.extf %tmp5_15 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc53)
+    %tmp6 = arith.cmpi sge, %tmp4, %cst_3 : tensor<512xi64, #blocked> loc(#loc54)
+    %tmp9 = arith.muli %x1, %cst_1 : tensor<512xi32, #blocked> loc(#loc55)
+    %tmp9_17 = arith.addi %x0, %cst_0 : tensor<512xi32, #blocked> loc(#loc56)
+    %tmp9_18 = arith.addi %tmp9, %tmp9_17 : tensor<512xi32, #blocked> loc(#loc57)
+    %tmp9_19 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc58)
+    %tmp9_20 = tt.addptr %tmp9_19, %tmp9_18 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc58)
+    %tmp9_21 = tt.load %tmp9_20, %tmp6, %cst_5 evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc59)
+    %tmp9_22 = arith.extf %tmp9_21 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc60)
+    %tmp11 = arith.subf %cst_7, %tmp9_22 : tensor<512xf32, #blocked> loc(#loc71)
+    %tmp11_23 = math.exp %tmp11 : tensor<512xf32, #blocked> loc(#loc72)
+    %tmp11_24 = arith.addf %tmp11_23, %cst_6 : tensor<512xf32, #blocked> loc(#loc73)
+    %tmp11_25 = arith.divf %cst_6, %tmp11_24 : tensor<512xf32, #blocked> loc(#loc74)
+    %tmp12 = arith.mulf %tmp9_22, %tmp11_25 : tensor<512xf32, #blocked> loc(#loc62)
+    %tmp14 = arith.addi %tmp9, %cst : tensor<512xi32, #blocked> loc(#loc63)
+    %tmp14_26 = arith.addi %tmp14, %tmp9_17 : tensor<512xi32, #blocked> loc(#loc64)
+    %tmp14_27 = tt.addptr %tmp9_19, %tmp14_26 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc65)
+    %tmp14_28 = tt.load %tmp14_27, %tmp6, %cst_5 evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc66)
+    %tmp14_29 = arith.extf %tmp14_28 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc67)
+    %tmp15 = arith.mulf %tmp12, %tmp14_29 : tensor<512xf32, #blocked> loc(#loc68)
+    %tmp17 = arith.select %tmp6, %tmp15, %cst_7 : tensor<512xi1, #blocked>, tensor<512xf32, #blocked> loc(#loc69)
+    %tmp18 = arith.select %tmp4_11, %tmp5_16, %tmp17 : tensor<512xi1, #blocked>, tensor<512xf32, #blocked> loc(#loc70)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc35)
+    %1 = tt.addptr %0, %xindex_10 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc35)
+    %2 = arith.truncf %tmp18 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> loc(#loc36)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc36)
+    tt.return loc(#loc37)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":23:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":24:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":30:18)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:35)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:41)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:30)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:47)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:97)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":32:19)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:36)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:52)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:42)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:30)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:58)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:108)
+#loc21 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":37:23)
+#loc23 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":38:20)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:39)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:51)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:31)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:67)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:117)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":41:20)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":43:34)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":44:33)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:25)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:37)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:4)
+#loc42 = loc("xoffset"(#loc2))
+#loc43 = loc("xoffset"(#loc3))
+#loc44 = loc("xindex"(#loc4))
+#loc45 = loc("xindex"(#loc5))
+#loc46 = loc("x0"(#loc6))
+#loc47 = loc("x1"(#loc7))
+#loc48 = loc("tmp4"(#loc8))
+#loc49 = loc("tmp5"(#loc9))
+#loc50 = loc("tmp5"(#loc10))
+#loc51 = loc("tmp5"(#loc11))
+#loc52 = loc("tmp5"(#loc12))
+#loc53 = loc("tmp5"(#loc13))
+#loc54 = loc("tmp6"(#loc14))
+#loc55 = loc("tmp9"(#loc15))
+#loc56 = loc("tmp9"(#loc16))
+#loc57 = loc("tmp9"(#loc17))
+#loc58 = loc("tmp9"(#loc18))
+#loc59 = loc("tmp9"(#loc19))
+#loc60 = loc("tmp9"(#loc20))
+#loc61 = loc("tmp11"(#loc22))
+#loc62 = loc("tmp12"(#loc26))
+#loc63 = loc("tmp14"(#loc27))
+#loc64 = loc("tmp14"(#loc28))
+#loc65 = loc("tmp14"(#loc29))
+#loc66 = loc("tmp14"(#loc30))
+#loc67 = loc("tmp14"(#loc31))
+#loc68 = loc("tmp15"(#loc32))
+#loc69 = loc("tmp17"(#loc33))
+#loc70 = loc("tmp18"(#loc34))
+#loc71 = loc(callsite(#loc21 at #loc61))
+#loc72 = loc(callsite(#loc23 at #loc61))
+#loc73 = loc(callsite(#loc24 at #loc61))
+#loc74 = loc(callsite(#loc25 at #loc61))
diff --git a/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.ttir b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..00ba1b95891e8658f6973f6000a7e178586ec370
--- /dev/null
+++ b/triton/PPN4SVQW2UFKVPWUB7HCOIHQMJON3EA6PX7FI3IPMCGAPBBOTNMQ/triton_poi_fused_cat_mul_silu_split_view_0.ttir
@@ -0,0 +1,131 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":18:0)
+#loc38 = loc("in_ptr0"(#loc))
+#loc39 = loc("in_ptr1"(#loc))
+#loc40 = loc("out_ptr0"(#loc))
+#loc41 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_cat_mul_silu_split_view_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %tmp11 = arith.constant dense<1.000000e+00> : tensor<512xf32> loc(#loc71)
+    %cst = arith.constant dense<0.000000e+00> : tensor<512xbf16> loc(#loc1)
+    %tmp14 = arith.constant dense<12288> : tensor<512xi32> loc(#loc43)
+    %cst_0 = arith.constant dense<-4096> : tensor<512xi32> loc(#loc1)
+    %cst_1 = arith.constant dense<36864> : tensor<512xi32> loc(#loc1)
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc1)
+    %tmp5 = arith.constant dense<4096> : tensor<512xi32> loc(#loc44)
+    %cst_3 = arith.constant dense<4096> : tensor<512xi64> loc(#loc1)
+    %cst_4 = arith.constant dense<16384> : tensor<512xi32> loc(#loc1)
+    %c512_i32 = arith.constant 512 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc45)
+    %xoffset_5 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc46)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc47)
+    %xindex_6 = tt.splat %xoffset_5 : i32 -> tensor<512xi32> loc(#loc48)
+    %xindex_7 = arith.addi %xindex_6, %xindex : tensor<512xi32> loc(#loc48)
+    %x0 = arith.remsi %xindex_7, %cst_4 : tensor<512xi32> loc(#loc49)
+    %x1 = arith.divsi %xindex_7, %cst_4 : tensor<512xi32> loc(#loc50)
+    %tmp4 = arith.extsi %x0 : tensor<512xi32> to tensor<512xi64> loc(#loc51)
+    %tmp4_8 = arith.cmpi slt, %tmp4, %cst_3 : tensor<512xi64> loc(#loc51)
+    %tmp5_9 = arith.muli %x1, %tmp5 : tensor<512xi32> loc(#loc44)
+    %tmp5_10 = arith.addi %tmp5_9, %x0 : tensor<512xi32> loc(#loc52)
+    %tmp5_11 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc53)
+    %tmp5_12 = tt.addptr %tmp5_11, %tmp5_10 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc53)
+    %tmp5_13 = tt.load %tmp5_12, %tmp4_8, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>> loc(#loc54)
+    %tmp5_14 = arith.extf %tmp5_13 : tensor<512xbf16> to tensor<512xf32> loc(#loc55)
+    %tmp6 = arith.cmpi sge, %tmp4, %cst_3 : tensor<512xi64> loc(#loc56)
+    %tmp9 = arith.muli %x1, %cst_1 : tensor<512xi32> loc(#loc57)
+    %tmp9_15 = arith.addi %x0, %cst_0 : tensor<512xi32> loc(#loc58)
+    %tmp9_16 = arith.addi %tmp9, %tmp9_15 : tensor<512xi32> loc(#loc59)
+    %tmp9_17 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc60)
+    %tmp9_18 = tt.addptr %tmp9_17, %tmp9_16 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc60)
+    %tmp9_19 = tt.load %tmp9_18, %tmp6, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>> loc(#loc61)
+    %tmp9_20 = arith.extf %tmp9_19 : tensor<512xbf16> to tensor<512xf32> loc(#loc62)
+    %tmp11_21 = arith.subf %cst_2, %tmp9_20 : tensor<512xf32> loc(#loc72)
+    %tmp11_22 = math.exp %tmp11_21 : tensor<512xf32> loc(#loc73)
+    %tmp11_23 = arith.addf %tmp11_22, %tmp11 : tensor<512xf32> loc(#loc74)
+    %tmp11_24 = arith.divf %tmp11, %tmp11_23 : tensor<512xf32> loc(#loc75)
+    %tmp12 = arith.mulf %tmp9_20, %tmp11_24 : tensor<512xf32> loc(#loc63)
+    %tmp14_25 = arith.addi %tmp9, %tmp14 : tensor<512xi32> loc(#loc43)
+    %tmp14_26 = arith.addi %tmp14_25, %tmp9_15 : tensor<512xi32> loc(#loc64)
+    %tmp14_27 = tt.addptr %tmp9_17, %tmp14_26 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc65)
+    %tmp14_28 = tt.load %tmp14_27, %tmp6, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>> loc(#loc66)
+    %tmp14_29 = arith.extf %tmp14_28 : tensor<512xbf16> to tensor<512xf32> loc(#loc67)
+    %tmp15 = arith.mulf %tmp12, %tmp14_29 : tensor<512xf32> loc(#loc68)
+    %tmp17 = arith.select %tmp6, %tmp15, %cst_2 : tensor<512xi1>, tensor<512xf32> loc(#loc69)
+    %tmp18 = arith.select %tmp4_8, %tmp5_14, %tmp17 : tensor<512xi1>, tensor<512xf32> loc(#loc70)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc35)
+    %1 = tt.addptr %0, %xindex_7 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc35)
+    %2 = arith.truncf %tmp18 : tensor<512xf32> to tensor<512xbf16> loc(#loc36)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>> loc(#loc36)
+    tt.return loc(#loc37)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":37:23)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:39)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:35)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":20:28)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":20:33)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":21:36)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":21:23)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":23:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":24:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":30:18)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:41)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:30)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:47)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":31:97)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":32:19)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:36)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:52)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:42)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:30)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:58)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":35:108)
+#loc23 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20)
+#loc26 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":38:20)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:51)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:31)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:67)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":40:117)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":41:20)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":43:34)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":44:33)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:25)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:37)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3i/c3imyfibcq3zwrc5gvfscvpsaxdzjijymrnt2lfahtrjmrbtlmhe.py":45:4)
+#loc42 = loc("tmp11"(#loc2))
+#loc43 = loc("tmp14"(#loc3))
+#loc44 = loc("tmp5"(#loc4))
+#loc45 = loc("xoffset"(#loc5))
+#loc46 = loc("xoffset"(#loc6))
+#loc47 = loc("xindex"(#loc7))
+#loc48 = loc("xindex"(#loc8))
+#loc49 = loc("x0"(#loc9))
+#loc50 = loc("x1"(#loc10))
+#loc51 = loc("tmp4"(#loc11))
+#loc52 = loc("tmp5"(#loc12))
+#loc53 = loc("tmp5"(#loc13))
+#loc54 = loc("tmp5"(#loc14))
+#loc55 = loc("tmp5"(#loc15))
+#loc56 = loc("tmp6"(#loc16))
+#loc57 = loc("tmp9"(#loc17))
+#loc58 = loc("tmp9"(#loc18))
+#loc59 = loc("tmp9"(#loc19))
+#loc60 = loc("tmp9"(#loc20))
+#loc61 = loc("tmp9"(#loc21))
+#loc62 = loc("tmp9"(#loc22))
+#loc63 = loc("tmp12"(#loc27))
+#loc64 = loc("tmp14"(#loc28))
+#loc65 = loc("tmp14"(#loc29))
+#loc66 = loc("tmp14"(#loc30))
+#loc67 = loc("tmp14"(#loc31))
+#loc68 = loc("tmp15"(#loc32))
+#loc69 = loc("tmp17"(#loc33))
+#loc70 = loc("tmp18"(#loc34))
+#loc71 = loc(callsite(#loc1 at #loc42))
+#loc72 = loc(callsite(#loc23 at #loc42))
+#loc73 = loc(callsite(#loc24 at #loc42))
+#loc74 = loc(callsite(#loc25 at #loc42))
+#loc75 = loc(callsite(#loc26 at #loc42))
diff --git a/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/__grp__triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/__grp__triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json
new file mode 100644
index 0000000000000000000000000000000000000000..8eb4e3e423032ffa52dd92e1fbf9861e346160e0
--- /dev/null
+++ b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/__grp__triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.source", "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttir", "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttgir", "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.llir", "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ptx", "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.cubin": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.cubin", "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json"}}
\ No newline at end of file
diff --git a/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.cubin b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..55293d77a04b6fac639d79ab1f1a84c382320c33
Binary files /dev/null and b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.cubin differ
diff --git a/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json
new file mode 100644
index 0000000000000000000000000000000000000000..9061cb26f26e5c0fc6a971a0fc0dc63f2489555a
--- /dev/null
+++ b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json
@@ -0,0 +1 @@
+{"hash": "7c99f681a5ad883c51f7550cebf0d62d696f661722eb7674dd5b2de8d917889d", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3"}
\ No newline at end of file
diff --git a/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.llir b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.llir
new file mode 100644
index 0000000000000000000000000000000000000000..9a16126e00981226fe93ad6fe3acb0ee26d9494b
--- /dev/null
+++ b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.llir
@@ -0,0 +1,493 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, ptr addrspace(1) readnone captures(none) %7, ptr addrspace(1) readnone captures(none) %8) local_unnamed_addr #0 !dbg !4 {
+  %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %11 = shl i32 %10, 10, !dbg !8
+  %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %13 = shl nuw nsw i32 %12, 3, !dbg !9
+  %14 = and i32 %13, 1016, !dbg !9
+  %15 = or disjoint i32 %14, %11, !dbg !10
+  %16 = or i32 %11, %13, !dbg !9
+  %17 = or disjoint i32 %16, 2, !dbg !10
+  %18 = or disjoint i32 %16, 4, !dbg !10
+  %19 = or disjoint i32 %16, 6, !dbg !10
+  %20 = sdiv i32 %15, 128, !dbg !11
+  %21 = mul i32 %20, 128, !dbg !12
+  %.decomposed = sub i32 %15, %21, !dbg !12
+  %22 = srem i32 %17, 128, !dbg !12
+  %23 = srem i32 %18, 128, !dbg !12
+  %24 = srem i32 %19, 128, !dbg !12
+  %25 = sdiv i32 %15, 4096, !dbg !13
+  %26 = sext i32 %15 to i64, !dbg !14
+  %27 = getelementptr bfloat, ptr addrspace(1) %0, i64 %26, !dbg !14
+  %28 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %27) #2, !dbg !15
+  %29 = extractvalue { i32, i32, i32, i32 } %28, 0, !dbg !15
+  %30 = bitcast i32 %29 to <2 x bfloat>, !dbg !15
+  %31 = extractvalue { i32, i32, i32, i32 } %28, 1, !dbg !15
+  %32 = bitcast i32 %31 to <2 x bfloat>, !dbg !15
+  %33 = extractvalue { i32, i32, i32, i32 } %28, 2, !dbg !15
+  %34 = bitcast i32 %33 to <2 x bfloat>, !dbg !15
+  %35 = extractvalue { i32, i32, i32, i32 } %28, 3, !dbg !15
+  %36 = bitcast i32 %35 to <2 x bfloat>, !dbg !15
+  %37 = shl nsw i32 %25, 7, !dbg !16
+  %38 = add nsw i32 %37, %.decomposed, !dbg !17
+  %39 = add nsw i32 %37, %23, !dbg !17
+  %40 = sext i32 %38 to i64, !dbg !18
+  %41 = getelementptr float, ptr addrspace(1) %1, i64 %40, !dbg !18
+  %42 = sext i32 %39 to i64, !dbg !18
+  %43 = getelementptr float, ptr addrspace(1) %1, i64 %42, !dbg !18
+  %44 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !19
+  %45 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ], $5;", "=r,=r,=r,=r,l,l"(ptr addrspace(1) %41, i64 %44) #2, !dbg !19
+  %46 = extractvalue { i32, i32, i32, i32 } %45, 0, !dbg !19
+  %47 = extractvalue { i32, i32, i32, i32 } %45, 1, !dbg !19
+  %48 = extractvalue { i32, i32, i32, i32 } %45, 2, !dbg !19
+  %49 = extractvalue { i32, i32, i32, i32 } %45, 3, !dbg !19
+  %50 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !19
+  %51 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ], $5;", "=r,=r,=r,=r,l,l"(ptr addrspace(1) %43, i64 %50) #2, !dbg !19
+  %52 = extractvalue { i32, i32, i32, i32 } %51, 0, !dbg !19
+  %53 = extractvalue { i32, i32, i32, i32 } %51, 1, !dbg !19
+  %54 = extractvalue { i32, i32, i32, i32 } %51, 2, !dbg !19
+  %55 = extractvalue { i32, i32, i32, i32 } %51, 3, !dbg !19
+  %56 = getelementptr float, ptr addrspace(1) %2, i64 %40, !dbg !20
+  %57 = getelementptr float, ptr addrspace(1) %2, i64 %42, !dbg !20
+  %58 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !21
+  %59 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ], $5;", "=r,=r,=r,=r,l,l"(ptr addrspace(1) %56, i64 %58) #2, !dbg !21
+  %60 = extractvalue { i32, i32, i32, i32 } %59, 0, !dbg !21
+  %61 = extractvalue { i32, i32, i32, i32 } %59, 1, !dbg !21
+  %62 = extractvalue { i32, i32, i32, i32 } %59, 2, !dbg !21
+  %63 = extractvalue { i32, i32, i32, i32 } %59, 3, !dbg !21
+  %64 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !21
+  %65 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ], $5;", "=r,=r,=r,=r,l,l"(ptr addrspace(1) %57, i64 %64) #2, !dbg !21
+  %66 = extractvalue { i32, i32, i32, i32 } %65, 0, !dbg !21
+  %67 = extractvalue { i32, i32, i32, i32 } %65, 1, !dbg !21
+  %68 = extractvalue { i32, i32, i32, i32 } %65, 2, !dbg !21
+  %69 = extractvalue { i32, i32, i32, i32 } %65, 3, !dbg !21
+  %70 = getelementptr bfloat, ptr addrspace(1) %3, i64 %26, !dbg !22
+  %71 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %70) #2, !dbg !23
+  %72 = extractvalue { i32, i32, i32, i32 } %71, 0, !dbg !23
+  %73 = bitcast i32 %72 to <2 x bfloat>, !dbg !23
+  %74 = extractvalue { i32, i32, i32, i32 } %71, 1, !dbg !23
+  %75 = bitcast i32 %74 to <2 x bfloat>, !dbg !23
+  %76 = extractvalue { i32, i32, i32, i32 } %71, 2, !dbg !23
+  %77 = bitcast i32 %76 to <2 x bfloat>, !dbg !23
+  %78 = extractvalue { i32, i32, i32, i32 } %71, 3, !dbg !23
+  %79 = bitcast i32 %78 to <2 x bfloat>, !dbg !23
+  %80 = insertelement <4 x i32> poison, i32 %15, i64 0, !dbg !10
+  %81 = shufflevector <4 x i32> %80, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !10
+  %82 = or disjoint <4 x i32> %81, <i32 7, i32 5, i32 3, i32 1>, !dbg !10
+  %83 = extractelement <4 x i32> %82, i64 3, !dbg !12
+  %84 = srem i32 %83, 128, !dbg !12
+  %85 = extractelement <4 x i32> %82, i64 2, !dbg !12
+  %86 = srem i32 %85, 128, !dbg !12
+  %87 = extractelement <4 x i32> %82, i64 1, !dbg !12
+  %88 = srem i32 %87, 128, !dbg !12
+  %89 = extractelement <4 x i32> %82, i64 0, !dbg !12
+  %90 = srem i32 %89, 128, !dbg !12
+  %91 = srem <4 x i32> %82, splat (i32 2), !dbg !24
+  %92 = icmp slt <4 x i32> %91, splat (i32 1), !dbg !25
+  %.lhs.trunc = trunc nsw i32 %84 to i8, !dbg !26
+  %93 = sdiv i8 %.lhs.trunc, 2, !dbg !26
+  %.sext = sext i8 %93 to i32, !dbg !26
+  %.lhs.trunc1 = trunc nsw i32 %86 to i8, !dbg !26
+  %94 = sdiv i8 %.lhs.trunc1, 2, !dbg !26
+  %.sext2 = sext i8 %94 to i32, !dbg !26
+  %.lhs.trunc3 = trunc nsw i32 %88 to i8, !dbg !26
+  %95 = sdiv i8 %.lhs.trunc3, 2, !dbg !26
+  %.sext4 = sext i8 %95 to i32, !dbg !26
+  %.lhs.trunc5 = trunc nsw i32 %90 to i8, !dbg !26
+  %96 = sdiv i8 %.lhs.trunc5, 2, !dbg !26
+  %.sext6 = sext i8 %96 to i32, !dbg !26
+  %97 = shl nsw i32 %.sext, 1, !dbg !27
+  %98 = shl nsw i32 %.sext2, 1, !dbg !27
+  %99 = shl nsw i32 %.sext4, 1, !dbg !27
+  %100 = shl nsw i32 %.sext6, 1, !dbg !27
+  %101 = or disjoint i32 %.decomposed, 1, !dbg !28
+  %102 = or disjoint i32 %22, 1, !dbg !28
+  %103 = or disjoint i32 %23, 1, !dbg !28
+  %104 = or disjoint i32 %24, 1, !dbg !28
+  %105 = shl nsw i32 %20, 7, !dbg !29
+  %106 = add i32 %101, %105, !dbg !30
+  %107 = or disjoint i32 %105, 1, !dbg !28
+  %108 = add i32 %107, %97, !dbg !30
+  %109 = add i32 %102, %105, !dbg !30
+  %110 = add i32 %107, %98, !dbg !30
+  %111 = add i32 %103, %105, !dbg !30
+  %112 = add i32 %107, %99, !dbg !30
+  %113 = add i32 %104, %105, !dbg !30
+  %114 = add i32 %107, %100, !dbg !30
+  %115 = sext i32 %106 to i64, !dbg !31
+  %116 = getelementptr bfloat, ptr addrspace(1) %0, i64 %115, !dbg !31
+  %117 = sext i32 %108 to i64, !dbg !31
+  %118 = getelementptr bfloat, ptr addrspace(1) %0, i64 %117, !dbg !31
+  %119 = sext i32 %109 to i64, !dbg !31
+  %120 = getelementptr bfloat, ptr addrspace(1) %0, i64 %119, !dbg !31
+  %121 = sext i32 %110 to i64, !dbg !31
+  %122 = getelementptr bfloat, ptr addrspace(1) %0, i64 %121, !dbg !31
+  %123 = sext i32 %111 to i64, !dbg !31
+  %124 = getelementptr bfloat, ptr addrspace(1) %0, i64 %123, !dbg !31
+  %125 = sext i32 %112 to i64, !dbg !31
+  %126 = getelementptr bfloat, ptr addrspace(1) %0, i64 %125, !dbg !31
+  %127 = sext i32 %113 to i64, !dbg !31
+  %128 = getelementptr bfloat, ptr addrspace(1) %0, i64 %127, !dbg !31
+  %129 = sext i32 %114 to i64, !dbg !31
+  %130 = getelementptr bfloat, ptr addrspace(1) %0, i64 %129, !dbg !31
+  %131 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !32
+  %132 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %116, i64 %131, i1 true) #2, !dbg !32
+  %133 = bitcast i16 %132 to bfloat, !dbg !32
+  %134 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !32
+  %135 = extractelement <4 x i1> %92, i64 3, !dbg !32
+  %136 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %118, i64 %134, i1 %135) #2, !dbg !32
+  %137 = bitcast i16 %136 to bfloat, !dbg !32
+  %138 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !32
+  %139 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %120, i64 %138, i1 true) #2, !dbg !32
+  %140 = bitcast i16 %139 to bfloat, !dbg !32
+  %141 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !32
+  %142 = extractelement <4 x i1> %92, i64 2, !dbg !32
+  %143 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %122, i64 %141, i1 %142) #2, !dbg !32
+  %144 = bitcast i16 %143 to bfloat, !dbg !32
+  %145 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !32
+  %146 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %124, i64 %145, i1 true) #2, !dbg !32
+  %147 = bitcast i16 %146 to bfloat, !dbg !32
+  %148 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !32
+  %149 = extractelement <4 x i1> %92, i64 1, !dbg !32
+  %150 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %126, i64 %148, i1 %149) #2, !dbg !32
+  %151 = bitcast i16 %150 to bfloat, !dbg !32
+  %152 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !32
+  %153 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %128, i64 %152, i1 true) #2, !dbg !32
+  %154 = bitcast i16 %153 to bfloat, !dbg !32
+  %155 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !32
+  %156 = extractelement <4 x i1> %92, i64 0, !dbg !32
+  %157 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %130, i64 %155, i1 %156) #2, !dbg !32
+  %158 = bitcast i16 %157 to bfloat, !dbg !32
+  %159 = fpext bfloat %133 to float, !dbg !33
+  %160 = fpext bfloat %137 to float, !dbg !33
+  %161 = fpext bfloat %140 to float, !dbg !33
+  %162 = fpext bfloat %144 to float, !dbg !33
+  %163 = fpext bfloat %147 to float, !dbg !33
+  %164 = fpext bfloat %151 to float, !dbg !33
+  %165 = fpext bfloat %154 to float, !dbg !33
+  %166 = fpext bfloat %158 to float, !dbg !33
+  %167 = fsub float 0.000000e+00, %159, !dbg !34
+  %168 = fsub float 0.000000e+00, %160, !dbg !34
+  %169 = fsub float 0.000000e+00, %161, !dbg !34
+  %170 = fsub float 0.000000e+00, %162, !dbg !34
+  %171 = fsub float 0.000000e+00, %163, !dbg !34
+  %172 = fsub float 0.000000e+00, %164, !dbg !34
+  %173 = fsub float 0.000000e+00, %165, !dbg !34
+  %174 = fsub float 0.000000e+00, %166, !dbg !34
+  %175 = extractelement <4 x i32> %91, i64 3, !dbg !35
+  %176 = icmp sgt i32 %175, 0, !dbg !35
+  %177 = extractelement <4 x i32> %91, i64 2, !dbg !35
+  %178 = icmp sgt i32 %177, 0, !dbg !35
+  %179 = extractelement <4 x i32> %91, i64 1, !dbg !35
+  %180 = icmp sgt i32 %179, 0, !dbg !35
+  %181 = extractelement <4 x i32> %91, i64 0, !dbg !35
+  %182 = icmp sgt i32 %181, 0, !dbg !35
+  %183 = add i32 %105, %.decomposed, !dbg !36
+  %184 = add i32 %97, %105, !dbg !36
+  %185 = add i32 %105, %22, !dbg !36
+  %186 = add i32 %98, %105, !dbg !36
+  %187 = add i32 %105, %23, !dbg !36
+  %188 = add i32 %99, %105, !dbg !36
+  %189 = add i32 %105, %24, !dbg !36
+  %190 = add i32 %100, %105, !dbg !36
+  %191 = sext i32 %183 to i64, !dbg !37
+  %192 = getelementptr bfloat, ptr addrspace(1) %0, i64 %191, !dbg !37
+  %193 = sext i32 %184 to i64, !dbg !37
+  %194 = getelementptr bfloat, ptr addrspace(1) %0, i64 %193, !dbg !37
+  %195 = sext i32 %185 to i64, !dbg !37
+  %196 = getelementptr bfloat, ptr addrspace(1) %0, i64 %195, !dbg !37
+  %197 = sext i32 %186 to i64, !dbg !37
+  %198 = getelementptr bfloat, ptr addrspace(1) %0, i64 %197, !dbg !37
+  %199 = sext i32 %187 to i64, !dbg !37
+  %200 = getelementptr bfloat, ptr addrspace(1) %0, i64 %199, !dbg !37
+  %201 = sext i32 %188 to i64, !dbg !37
+  %202 = getelementptr bfloat, ptr addrspace(1) %0, i64 %201, !dbg !37
+  %203 = sext i32 %189 to i64, !dbg !37
+  %204 = getelementptr bfloat, ptr addrspace(1) %0, i64 %203, !dbg !37
+  %205 = sext i32 %190 to i64, !dbg !37
+  %206 = getelementptr bfloat, ptr addrspace(1) %0, i64 %205, !dbg !37
+  %207 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !38
+  %208 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %192, i64 %207, i1 false) #2, !dbg !38
+  %209 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !38
+  %210 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %194, i64 %209, i1 %176) #2, !dbg !38
+  %211 = bitcast i16 %210 to bfloat, !dbg !38
+  %212 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !38
+  %213 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %196, i64 %212, i1 false) #2, !dbg !38
+  %214 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !38
+  %215 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %198, i64 %214, i1 %178) #2, !dbg !38
+  %216 = bitcast i16 %215 to bfloat, !dbg !38
+  %217 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !38
+  %218 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %200, i64 %217, i1 false) #2, !dbg !38
+  %219 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !38
+  %220 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %202, i64 %219, i1 %180) #2, !dbg !38
+  %221 = bitcast i16 %220 to bfloat, !dbg !38
+  %222 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !38
+  %223 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %204, i64 %222, i1 false) #2, !dbg !38
+  %224 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !38
+  %225 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %206, i64 %224, i1 %182) #2, !dbg !38
+  %226 = bitcast i16 %225 to bfloat, !dbg !38
+  %227 = fpext bfloat %211 to float, !dbg !39
+  %228 = fpext bfloat %216 to float, !dbg !39
+  %229 = fpext bfloat %221 to float, !dbg !39
+  %230 = fpext bfloat %226 to float, !dbg !39
+  %231 = select i1 %135, float %168, float %227, !dbg !40
+  %232 = select i1 %142, float %170, float %228, !dbg !40
+  %233 = select i1 %149, float %172, float %229, !dbg !40
+  %234 = select i1 %156, float %174, float %230, !dbg !40
+  %235 = getelementptr bfloat, ptr addrspace(1) %3, i64 %115, !dbg !41
+  %236 = getelementptr bfloat, ptr addrspace(1) %3, i64 %117, !dbg !41
+  %237 = getelementptr bfloat, ptr addrspace(1) %3, i64 %119, !dbg !41
+  %238 = getelementptr bfloat, ptr addrspace(1) %3, i64 %121, !dbg !41
+  %239 = getelementptr bfloat, ptr addrspace(1) %3, i64 %123, !dbg !41
+  %240 = getelementptr bfloat, ptr addrspace(1) %3, i64 %125, !dbg !41
+  %241 = getelementptr bfloat, ptr addrspace(1) %3, i64 %127, !dbg !41
+  %242 = getelementptr bfloat, ptr addrspace(1) %3, i64 %129, !dbg !41
+  %243 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !42
+  %244 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %235, i64 %243, i1 true) #2, !dbg !42
+  %245 = bitcast i16 %244 to bfloat, !dbg !42
+  %246 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !42
+  %247 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %236, i64 %246, i1 %135) #2, !dbg !42
+  %248 = bitcast i16 %247 to bfloat, !dbg !42
+  %249 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !42
+  %250 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %237, i64 %249, i1 true) #2, !dbg !42
+  %251 = bitcast i16 %250 to bfloat, !dbg !42
+  %252 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !42
+  %253 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %238, i64 %252, i1 %142) #2, !dbg !42
+  %254 = bitcast i16 %253 to bfloat, !dbg !42
+  %255 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !42
+  %256 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %239, i64 %255, i1 true) #2, !dbg !42
+  %257 = bitcast i16 %256 to bfloat, !dbg !42
+  %258 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !42
+  %259 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %240, i64 %258, i1 %149) #2, !dbg !42
+  %260 = bitcast i16 %259 to bfloat, !dbg !42
+  %261 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !42
+  %262 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %241, i64 %261, i1 true) #2, !dbg !42
+  %263 = bitcast i16 %262 to bfloat, !dbg !42
+  %264 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !42
+  %265 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %242, i64 %264, i1 %156) #2, !dbg !42
+  %266 = bitcast i16 %265 to bfloat, !dbg !42
+  %267 = fpext bfloat %245 to float, !dbg !43
+  %268 = fpext bfloat %248 to float, !dbg !43
+  %269 = fpext bfloat %251 to float, !dbg !43
+  %270 = fpext bfloat %254 to float, !dbg !43
+  %271 = fpext bfloat %257 to float, !dbg !43
+  %272 = fpext bfloat %260 to float, !dbg !43
+  %273 = fpext bfloat %263 to float, !dbg !43
+  %274 = fpext bfloat %266 to float, !dbg !43
+  %275 = fsub float 0.000000e+00, %267, !dbg !44
+  %276 = fsub float 0.000000e+00, %268, !dbg !44
+  %277 = fsub float 0.000000e+00, %269, !dbg !44
+  %278 = fsub float 0.000000e+00, %270, !dbg !44
+  %279 = fsub float 0.000000e+00, %271, !dbg !44
+  %280 = fsub float 0.000000e+00, %272, !dbg !44
+  %281 = fsub float 0.000000e+00, %273, !dbg !44
+  %282 = fsub float 0.000000e+00, %274, !dbg !44
+  %283 = getelementptr bfloat, ptr addrspace(1) %3, i64 %191, !dbg !45
+  %284 = getelementptr bfloat, ptr addrspace(1) %3, i64 %193, !dbg !45
+  %285 = getelementptr bfloat, ptr addrspace(1) %3, i64 %195, !dbg !45
+  %286 = getelementptr bfloat, ptr addrspace(1) %3, i64 %197, !dbg !45
+  %287 = getelementptr bfloat, ptr addrspace(1) %3, i64 %199, !dbg !45
+  %288 = getelementptr bfloat, ptr addrspace(1) %3, i64 %201, !dbg !45
+  %289 = getelementptr bfloat, ptr addrspace(1) %3, i64 %203, !dbg !45
+  %290 = getelementptr bfloat, ptr addrspace(1) %3, i64 %205, !dbg !45
+  %291 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !46
+  %292 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %283, i64 %291, i1 false) #2, !dbg !46
+  %293 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !46
+  %294 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %284, i64 %293, i1 %176) #2, !dbg !46
+  %295 = bitcast i16 %294 to bfloat, !dbg !46
+  %296 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !46
+  %297 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %285, i64 %296, i1 false) #2, !dbg !46
+  %298 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !46
+  %299 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %286, i64 %298, i1 %178) #2, !dbg !46
+  %300 = bitcast i16 %299 to bfloat, !dbg !46
+  %301 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !46
+  %302 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %287, i64 %301, i1 false) #2, !dbg !46
+  %303 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !46
+  %304 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %288, i64 %303, i1 %180) #2, !dbg !46
+  %305 = bitcast i16 %304 to bfloat, !dbg !46
+  %306 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !46
+  %307 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %289, i64 %306, i1 false) #2, !dbg !46
+  %308 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !46
+  %309 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %290, i64 %308, i1 %182) #2, !dbg !46
+  %310 = bitcast i16 %309 to bfloat, !dbg !46
+  %311 = fpext bfloat %295 to float, !dbg !47
+  %312 = fpext bfloat %300 to float, !dbg !47
+  %313 = fpext bfloat %305 to float, !dbg !47
+  %314 = fpext bfloat %310 to float, !dbg !47
+  %315 = select i1 %135, float %276, float %311, !dbg !40
+  %316 = select i1 %142, float %278, float %312, !dbg !40
+  %317 = select i1 %149, float %280, float %313, !dbg !40
+  %318 = select i1 %156, float %282, float %314, !dbg !40
+  %319 = getelementptr bfloat, ptr addrspace(1) %4, i64 %26, !dbg !48
+  %320 = fpext <2 x bfloat> %30 to <2 x float>, !dbg !49
+  %321 = insertelement <2 x i32> poison, i32 %46, i64 0, !dbg !19
+  %322 = insertelement <2 x i32> %321, i32 %47, i64 1, !dbg !19
+  %323 = bitcast <2 x i32> %322 to <2 x float>, !dbg !19
+  %324 = insertelement <2 x i32> poison, i32 %60, i64 0, !dbg !21
+  %325 = insertelement <2 x i32> %324, i32 %61, i64 1, !dbg !21
+  %326 = bitcast <2 x i32> %325 to <2 x float>, !dbg !21
+  %327 = fmul <2 x float> %320, %323, !dbg !50
+  %328 = insertelement <2 x float> poison, float %167, i64 0, !dbg !51
+  %329 = insertelement <2 x float> %328, float %231, i64 1, !dbg !51
+  %330 = fmul <2 x float> %329, %326, !dbg !51
+  %331 = fadd <2 x float> %327, %330, !dbg !52
+  %332 = fptrunc <2 x float> %331 to <2 x bfloat>, !dbg !53
+  %333 = fpext <2 x bfloat> %32 to <2 x float>, !dbg !49
+  %334 = insertelement <2 x i32> poison, i32 %48, i64 0, !dbg !19
+  %335 = insertelement <2 x i32> %334, i32 %49, i64 1, !dbg !19
+  %336 = bitcast <2 x i32> %335 to <2 x float>, !dbg !19
+  %337 = insertelement <2 x i32> poison, i32 %62, i64 0, !dbg !21
+  %338 = insertelement <2 x i32> %337, i32 %63, i64 1, !dbg !21
+  %339 = bitcast <2 x i32> %338 to <2 x float>, !dbg !21
+  %340 = fmul <2 x float> %333, %336, !dbg !50
+  %341 = insertelement <2 x float> poison, float %169, i64 0, !dbg !51
+  %342 = insertelement <2 x float> %341, float %232, i64 1, !dbg !51
+  %343 = fmul <2 x float> %342, %339, !dbg !51
+  %344 = fadd <2 x float> %340, %343, !dbg !52
+  %345 = fptrunc <2 x float> %344 to <2 x bfloat>, !dbg !53
+  %346 = fpext <2 x bfloat> %34 to <2 x float>, !dbg !49
+  %347 = insertelement <2 x i32> poison, i32 %52, i64 0, !dbg !19
+  %348 = insertelement <2 x i32> %347, i32 %53, i64 1, !dbg !19
+  %349 = bitcast <2 x i32> %348 to <2 x float>, !dbg !19
+  %350 = insertelement <2 x i32> poison, i32 %66, i64 0, !dbg !21
+  %351 = insertelement <2 x i32> %350, i32 %67, i64 1, !dbg !21
+  %352 = bitcast <2 x i32> %351 to <2 x float>, !dbg !21
+  %353 = fmul <2 x float> %346, %349, !dbg !50
+  %354 = insertelement <2 x float> poison, float %171, i64 0, !dbg !51
+  %355 = insertelement <2 x float> %354, float %233, i64 1, !dbg !51
+  %356 = fmul <2 x float> %355, %352, !dbg !51
+  %357 = fadd <2 x float> %353, %356, !dbg !52
+  %358 = fptrunc <2 x float> %357 to <2 x bfloat>, !dbg !53
+  %359 = fpext <2 x bfloat> %36 to <2 x float>, !dbg !49
+  %360 = insertelement <2 x i32> poison, i32 %54, i64 0, !dbg !19
+  %361 = insertelement <2 x i32> %360, i32 %55, i64 1, !dbg !19
+  %362 = bitcast <2 x i32> %361 to <2 x float>, !dbg !19
+  %363 = insertelement <2 x i32> poison, i32 %68, i64 0, !dbg !21
+  %364 = insertelement <2 x i32> %363, i32 %69, i64 1, !dbg !21
+  %365 = bitcast <2 x i32> %364 to <2 x float>, !dbg !21
+  %366 = fmul <2 x float> %359, %362, !dbg !50
+  %367 = insertelement <2 x float> poison, float %173, i64 0, !dbg !51
+  %368 = insertelement <2 x float> %367, float %234, i64 1, !dbg !51
+  %369 = fmul <2 x float> %368, %365, !dbg !51
+  %370 = fadd <2 x float> %366, %369, !dbg !52
+  %371 = fptrunc <2 x float> %370 to <2 x bfloat>, !dbg !53
+  %372 = bitcast <2 x bfloat> %332 to i32, !dbg !53
+  %373 = bitcast <2 x bfloat> %345 to i32, !dbg !53
+  %374 = bitcast <2 x bfloat> %358 to i32, !dbg !53
+  %375 = bitcast <2 x bfloat> %371 to i32, !dbg !53
+  tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %372, i32 %373, i32 %374, i32 %375, ptr addrspace(1) %319) #2, !dbg !53
+  %376 = getelementptr bfloat, ptr addrspace(1) %5, i64 %26, !dbg !54
+  %377 = fpext <2 x bfloat> %73 to <2 x float>, !dbg !55
+  %378 = fmul <2 x float> %323, %377, !dbg !56
+  %379 = insertelement <2 x float> poison, float %275, i64 0, !dbg !57
+  %380 = insertelement <2 x float> %379, float %315, i64 1, !dbg !57
+  %381 = fmul <2 x float> %380, %326, !dbg !57
+  %382 = fadd <2 x float> %378, %381, !dbg !58
+  %383 = fptrunc <2 x float> %382 to <2 x bfloat>, !dbg !59
+  %384 = fpext <2 x bfloat> %75 to <2 x float>, !dbg !55
+  %385 = fmul <2 x float> %336, %384, !dbg !56
+  %386 = insertelement <2 x float> poison, float %277, i64 0, !dbg !57
+  %387 = insertelement <2 x float> %386, float %316, i64 1, !dbg !57
+  %388 = fmul <2 x float> %387, %339, !dbg !57
+  %389 = fadd <2 x float> %385, %388, !dbg !58
+  %390 = fptrunc <2 x float> %389 to <2 x bfloat>, !dbg !59
+  %391 = fpext <2 x bfloat> %77 to <2 x float>, !dbg !55
+  %392 = fmul <2 x float> %349, %391, !dbg !56
+  %393 = insertelement <2 x float> poison, float %279, i64 0, !dbg !57
+  %394 = insertelement <2 x float> %393, float %317, i64 1, !dbg !57
+  %395 = fmul <2 x float> %394, %352, !dbg !57
+  %396 = fadd <2 x float> %392, %395, !dbg !58
+  %397 = fptrunc <2 x float> %396 to <2 x bfloat>, !dbg !59
+  %398 = fpext <2 x bfloat> %79 to <2 x float>, !dbg !55
+  %399 = fmul <2 x float> %362, %398, !dbg !56
+  %400 = insertelement <2 x float> poison, float %281, i64 0, !dbg !57
+  %401 = insertelement <2 x float> %400, float %318, i64 1, !dbg !57
+  %402 = fmul <2 x float> %401, %365, !dbg !57
+  %403 = fadd <2 x float> %399, %402, !dbg !58
+  %404 = fptrunc <2 x float> %403 to <2 x bfloat>, !dbg !59
+  %405 = bitcast <2 x bfloat> %383 to i32, !dbg !59
+  %406 = bitcast <2 x bfloat> %390 to i32, !dbg !59
+  %407 = bitcast <2 x bfloat> %397 to i32, !dbg !59
+  %408 = bitcast <2 x bfloat> %404 to i32, !dbg !59
+  tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %405, i32 %406, i32 %407, i32 %408, ptr addrspace(1) %376) #2, !dbg !59
+  ret void, !dbg !60
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3", linkageName: "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 20, column: 28, scope: !4)
+!8 = !DILocation(line: 20, column: 33, scope: !4)
+!9 = !DILocation(line: 21, column: 36, scope: !4)
+!10 = !DILocation(line: 21, column: 23, scope: !4)
+!11 = !DILocation(line: 26, column: 19, scope: !4)
+!12 = !DILocation(line: 24, column: 19, scope: !4)
+!13 = !DILocation(line: 25, column: 19, scope: !4)
+!14 = !DILocation(line: 27, column: 30, scope: !4)
+!15 = !DILocation(line: 27, column: 35, scope: !4)
+!16 = !DILocation(line: 28, column: 39, scope: !4)
+!17 = !DILocation(line: 28, column: 35, scope: !4)
+!18 = !DILocation(line: 28, column: 30, scope: !4)
+!19 = !DILocation(line: 28, column: 44, scope: !4)
+!20 = !DILocation(line: 29, column: 31, scope: !4)
+!21 = !DILocation(line: 29, column: 45, scope: !4)
+!22 = !DILocation(line: 30, column: 31, scope: !4)
+!23 = !DILocation(line: 30, column: 36, scope: !4)
+!24 = !DILocation(line: 33, column: 17, scope: !4)
+!25 = !DILocation(line: 37, column: 18, scope: !4)
+!26 = !DILocation(line: 38, column: 43, scope: !4)
+!27 = !DILocation(line: 38, column: 37, scope: !4)
+!28 = !DILocation(line: 38, column: 34, scope: !4)
+!29 = !DILocation(line: 38, column: 52, scope: !4)
+!30 = !DILocation(line: 38, column: 48, scope: !4)
+!31 = !DILocation(line: 38, column: 30, scope: !4)
+!32 = !DILocation(line: 38, column: 57, scope: !4)
+!33 = !DILocation(line: 38, column: 107, scope: !4)
+!34 = !DILocation(line: 39, column: 13, scope: !4)
+!35 = !DILocation(line: 42, column: 20, scope: !4)
+!36 = !DILocation(line: 45, column: 45, scope: !4)
+!37 = !DILocation(line: 45, column: 31, scope: !4)
+!38 = !DILocation(line: 45, column: 54, scope: !4)
+!39 = !DILocation(line: 45, column: 105, scope: !4)
+!40 = !DILocation(line: 0, scope: !4)
+!41 = !DILocation(line: 53, column: 31, scope: !4)
+!42 = !DILocation(line: 53, column: 58, scope: !4)
+!43 = !DILocation(line: 53, column: 108, scope: !4)
+!44 = !DILocation(line: 54, column: 13, scope: !4)
+!45 = !DILocation(line: 57, column: 31, scope: !4)
+!46 = !DILocation(line: 57, column: 54, scope: !4)
+!47 = !DILocation(line: 57, column: 105, scope: !4)
+!48 = !DILocation(line: 63, column: 25, scope: !4)
+!49 = !DILocation(line: 27, column: 44, scope: !4)
+!50 = !DILocation(line: 32, column: 18, scope: !4)
+!51 = !DILocation(line: 48, column: 20, scope: !4)
+!52 = !DILocation(line: 49, column: 19, scope: !4)
+!53 = !DILocation(line: 63, column: 37, scope: !4)
+!54 = !DILocation(line: 64, column: 25, scope: !4)
+!55 = !DILocation(line: 30, column: 45, scope: !4)
+!56 = !DILocation(line: 52, column: 20, scope: !4)
+!57 = !DILocation(line: 60, column: 20, scope: !4)
+!58 = !DILocation(line: 61, column: 20, scope: !4)
+!59 = !DILocation(line: 64, column: 37, scope: !4)
+!60 = !DILocation(line: 64, column: 4, scope: !4)
diff --git a/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ptx b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..a8d7f2e65b95579f07f8b739e37d4a36c6983108
--- /dev/null
+++ b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ptx
@@ -0,0 +1,971 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3 // -- Begin function triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3
+                                        // @triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3
+.visible .entry triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3(
+	.param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_1,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_4,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_5,
+	.param .u32 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_6,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_7,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_8
+)
+.reqntid 128
+{
+	.reg .pred 	%p<11>;
+	.reg .b16 	%rs<74>;
+	.reg .b32 	%r<208>;
+	.reg .b64 	%rd<99>;
+	.loc	1 18 0                          // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd75, [triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_0];
+	ld.param.b64 	%rd76, [triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_1];
+$L__tmp0:
+	.loc	1 20 28                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:20:28
+	mov.u32 	%r33, %ctaid.x;
+	.loc	1 20 33                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:20:33
+	shl.b32 	%r34, %r33, 10;
+	ld.param.b64 	%rd77, [triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_2];
+	ld.param.b64 	%rd78, [triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_3];
+	.loc	1 21 36                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:21:36
+	mov.u32 	%r35, %tid.x;
+	shl.b32 	%r36, %r35, 3;
+	ld.param.b64 	%rd79, [triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_4];
+	and.b32 	%r37, %r36, 1016;
+	ld.param.b64 	%rd80, [triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_5];
+	.loc	1 21 23                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:21:23
+	or.b32 	%r38, %r37, %r34;
+	.loc	1 21 36                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:21:36
+	or.b32 	%r39, %r34, %r36;
+	.loc	1 21 23                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:21:23
+	or.b32 	%r40, %r39, 2;
+	or.b32 	%r41, %r39, 4;
+	or.b32 	%r42, %r39, 6;
+	.loc	1 26 19                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:26:19
+	bfe.s32 	%r43, %r33, 21, 1;
+	.loc	1 24 19                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:24:19
+	shr.u32 	%r44, %r43, 25;
+	.loc	1 26 19                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:26:19
+	add.s32 	%r45, %r38, %r44;
+	.loc	1 24 19                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:24:19
+	and.b32 	%r46, %r45, -128;
+	sub.s32 	%r47, %r38, %r46;
+	add.s32 	%r48, %r40, %r44;
+	and.b32 	%r49, %r48, -128;
+	sub.s32 	%r50, %r40, %r49;
+	add.s32 	%r51, %r41, %r44;
+	and.b32 	%r52, %r51, -128;
+	sub.s32 	%r53, %r41, %r52;
+	add.s32 	%r54, %r42, %r44;
+	and.b32 	%r55, %r54, -128;
+	sub.s32 	%r56, %r42, %r55;
+	.loc	1 25 19                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:25:19
+	shr.u32 	%r57, %r43, 20;
+	add.s32 	%r58, %r38, %r57;
+	shr.s32 	%r59, %r58, 12;
+	.loc	1 27 30                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:27:30
+	mul.wide.s32 	%rd81, %r38, 2;
+	add.s64 	%rd1, %rd75, %rd81;
+	.loc	1 27 35                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:27:35
+	// begin inline asm
+	mov.u32 %r1, 0x0;
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	mov.u32 %r4, 0x0;
+	ld.global.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ];
+	// end inline asm
+	.loc	1 28 39                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:28:39
+	shl.b32 	%r60, %r59, 7;
+	.loc	1 28 35                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:28:35
+	add.s32 	%r61, %r60, %r47;
+	add.s32 	%r62, %r60, %r53;
+	.loc	1 28 30                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:28:30
+	mul.wide.s32 	%rd82, %r61, 4;
+	add.s64 	%rd2, %rd76, %rd82;
+	mul.wide.s32 	%rd83, %r62, 4;
+	add.s64 	%rd4, %rd76, %rd83;
+	.loc	1 28 44                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:28:44
+	// begin inline asm
+	mov.u64 %rd3, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd3, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r5, 0x0;
+	mov.u32 %r6, 0x0;
+	mov.u32 %r7, 0x0;
+	mov.u32 %r8, 0x0;
+	ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r5, %r6, %r7, %r8 }, [ %rd2 + 0 ], %rd3;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd5, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd5, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r9, 0x0;
+	mov.u32 %r10, 0x0;
+	mov.u32 %r11, 0x0;
+	mov.u32 %r12, 0x0;
+	ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r9, %r10, %r11, %r12 }, [ %rd4 + 0 ], %rd5;
+	// end inline asm
+	.loc	1 29 31                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:29:31
+	add.s64 	%rd6, %rd77, %rd82;
+	add.s64 	%rd8, %rd77, %rd83;
+	.loc	1 29 45                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:29:45
+	// begin inline asm
+	mov.u64 %rd7, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd7, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r13, 0x0;
+	mov.u32 %r14, 0x0;
+	mov.u32 %r15, 0x0;
+	mov.u32 %r16, 0x0;
+	ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r13, %r14, %r15, %r16 }, [ %rd6 + 0 ], %rd7;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd9, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd9, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r17, 0x0;
+	mov.u32 %r18, 0x0;
+	mov.u32 %r19, 0x0;
+	mov.u32 %r20, 0x0;
+	ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r17, %r18, %r19, %r20 }, [ %rd8 + 0 ], %rd9;
+	// end inline asm
+	.loc	1 30 31                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:30:31
+	add.s64 	%rd10, %rd78, %rd81;
+	.loc	1 30 36                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:30:36
+	// begin inline asm
+	mov.u32 %r21, 0x0;
+	mov.u32 %r22, 0x0;
+	mov.u32 %r23, 0x0;
+	mov.u32 %r24, 0x0;
+	ld.global.v4.b32 { %r21, %r22, %r23, %r24 }, [ %rd10 + 0 ];
+	// end inline asm
+	.loc	1 21 23                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:21:23
+	or.b32 	%r63, %r38, 7;
+	or.b32 	%r64, %r38, 5;
+	or.b32 	%r65, %r38, 3;
+	or.b32 	%r66, %r38, 1;
+	.loc	1 24 19                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:24:19
+	add.s32 	%r67, %r66, %r44;
+	and.b32 	%r68, %r67, 65408;
+	sub.s32 	%r69, %r66, %r68;
+	add.s32 	%r70, %r65, %r44;
+	and.b32 	%r71, %r70, 65408;
+	sub.s32 	%r72, %r65, %r71;
+	add.s32 	%r73, %r64, %r44;
+	and.b32 	%r74, %r73, 65408;
+	sub.s32 	%r75, %r64, %r74;
+	add.s32 	%r76, %r63, %r44;
+	and.b32 	%r77, %r76, 65408;
+	sub.s32 	%r78, %r63, %r77;
+	.loc	1 33 17                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:33:17
+	bfe.u32 	%r79, %r33, 21, 1;
+	add.s32 	%r80, %r66, %r79;
+	and.b32 	%r81, %r80, -6;
+	sub.s32 	%r82, %r66, %r81;
+	add.s32 	%r83, %r65, %r79;
+	and.b32 	%r84, %r83, -2;
+	sub.s32 	%r85, %r65, %r84;
+	add.s32 	%r86, %r64, %r79;
+	and.b32 	%r87, %r86, -2;
+	sub.s32 	%r88, %r64, %r87;
+	add.s32 	%r89, %r63, %r79;
+	and.b32 	%r90, %r89, -2;
+	sub.s32 	%r91, %r63, %r90;
+	.loc	1 37 18                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:37:18
+	setp.lt.s32 	%p10, %r91, 1;
+	setp.lt.s32 	%p9, %r88, 1;
+	setp.lt.s32 	%p8, %r85, 1;
+	setp.lt.s32 	%p7, %r82, 1;
+	.loc	1 38 43                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:38:43
+	cvt.u16.u32 	%rs34, %r69;
+	and.b16 	%rs35, %rs34, 128;
+	shr.u16 	%rs36, %rs35, 7;
+	add.s16 	%rs37, %rs34, %rs36;
+	cvt.s16.s8 	%rs38, %rs37;
+	shr.s16 	%rs39, %rs38, 1;
+	cvt.u16.u32 	%rs40, %r72;
+	and.b16 	%rs41, %rs40, 128;
+	shr.u16 	%rs42, %rs41, 7;
+	add.s16 	%rs43, %rs40, %rs42;
+	cvt.s16.s8 	%rs44, %rs43;
+	shr.s16 	%rs45, %rs44, 1;
+	cvt.u16.u32 	%rs46, %r75;
+	and.b16 	%rs47, %rs46, 128;
+	shr.u16 	%rs48, %rs47, 7;
+	add.s16 	%rs49, %rs46, %rs48;
+	cvt.s16.s8 	%rs50, %rs49;
+	shr.s16 	%rs51, %rs50, 1;
+	cvt.u16.u32 	%rs52, %r78;
+	and.b16 	%rs53, %rs52, 128;
+	shr.u16 	%rs54, %rs53, 7;
+	add.s16 	%rs55, %rs52, %rs54;
+	cvt.s16.s8 	%rs56, %rs55;
+	shr.s16 	%rs57, %rs56, 1;
+	.loc	1 38 37                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:38:37
+	mul.wide.s16 	%r92, %rs39, 2;
+	mul.wide.s16 	%r93, %rs45, 2;
+	mul.wide.s16 	%r94, %rs51, 2;
+	mul.wide.s16 	%r95, %rs57, 2;
+	.loc	1 38 34                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:38:34
+	or.b32 	%r96, %r46, 1;
+	.loc	1 38 48                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:38:48
+	add.s32 	%r97, %r96, %r92;
+	add.s32 	%r98, %r46, %r50;
+	or.b32 	%r99, %r98, 1;
+	add.s32 	%r100, %r96, %r93;
+	add.s32 	%r101, %r46, %r53;
+	or.b32 	%r102, %r101, 1;
+	add.s32 	%r103, %r96, %r94;
+	add.s32 	%r104, %r46, %r56;
+	or.b32 	%r105, %r104, 1;
+	add.s32 	%r106, %r96, %r95;
+	.loc	1 38 30                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:38:30
+	mul.wide.s32 	%rd84, %r66, 2;
+	add.s64 	%rd11, %rd75, %rd84;
+	mul.wide.s32 	%rd85, %r97, 2;
+	add.s64 	%rd13, %rd75, %rd85;
+	mul.wide.s32 	%rd86, %r99, 2;
+	add.s64 	%rd15, %rd75, %rd86;
+	mul.wide.s32 	%rd87, %r100, 2;
+	add.s64 	%rd17, %rd75, %rd87;
+	mul.wide.s32 	%rd88, %r102, 2;
+	add.s64 	%rd19, %rd75, %rd88;
+	mul.wide.s32 	%rd89, %r103, 2;
+	add.s64 	%rd21, %rd75, %rd89;
+	mul.wide.s32 	%rd90, %r105, 2;
+	add.s64 	%rd23, %rd75, %rd90;
+	mul.wide.s32 	%rd91, %r106, 2;
+	add.s64 	%rd25, %rd75, %rd91;
+	.loc	1 38 57                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:38:57
+	// begin inline asm
+	mov.u64 %rd12, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd12, 1.0;
+	// end inline asm
+	mov.b16 	%rs2, 0;
+	mov.pred 	%p1, -1;
+	// begin inline asm
+	mov.u16 %rs1, %rs2;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs1 }, [ %rd11 + 0 ], %rd12;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd14, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd14, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs3, %rs2;
+	@%p7 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs3 }, [ %rd13 + 0 ], %rd14;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd16, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd16, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs4, %rs2;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs4 }, [ %rd15 + 0 ], %rd16;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd18, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd18, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs5, %rs2;
+	@%p8 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs5 }, [ %rd17 + 0 ], %rd18;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd20, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd20, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs6, %rs2;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs6 }, [ %rd19 + 0 ], %rd20;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd22, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd22, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs7, %rs2;
+	@%p9 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs7 }, [ %rd21 + 0 ], %rd22;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd24, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd24, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs8, %rs2;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs8 }, [ %rd23 + 0 ], %rd24;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd26, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd26, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs9, %rs2;
+	@%p10 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd25 + 0 ], %rd26;
+	// end inline asm
+	.loc	1 38 107                        // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:38:107
+	cvt.f32.bf16 	%r107, %rs1;
+	cvt.f32.bf16 	%r108, %rs3;
+	cvt.f32.bf16 	%r109, %rs4;
+	cvt.f32.bf16 	%r110, %rs5;
+	cvt.f32.bf16 	%r111, %rs6;
+	cvt.f32.bf16 	%r112, %rs7;
+	cvt.f32.bf16 	%r113, %rs8;
+	cvt.f32.bf16 	%r114, %rs9;
+	mov.b32 	%r115, 0f00000000;
+	.loc	1 39 13                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:39:13
+	sub.f32 	%r116, %r115, %r107;
+	sub.f32 	%r117, %r115, %r108;
+	sub.f32 	%r118, %r115, %r109;
+	sub.f32 	%r119, %r115, %r110;
+	sub.f32 	%r120, %r115, %r111;
+	sub.f32 	%r121, %r115, %r112;
+	sub.f32 	%r122, %r115, %r113;
+	sub.f32 	%r123, %r115, %r114;
+	.loc	1 42 20                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:42:20
+	setp.gt.s32 	%p3, %r82, 0;
+	setp.gt.s32 	%p4, %r85, 0;
+	setp.gt.s32 	%p5, %r88, 0;
+	setp.gt.s32 	%p6, %r91, 0;
+	.loc	1 45 45                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:45:45
+	add.s32 	%r124, %r92, %r46;
+	add.s32 	%r125, %r93, %r46;
+	add.s32 	%r126, %r94, %r46;
+	add.s32 	%r127, %r95, %r46;
+	.loc	1 45 31                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:45:31
+	mul.wide.s32 	%rd92, %r124, 2;
+	add.s64 	%rd28, %rd75, %rd92;
+	mul.wide.s32 	%rd93, %r98, 2;
+	add.s64 	%rd30, %rd75, %rd93;
+	mul.wide.s32 	%rd94, %r125, 2;
+	add.s64 	%rd32, %rd75, %rd94;
+	mul.wide.s32 	%rd95, %r101, 2;
+	add.s64 	%rd34, %rd75, %rd95;
+	mul.wide.s32 	%rd96, %r126, 2;
+	add.s64 	%rd36, %rd75, %rd96;
+	mul.wide.s32 	%rd97, %r104, 2;
+	add.s64 	%rd38, %rd75, %rd97;
+	mul.wide.s32 	%rd98, %r127, 2;
+	add.s64 	%rd40, %rd75, %rd98;
+	.loc	1 45 54                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:45:54
+	// begin inline asm
+	mov.u64 %rd27, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd27, 1.0;
+	// end inline asm
+	mov.pred 	%p2, 0;
+	// begin inline asm
+	mov.u16 %rs10, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs10 }, [ %rd1 + 0 ], %rd27;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd29, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd29, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs11, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd28 + 0 ], %rd29;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd31, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd31, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs12, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd30 + 0 ], %rd31;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd33, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd33, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs13, %rs2;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd32 + 0 ], %rd33;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd35, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd35, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs14, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs14 }, [ %rd34 + 0 ], %rd35;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd37, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd37, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs15, %rs2;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd36 + 0 ], %rd37;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd39, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd39, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs16, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs16 }, [ %rd38 + 0 ], %rd39;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd41, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd41, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs17, %rs2;
+	@%p6 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd40 + 0 ], %rd41;
+	// end inline asm
+	.loc	1 45 105                        // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:45:105
+	cvt.f32.bf16 	%r128, %rs11;
+	cvt.f32.bf16 	%r129, %rs13;
+	cvt.f32.bf16 	%r130, %rs15;
+	cvt.f32.bf16 	%r131, %rs17;
+	.loc	1 0 0                           // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:0
+	selp.f32 	%r132, %r117, %r128, %p7;
+	selp.f32 	%r133, %r119, %r129, %p8;
+	selp.f32 	%r134, %r121, %r130, %p9;
+	selp.f32 	%r135, %r123, %r131, %p10;
+	.loc	1 53 31                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:53:31
+	add.s64 	%rd42, %rd78, %rd84;
+	add.s64 	%rd44, %rd78, %rd85;
+	add.s64 	%rd46, %rd78, %rd86;
+	add.s64 	%rd48, %rd78, %rd87;
+	add.s64 	%rd50, %rd78, %rd88;
+	add.s64 	%rd52, %rd78, %rd89;
+	add.s64 	%rd54, %rd78, %rd90;
+	add.s64 	%rd56, %rd78, %rd91;
+	.loc	1 53 58                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:53:58
+	// begin inline asm
+	mov.u64 %rd43, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd43, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs18, %rs2;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs18 }, [ %rd42 + 0 ], %rd43;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd45, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd45, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs19, %rs2;
+	@%p7 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd44 + 0 ], %rd45;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd47, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd47, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs20, %rs2;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs20 }, [ %rd46 + 0 ], %rd47;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd49, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd49, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs21, %rs2;
+	@%p8 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs21 }, [ %rd48 + 0 ], %rd49;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd51, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd51, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs22, %rs2;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs22 }, [ %rd50 + 0 ], %rd51;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd53, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd53, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs23, %rs2;
+	@%p9 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs23 }, [ %rd52 + 0 ], %rd53;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd55, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd55, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs24, %rs2;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs24 }, [ %rd54 + 0 ], %rd55;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd57, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd57, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs25, %rs2;
+	@%p10 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs25 }, [ %rd56 + 0 ], %rd57;
+	// end inline asm
+	.loc	1 53 108                        // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:53:108
+	cvt.f32.bf16 	%r136, %rs18;
+	cvt.f32.bf16 	%r137, %rs19;
+	cvt.f32.bf16 	%r138, %rs20;
+	cvt.f32.bf16 	%r139, %rs21;
+	cvt.f32.bf16 	%r140, %rs22;
+	cvt.f32.bf16 	%r141, %rs23;
+	cvt.f32.bf16 	%r142, %rs24;
+	cvt.f32.bf16 	%r143, %rs25;
+	.loc	1 54 13                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:54:13
+	sub.f32 	%r144, %r115, %r136;
+	sub.f32 	%r145, %r115, %r137;
+	sub.f32 	%r146, %r115, %r138;
+	sub.f32 	%r147, %r115, %r139;
+	sub.f32 	%r148, %r115, %r140;
+	sub.f32 	%r149, %r115, %r141;
+	sub.f32 	%r150, %r115, %r142;
+	sub.f32 	%r151, %r115, %r143;
+	.loc	1 57 31                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:57:31
+	add.s64 	%rd59, %rd78, %rd92;
+	add.s64 	%rd61, %rd78, %rd93;
+	add.s64 	%rd63, %rd78, %rd94;
+	add.s64 	%rd65, %rd78, %rd95;
+	add.s64 	%rd67, %rd78, %rd96;
+	add.s64 	%rd69, %rd78, %rd97;
+	add.s64 	%rd71, %rd78, %rd98;
+	.loc	1 57 54                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:57:54
+	// begin inline asm
+	mov.u64 %rd58, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd58, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs26, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs26 }, [ %rd10 + 0 ], %rd58;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd60, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd60, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs27, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs27 }, [ %rd59 + 0 ], %rd60;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd62, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd62, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs28, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs28 }, [ %rd61 + 0 ], %rd62;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd64, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd64, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs29, %rs2;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs29 }, [ %rd63 + 0 ], %rd64;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd66, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd66, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs30, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs30 }, [ %rd65 + 0 ], %rd66;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd68, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd68, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs31, %rs2;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs31 }, [ %rd67 + 0 ], %rd68;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd70, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd70, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs32, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs32 }, [ %rd69 + 0 ], %rd70;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd72, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd72, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs33, %rs2;
+	@%p6 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs33 }, [ %rd71 + 0 ], %rd72;
+	// end inline asm
+	.loc	1 57 105                        // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:57:105
+	cvt.f32.bf16 	%r152, %rs27;
+	cvt.f32.bf16 	%r153, %rs29;
+	cvt.f32.bf16 	%r154, %rs31;
+	cvt.f32.bf16 	%r155, %rs33;
+	.loc	1 0 0                           // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:0
+	selp.f32 	%r156, %r145, %r152, %p7;
+	selp.f32 	%r157, %r147, %r153, %p8;
+	selp.f32 	%r158, %r149, %r154, %p9;
+	selp.f32 	%r159, %r151, %r155, %p10;
+	.loc	1 63 25                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:63:25
+	add.s64 	%rd73, %rd79, %rd81;
+	.loc	1 27 44                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:27:44
+	mov.b32 	{%rs58, %rs59}, %r1;
+	cvt.f32.bf16 	%r160, %rs58;
+	cvt.f32.bf16 	%r161, %rs59;
+	.loc	1 48 20                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:48:20
+	mul.f32 	%r162, %r132, %r14;
+	mul.f32 	%r163, %r116, %r13;
+	.loc	1 49 19                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:49:19
+	fma.rn.f32 	%r164, %r161, %r6, %r162;
+	fma.rn.f32 	%r165, %r160, %r5, %r163;
+	.loc	1 63 37                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:63:37
+	cvt.rn.bf16x2.f32 	%r25, %r164, %r165;
+	.loc	1 27 44                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:27:44
+	mov.b32 	{%rs60, %rs61}, %r2;
+	cvt.f32.bf16 	%r166, %rs60;
+	cvt.f32.bf16 	%r167, %rs61;
+	.loc	1 48 20                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:48:20
+	mul.f32 	%r168, %r133, %r16;
+	mul.f32 	%r169, %r118, %r15;
+	.loc	1 49 19                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:49:19
+	fma.rn.f32 	%r170, %r167, %r8, %r168;
+	fma.rn.f32 	%r171, %r166, %r7, %r169;
+	.loc	1 63 37                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:63:37
+	cvt.rn.bf16x2.f32 	%r26, %r170, %r171;
+	.loc	1 27 44                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:27:44
+	mov.b32 	{%rs62, %rs63}, %r3;
+	cvt.f32.bf16 	%r172, %rs62;
+	cvt.f32.bf16 	%r173, %rs63;
+	.loc	1 48 20                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:48:20
+	mul.f32 	%r174, %r134, %r18;
+	mul.f32 	%r175, %r120, %r17;
+	.loc	1 49 19                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:49:19
+	fma.rn.f32 	%r176, %r173, %r10, %r174;
+	fma.rn.f32 	%r177, %r172, %r9, %r175;
+	.loc	1 63 37                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:63:37
+	cvt.rn.bf16x2.f32 	%r27, %r176, %r177;
+	.loc	1 27 44                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:27:44
+	mov.b32 	{%rs64, %rs65}, %r4;
+	cvt.f32.bf16 	%r178, %rs64;
+	cvt.f32.bf16 	%r179, %rs65;
+	.loc	1 48 20                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:48:20
+	mul.f32 	%r180, %r135, %r20;
+	mul.f32 	%r181, %r122, %r19;
+	.loc	1 49 19                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:49:19
+	fma.rn.f32 	%r182, %r179, %r12, %r180;
+	fma.rn.f32 	%r183, %r178, %r11, %r181;
+	.loc	1 63 37                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:63:37
+	cvt.rn.bf16x2.f32 	%r28, %r182, %r183;
+	// begin inline asm
+	st.global.v4.b32 [ %rd73 + 0 ], { %r25, %r26, %r27, %r28 };
+	// end inline asm
+	.loc	1 64 25                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:64:25
+	add.s64 	%rd74, %rd80, %rd81;
+	.loc	1 30 45                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:30:45
+	mov.b32 	{%rs66, %rs67}, %r21;
+	cvt.f32.bf16 	%r184, %rs66;
+	cvt.f32.bf16 	%r185, %rs67;
+	.loc	1 60 20                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:60:20
+	mul.f32 	%r186, %r156, %r14;
+	mul.f32 	%r187, %r144, %r13;
+	.loc	1 61 20                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:61:20
+	fma.rn.f32 	%r188, %r6, %r185, %r186;
+	fma.rn.f32 	%r189, %r5, %r184, %r187;
+	.loc	1 64 37                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:64:37
+	cvt.rn.bf16x2.f32 	%r29, %r188, %r189;
+	.loc	1 30 45                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:30:45
+	mov.b32 	{%rs68, %rs69}, %r22;
+	cvt.f32.bf16 	%r190, %rs68;
+	cvt.f32.bf16 	%r191, %rs69;
+	.loc	1 60 20                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:60:20
+	mul.f32 	%r192, %r157, %r16;
+	mul.f32 	%r193, %r146, %r15;
+	.loc	1 61 20                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:61:20
+	fma.rn.f32 	%r194, %r8, %r191, %r192;
+	fma.rn.f32 	%r195, %r7, %r190, %r193;
+	.loc	1 64 37                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:64:37
+	cvt.rn.bf16x2.f32 	%r30, %r194, %r195;
+	.loc	1 30 45                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:30:45
+	mov.b32 	{%rs70, %rs71}, %r23;
+	cvt.f32.bf16 	%r196, %rs70;
+	cvt.f32.bf16 	%r197, %rs71;
+	.loc	1 60 20                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:60:20
+	mul.f32 	%r198, %r158, %r18;
+	mul.f32 	%r199, %r148, %r17;
+	.loc	1 61 20                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:61:20
+	fma.rn.f32 	%r200, %r10, %r197, %r198;
+	fma.rn.f32 	%r201, %r9, %r196, %r199;
+	.loc	1 64 37                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:64:37
+	cvt.rn.bf16x2.f32 	%r31, %r200, %r201;
+	.loc	1 30 45                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:30:45
+	mov.b32 	{%rs72, %rs73}, %r24;
+	cvt.f32.bf16 	%r202, %rs72;
+	cvt.f32.bf16 	%r203, %rs73;
+	.loc	1 60 20                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:60:20
+	mul.f32 	%r204, %r159, %r20;
+	mul.f32 	%r205, %r150, %r19;
+	.loc	1 61 20                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:61:20
+	fma.rn.f32 	%r206, %r12, %r203, %r204;
+	fma.rn.f32 	%r207, %r11, %r202, %r205;
+	.loc	1 64 37                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:64:37
+	cvt.rn.bf16x2.f32 	%r32, %r206, %r207;
+	// begin inline asm
+	st.global.v4.b32 [ %rd74 + 0 ], { %r29, %r30, %r31, %r32 };
+	// end inline asm
+	.loc	1 64 4                          // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:64:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 106
+.b8 54
+.b8 54
+.b8 116
+.b8 103
+.b8 98
+.b8 102
+.b8 113
+.b8 120
+.b8 55
+.b8 114
+.b8 104
+.b8 121
+.b8 116
+.b8 99
+.b8 121
+.b8 119
+.b8 109
+.b8 106
+.b8 100
+.b8 99
+.b8 105
+.b8 109
+.b8 110
+.b8 119
+.b8 119
+.b8 116
+.b8 113
+.b8 54
+.b8 120
+.b8 106
+.b8 103
+.b8 98
+.b8 50
+.b8 113
+.b8 98
+.b8 113
+.b8 98
+.b8 120
+.b8 120
+.b8 111
+.b8 110
+.b8 97
+.b8 108
+.b8 100
+.b8 111
+.b8 116
+.b8 120
+.b8 54
+.b8 51
+.b8 118
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 106
+.b8 54
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.source b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.source
new file mode 100644
index 0000000000000000000000000000000000000000..8cc7749d0e4718068c7d84bb8b1835729b30272c
--- /dev/null
+++ b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.source
@@ -0,0 +1,352 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":18:0)
+#loc81 = loc("in_ptr0"(#loc))
+#loc82 = loc("in_ptr1"(#loc))
+#loc83 = loc("in_ptr2"(#loc))
+#loc84 = loc("in_ptr3"(#loc))
+#loc85 = loc("out_ptr0"(#loc))
+#loc86 = loc("out_ptr1"(#loc))
+#loc87 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 9437184 : i32 loc(#loc88)
+    %xoffset = tt.get_program_id x : i32 loc(#loc89)
+    %xoffset_1 = arith.constant 1024 : i32 loc(#loc90)
+    %xoffset_2 = arith.constant 1024 : i32 loc(#loc90)
+    %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc90)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc91)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc92)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc92)
+    %xmask = arith.constant true loc(#loc93)
+    %xmask_6 = arith.constant dense<true> : tensor<1024xi1> loc(#loc93)
+    %x0 = arith.constant 128 : i32 loc(#loc94)
+    %x0_7 = arith.constant 128 : i32 loc(#loc94)
+    %x0_8 = arith.constant dense<128> : tensor<1024xi32> loc(#loc94)
+    %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<1024xi32> loc(#loc94)
+    %x2 = arith.constant 4096 : i32 loc(#loc95)
+    %x2_10 = arith.constant 4096 : i32 loc(#loc95)
+    %x2_11 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc95)
+    %x2_12 = arith.divsi %xindex_5, %x2_11 : tensor<1024xi32> loc(#loc95)
+    %x4 = arith.constant 128 : i32 loc(#loc96)
+    %x4_13 = arith.constant 128 : i32 loc(#loc96)
+    %x4_14 = arith.constant dense<128> : tensor<1024xi32> loc(#loc96)
+    %x4_15 = arith.divsi %xindex_5, %x4_14 : tensor<1024xi32> loc(#loc96)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc97)
+    %tmp0_16 = tt.addptr %tmp0, %xindex_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc97)
+    %tmp0_17 = tt.load %tmp0_16 : tensor<1024x!tt.ptr<bf16>> loc(#loc98)
+    %tmp0_18 = arith.extf %tmp0_17 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc99)
+    %tmp2 = arith.constant 128 : i32 loc(#loc100)
+    %tmp2_19 = arith.constant 128 : i32 loc(#loc100)
+    %tmp2_20 = arith.constant dense<128> : tensor<1024xi32> loc(#loc100)
+    %tmp2_21 = arith.muli %tmp2_20, %x2_12 : tensor<1024xi32> loc(#loc100)
+    %tmp2_22 = arith.addi %x0_9, %tmp2_21 : tensor<1024xi32> loc(#loc101)
+    %tmp2_23 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>> loc(#loc102)
+    %tmp2_24 = tt.addptr %tmp2_23, %tmp2_22 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32> loc(#loc102)
+    %tmp2_25 = tt.load %tmp2_24 evictionPolicy = evict_last : tensor<1024x!tt.ptr<f32>> loc(#loc103)
+    %tmp19 = arith.constant 128 : i32 loc(#loc104)
+    %tmp19_26 = arith.constant 128 : i32 loc(#loc104)
+    %tmp19_27 = arith.constant dense<128> : tensor<1024xi32> loc(#loc104)
+    %tmp19_28 = arith.muli %tmp19_27, %x2_12 : tensor<1024xi32> loc(#loc104)
+    %tmp19_29 = arith.addi %x0_9, %tmp19_28 : tensor<1024xi32> loc(#loc105)
+    %tmp19_30 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>> loc(#loc106)
+    %tmp19_31 = tt.addptr %tmp19_30, %tmp19_29 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32> loc(#loc106)
+    %tmp19_32 = tt.load %tmp19_31 evictionPolicy = evict_last : tensor<1024x!tt.ptr<f32>> loc(#loc107)
+    %tmp23 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc108)
+    %tmp23_33 = tt.addptr %tmp23, %xindex_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc108)
+    %tmp23_34 = tt.load %tmp23_33 : tensor<1024x!tt.ptr<bf16>> loc(#loc109)
+    %tmp23_35 = arith.extf %tmp23_34 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc110)
+    %tmp3 = arith.mulf %tmp0_18, %tmp2_25 : tensor<1024xf32> loc(#loc111)
+    %tmp4 = arith.constant 2 : i32 loc(#loc112)
+    %tmp4_36 = arith.constant 2 : i32 loc(#loc112)
+    %tmp4_37 = arith.constant dense<2> : tensor<1024xi32> loc(#loc112)
+    %tmp4_38 = arith.remsi %xindex_5, %tmp4_37 : tensor<1024xi32> loc(#loc112)
+    %tmp5 = arith.constant 0 : i64 loc(#loc113)
+    %tmp5_39 = arith.constant dense<0> : tensor<1xi64> loc(#loc113)
+    %tmp6 = arith.extsi %tmp4_38 : tensor<1024xi32> to tensor<1024xi64> loc(#loc114)
+    %tmp6_40 = arith.constant dense<0> : tensor<1024xi64> loc(#loc114)
+    %tmp6_41 = arith.cmpi sge, %tmp6, %tmp6_40 : tensor<1024xi64> loc(#loc114)
+    %tmp7 = arith.constant 1 : i64 loc(#loc115)
+    %tmp7_42 = arith.constant dense<1> : tensor<1xi64> loc(#loc115)
+    %tmp8 = arith.extsi %tmp4_38 : tensor<1024xi32> to tensor<1024xi64> loc(#loc116)
+    %tmp8_43 = arith.constant dense<1> : tensor<1024xi64> loc(#loc116)
+    %tmp8_44 = arith.cmpi slt, %tmp8, %tmp8_43 : tensor<1024xi64> loc(#loc116)
+    %tmp9 = arith.constant 2 : i32 loc(#loc117)
+    %tmp9_45 = arith.constant 2 : i32 loc(#loc117)
+    %tmp9_46 = arith.constant dense<2> : tensor<1024xi32> loc(#loc117)
+    %tmp9_47 = arith.divsi %x0_9, %tmp9_46 : tensor<1024xi32> loc(#loc117)
+    %tmp9_48 = arith.constant 2 : i32 loc(#loc118)
+    %tmp9_49 = arith.constant 2 : i32 loc(#loc118)
+    %tmp9_50 = arith.constant dense<2> : tensor<1024xi32> loc(#loc118)
+    %tmp9_51 = arith.muli %tmp9_50, %tmp9_47 : tensor<1024xi32> loc(#loc118)
+    %tmp9_52 = arith.constant 1 : i32 loc(#loc119)
+    %tmp9_53 = arith.constant 1 : i32 loc(#loc119)
+    %tmp9_54 = arith.constant dense<1> : tensor<1024xi32> loc(#loc119)
+    %tmp9_55 = arith.addi %tmp9_54, %tmp9_51 : tensor<1024xi32> loc(#loc119)
+    %tmp9_56 = arith.constant 128 : i32 loc(#loc120)
+    %tmp9_57 = arith.constant 128 : i32 loc(#loc120)
+    %tmp9_58 = arith.constant dense<128> : tensor<1024xi32> loc(#loc120)
+    %tmp9_59 = arith.muli %tmp9_58, %x4_15 : tensor<1024xi32> loc(#loc120)
+    %tmp9_60 = arith.addi %tmp9_55, %tmp9_59 : tensor<1024xi32> loc(#loc121)
+    %tmp9_61 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc122)
+    %tmp9_62 = tt.addptr %tmp9_61, %tmp9_60 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc122)
+    %tmp9_63 = arith.constant 0.000000e+00 : f32 loc(#loc123)
+    %tmp9_64 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc123)
+    %tmp9_65 = arith.truncf %tmp9_64 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc123)
+    %tmp9_66 = tt.load %tmp9_62, %tmp8_44, %tmp9_65 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc123)
+    %tmp9_67 = arith.extf %tmp9_66 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc124)
+    %tmp10 = arith.constant 0.000000e+00 : f32 loc(#loc125)
+    %tmp10_68 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc125)
+    %tmp10_69 = arith.subf %tmp10_68, %tmp9_67 : tensor<1024xf32> loc(#loc125)
+    %tmp11 = arith.constant 0.000000e+00 : f32 loc(#loc126)
+    %tmp11_70 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc126)
+    %tmp12 = arith.select %tmp8_44, %tmp10_69, %tmp11_70 : tensor<1024xi1>, tensor<1024xf32> loc(#loc127)
+    %tmp13 = arith.extsi %tmp4_38 : tensor<1024xi32> to tensor<1024xi64> loc(#loc128)
+    %tmp13_71 = arith.constant dense<1> : tensor<1024xi64> loc(#loc128)
+    %tmp13_72 = arith.cmpi sge, %tmp13, %tmp13_71 : tensor<1024xi64> loc(#loc128)
+    %tmp14 = arith.constant 2 : i64 loc(#loc129)
+    %tmp14_73 = arith.constant dense<2> : tensor<1xi64> loc(#loc129)
+    %tmp15 = arith.extsi %tmp4_38 : tensor<1024xi32> to tensor<1024xi64> loc(#loc130)
+    %tmp15_74 = arith.constant dense<2> : tensor<1024xi64> loc(#loc130)
+    %tmp15_75 = arith.cmpi slt, %tmp15, %tmp15_74 : tensor<1024xi64> loc(#loc130)
+    %tmp16 = arith.constant 2 : i32 loc(#loc131)
+    %tmp16_76 = arith.constant 2 : i32 loc(#loc131)
+    %tmp16_77 = arith.constant dense<2> : tensor<1024xi32> loc(#loc131)
+    %tmp16_78 = arith.divsi %x0_9, %tmp16_77 : tensor<1024xi32> loc(#loc131)
+    %tmp16_79 = arith.constant 2 : i32 loc(#loc132)
+    %tmp16_80 = arith.constant 2 : i32 loc(#loc132)
+    %tmp16_81 = arith.constant dense<2> : tensor<1024xi32> loc(#loc132)
+    %tmp16_82 = arith.muli %tmp16_81, %tmp16_78 : tensor<1024xi32> loc(#loc132)
+    %tmp16_83 = arith.constant 128 : i32 loc(#loc133)
+    %tmp16_84 = arith.constant 128 : i32 loc(#loc133)
+    %tmp16_85 = arith.constant dense<128> : tensor<1024xi32> loc(#loc133)
+    %tmp16_86 = arith.muli %tmp16_85, %x4_15 : tensor<1024xi32> loc(#loc133)
+    %tmp16_87 = arith.addi %tmp16_82, %tmp16_86 : tensor<1024xi32> loc(#loc134)
+    %tmp16_88 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc135)
+    %tmp16_89 = tt.addptr %tmp16_88, %tmp16_87 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc135)
+    %tmp16_90 = arith.constant 0.000000e+00 : f32 loc(#loc136)
+    %tmp16_91 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc136)
+    %tmp16_92 = arith.truncf %tmp16_91 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc136)
+    %tmp16_93 = tt.load %tmp16_89, %tmp13_72, %tmp16_92 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc136)
+    %tmp16_94 = arith.extf %tmp16_93 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc137)
+    %tmp17 = arith.select %tmp8_44, %tmp12, %tmp16_94 : tensor<1024xi1>, tensor<1024xf32> loc(#loc138)
+    %tmp20 = arith.mulf %tmp17, %tmp19_32 : tensor<1024xf32> loc(#loc139)
+    %tmp21 = arith.addf %tmp3, %tmp20 : tensor<1024xf32> loc(#loc140)
+    %tmp25 = arith.mulf %tmp23_35, %tmp2_25 : tensor<1024xf32> loc(#loc141)
+    %tmp26 = arith.constant 2 : i32 loc(#loc142)
+    %tmp26_95 = arith.constant 2 : i32 loc(#loc142)
+    %tmp26_96 = arith.constant dense<2> : tensor<1024xi32> loc(#loc142)
+    %tmp26_97 = arith.divsi %x0_9, %tmp26_96 : tensor<1024xi32> loc(#loc142)
+    %tmp26_98 = arith.constant 2 : i32 loc(#loc143)
+    %tmp26_99 = arith.constant 2 : i32 loc(#loc143)
+    %tmp26_100 = arith.constant dense<2> : tensor<1024xi32> loc(#loc143)
+    %tmp26_101 = arith.muli %tmp26_100, %tmp26_97 : tensor<1024xi32> loc(#loc143)
+    %tmp26_102 = arith.constant 1 : i32 loc(#loc144)
+    %tmp26_103 = arith.constant 1 : i32 loc(#loc144)
+    %tmp26_104 = arith.constant dense<1> : tensor<1024xi32> loc(#loc144)
+    %tmp26_105 = arith.addi %tmp26_104, %tmp26_101 : tensor<1024xi32> loc(#loc144)
+    %tmp26_106 = arith.constant 128 : i32 loc(#loc145)
+    %tmp26_107 = arith.constant 128 : i32 loc(#loc145)
+    %tmp26_108 = arith.constant dense<128> : tensor<1024xi32> loc(#loc145)
+    %tmp26_109 = arith.muli %tmp26_108, %x4_15 : tensor<1024xi32> loc(#loc145)
+    %tmp26_110 = arith.addi %tmp26_105, %tmp26_109 : tensor<1024xi32> loc(#loc146)
+    %tmp26_111 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc147)
+    %tmp26_112 = tt.addptr %tmp26_111, %tmp26_110 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc147)
+    %tmp26_113 = arith.constant 0.000000e+00 : f32 loc(#loc148)
+    %tmp26_114 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc148)
+    %tmp26_115 = arith.truncf %tmp26_114 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc148)
+    %tmp26_116 = tt.load %tmp26_112, %tmp8_44, %tmp26_115 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc148)
+    %tmp26_117 = arith.extf %tmp26_116 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc149)
+    %tmp27 = arith.constant 0.000000e+00 : f32 loc(#loc150)
+    %tmp27_118 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc150)
+    %tmp27_119 = arith.subf %tmp27_118, %tmp26_117 : tensor<1024xf32> loc(#loc150)
+    %tmp28 = arith.constant 0.000000e+00 : f32 loc(#loc151)
+    %tmp28_120 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc151)
+    %tmp29 = arith.select %tmp8_44, %tmp27_119, %tmp28_120 : tensor<1024xi1>, tensor<1024xf32> loc(#loc152)
+    %tmp30 = arith.constant 2 : i32 loc(#loc153)
+    %tmp30_121 = arith.constant 2 : i32 loc(#loc153)
+    %tmp30_122 = arith.constant dense<2> : tensor<1024xi32> loc(#loc153)
+    %tmp30_123 = arith.divsi %x0_9, %tmp30_122 : tensor<1024xi32> loc(#loc153)
+    %tmp30_124 = arith.constant 2 : i32 loc(#loc154)
+    %tmp30_125 = arith.constant 2 : i32 loc(#loc154)
+    %tmp30_126 = arith.constant dense<2> : tensor<1024xi32> loc(#loc154)
+    %tmp30_127 = arith.muli %tmp30_126, %tmp30_123 : tensor<1024xi32> loc(#loc154)
+    %tmp30_128 = arith.constant 128 : i32 loc(#loc155)
+    %tmp30_129 = arith.constant 128 : i32 loc(#loc155)
+    %tmp30_130 = arith.constant dense<128> : tensor<1024xi32> loc(#loc155)
+    %tmp30_131 = arith.muli %tmp30_130, %x4_15 : tensor<1024xi32> loc(#loc155)
+    %tmp30_132 = arith.addi %tmp30_127, %tmp30_131 : tensor<1024xi32> loc(#loc156)
+    %tmp30_133 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc157)
+    %tmp30_134 = tt.addptr %tmp30_133, %tmp30_132 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc157)
+    %tmp30_135 = arith.constant 0.000000e+00 : f32 loc(#loc158)
+    %tmp30_136 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc158)
+    %tmp30_137 = arith.truncf %tmp30_136 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc158)
+    %tmp30_138 = tt.load %tmp30_134, %tmp13_72, %tmp30_137 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc158)
+    %tmp30_139 = arith.extf %tmp30_138 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc159)
+    %tmp31 = arith.select %tmp8_44, %tmp29, %tmp30_139 : tensor<1024xi1>, tensor<1024xf32> loc(#loc160)
+    %tmp33 = arith.mulf %tmp31, %tmp19_32 : tensor<1024xf32> loc(#loc161)
+    %tmp34 = arith.addf %tmp25, %tmp33 : tensor<1024xf32> loc(#loc162)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc76)
+    %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc76)
+    %2 = arith.truncf %tmp21 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc77)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>> loc(#loc77)
+    %3 = tt.splat %out_ptr1 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc78)
+    %4 = tt.addptr %3, %xindex_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc78)
+    %5 = arith.truncf %tmp34 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc79)
+    tt.store %4, %5 : tensor<1024x!tt.ptr<bf16>> loc(#loc79)
+    tt.return loc(#loc80)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":22:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":24:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":25:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":26:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:30)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:44)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:39)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:35)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:30)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:44)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:40)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:36)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:31)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:45)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:31)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:36)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:45)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":32:18)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":33:17)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":34:27)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":35:19)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":36:27)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":37:18)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:43)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:37)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:34)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:52)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:48)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:30)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:57)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:107)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":39:13)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":40:38)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":41:34)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":42:20)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":43:28)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":44:19)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:40)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:34)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:49)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:45)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:31)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:54)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:105)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":46:34)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":48:20)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":49:19)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":52:20)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:44)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:38)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:35)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:53)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:49)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:31)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:58)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:108)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":54:13)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":55:38)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":56:34)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:40)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:34)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:49)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:45)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:31)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:54)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:105)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":58:34)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":60:20)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":61:20)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":63:25)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":63:37)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:25)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:37)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:4)
+#loc88 = loc("xnumel"(#loc1))
+#loc89 = loc("xoffset"(#loc2))
+#loc90 = loc("xoffset"(#loc3))
+#loc91 = loc("xindex"(#loc4))
+#loc92 = loc("xindex"(#loc5))
+#loc93 = loc("xmask"(#loc6))
+#loc94 = loc("x0"(#loc7))
+#loc95 = loc("x2"(#loc8))
+#loc96 = loc("x4"(#loc9))
+#loc97 = loc("tmp0"(#loc10))
+#loc98 = loc("tmp0"(#loc11))
+#loc99 = loc("tmp0"(#loc12))
+#loc100 = loc("tmp2"(#loc13))
+#loc101 = loc("tmp2"(#loc14))
+#loc102 = loc("tmp2"(#loc15))
+#loc103 = loc("tmp2"(#loc16))
+#loc104 = loc("tmp19"(#loc17))
+#loc105 = loc("tmp19"(#loc18))
+#loc106 = loc("tmp19"(#loc19))
+#loc107 = loc("tmp19"(#loc20))
+#loc108 = loc("tmp23"(#loc21))
+#loc109 = loc("tmp23"(#loc22))
+#loc110 = loc("tmp23"(#loc23))
+#loc111 = loc("tmp3"(#loc24))
+#loc112 = loc("tmp4"(#loc25))
+#loc113 = loc("tmp5"(#loc26))
+#loc114 = loc("tmp6"(#loc27))
+#loc115 = loc("tmp7"(#loc28))
+#loc116 = loc("tmp8"(#loc29))
+#loc117 = loc("tmp9"(#loc30))
+#loc118 = loc("tmp9"(#loc31))
+#loc119 = loc("tmp9"(#loc32))
+#loc120 = loc("tmp9"(#loc33))
+#loc121 = loc("tmp9"(#loc34))
+#loc122 = loc("tmp9"(#loc35))
+#loc123 = loc("tmp9"(#loc36))
+#loc124 = loc("tmp9"(#loc37))
+#loc125 = loc("tmp10"(#loc38))
+#loc126 = loc("tmp11"(#loc39))
+#loc127 = loc("tmp12"(#loc40))
+#loc128 = loc("tmp13"(#loc41))
+#loc129 = loc("tmp14"(#loc42))
+#loc130 = loc("tmp15"(#loc43))
+#loc131 = loc("tmp16"(#loc44))
+#loc132 = loc("tmp16"(#loc45))
+#loc133 = loc("tmp16"(#loc46))
+#loc134 = loc("tmp16"(#loc47))
+#loc135 = loc("tmp16"(#loc48))
+#loc136 = loc("tmp16"(#loc49))
+#loc137 = loc("tmp16"(#loc50))
+#loc138 = loc("tmp17"(#loc51))
+#loc139 = loc("tmp20"(#loc52))
+#loc140 = loc("tmp21"(#loc53))
+#loc141 = loc("tmp25"(#loc54))
+#loc142 = loc("tmp26"(#loc55))
+#loc143 = loc("tmp26"(#loc56))
+#loc144 = loc("tmp26"(#loc57))
+#loc145 = loc("tmp26"(#loc58))
+#loc146 = loc("tmp26"(#loc59))
+#loc147 = loc("tmp26"(#loc60))
+#loc148 = loc("tmp26"(#loc61))
+#loc149 = loc("tmp26"(#loc62))
+#loc150 = loc("tmp27"(#loc63))
+#loc151 = loc("tmp28"(#loc64))
+#loc152 = loc("tmp29"(#loc65))
+#loc153 = loc("tmp30"(#loc66))
+#loc154 = loc("tmp30"(#loc67))
+#loc155 = loc("tmp30"(#loc68))
+#loc156 = loc("tmp30"(#loc69))
+#loc157 = loc("tmp30"(#loc70))
+#loc158 = loc("tmp30"(#loc71))
+#loc159 = loc("tmp30"(#loc72))
+#loc160 = loc("tmp31"(#loc73))
+#loc161 = loc("tmp33"(#loc74))
+#loc162 = loc("tmp34"(#loc75))
diff --git a/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttgir b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..bbe4a8c447e36bdf1be8ed09d8a9bf9b0f38dc86
--- /dev/null
+++ b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttgir
@@ -0,0 +1,198 @@
+#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":18:0)
+#loc59 = loc("in_ptr0"(#loc))
+#loc60 = loc("in_ptr1"(#loc))
+#loc61 = loc("in_ptr2"(#loc))
+#loc62 = loc("in_ptr3"(#loc))
+#loc63 = loc("out_ptr0"(#loc))
+#loc64 = loc("out_ptr1"(#loc))
+#loc65 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<1> : tensor<1024xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<1> : tensor<1024xi64, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<2> : tensor<1024xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<4096> : tensor<1024xi32, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<128> : tensor<1024xi32, #blocked> loc(#loc1)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<1024xbf16, #blocked> loc(#loc1)
+    %cst_5 = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc66)
+    %xoffset_6 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc67)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc68)
+    %xindex_7 = tt.splat %xoffset_6 : i32 -> tensor<1024xi32, #blocked> loc(#loc69)
+    %xindex_8 = arith.addi %xindex_7, %xindex : tensor<1024xi32, #blocked> loc(#loc69)
+    %x0 = arith.remsi %xindex_8, %cst_3 : tensor<1024xi32, #blocked> loc(#loc70)
+    %x2 = arith.divsi %xindex_8, %cst_2 : tensor<1024xi32, #blocked> loc(#loc71)
+    %x4 = arith.divsi %xindex_8, %cst_3 : tensor<1024xi32, #blocked> loc(#loc72)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc73)
+    %tmp0_9 = tt.addptr %tmp0, %xindex_8 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc73)
+    %tmp0_10 = tt.load %tmp0_9 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc74)
+    %tmp0_11 = arith.extf %tmp0_10 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc75)
+    %tmp2 = arith.muli %x2, %cst_3 : tensor<1024xi32, #blocked> loc(#loc76)
+    %tmp2_12 = arith.addi %x0, %tmp2 : tensor<1024xi32, #blocked> loc(#loc77)
+    %tmp2_13 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>, #blocked> loc(#loc78)
+    %tmp2_14 = tt.addptr %tmp2_13, %tmp2_12 : tensor<1024x!tt.ptr<f32>, #blocked>, tensor<1024xi32, #blocked> loc(#loc78)
+    %tmp2_15 = tt.load %tmp2_14 evictionPolicy = evict_last : tensor<1024x!tt.ptr<f32>, #blocked> loc(#loc79)
+    %tmp19 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>, #blocked> loc(#loc80)
+    %tmp19_16 = tt.addptr %tmp19, %tmp2_12 : tensor<1024x!tt.ptr<f32>, #blocked>, tensor<1024xi32, #blocked> loc(#loc80)
+    %tmp19_17 = tt.load %tmp19_16 evictionPolicy = evict_last : tensor<1024x!tt.ptr<f32>, #blocked> loc(#loc81)
+    %tmp23 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc82)
+    %tmp23_18 = tt.addptr %tmp23, %xindex_8 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc82)
+    %tmp23_19 = tt.load %tmp23_18 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc83)
+    %tmp23_20 = arith.extf %tmp23_19 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc84)
+    %tmp3 = arith.mulf %tmp0_11, %tmp2_15 : tensor<1024xf32, #blocked> loc(#loc85)
+    %tmp4 = arith.remsi %xindex_8, %cst_1 : tensor<1024xi32, #blocked> loc(#loc86)
+    %tmp8 = arith.extsi %tmp4 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked> loc(#loc87)
+    %tmp8_21 = arith.cmpi slt, %tmp8, %cst_0 : tensor<1024xi64, #blocked> loc(#loc87)
+    %tmp9 = arith.divsi %x0, %cst_1 : tensor<1024xi32, #blocked> loc(#loc88)
+    %tmp9_22 = arith.muli %tmp9, %cst_1 : tensor<1024xi32, #blocked> loc(#loc89)
+    %tmp9_23 = arith.addi %tmp9_22, %cst : tensor<1024xi32, #blocked> loc(#loc90)
+    %tmp9_24 = arith.muli %x4, %cst_3 : tensor<1024xi32, #blocked> loc(#loc91)
+    %tmp9_25 = arith.addi %tmp9_23, %tmp9_24 : tensor<1024xi32, #blocked> loc(#loc92)
+    %tmp9_26 = tt.addptr %tmp0, %tmp9_25 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc93)
+    %tmp9_27 = tt.load %tmp9_26, %tmp8_21, %cst_4 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc94)
+    %tmp9_28 = arith.extf %tmp9_27 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc95)
+    %tmp10 = arith.subf %cst_5, %tmp9_28 : tensor<1024xf32, #blocked> loc(#loc96)
+    %tmp13 = arith.cmpi sge, %tmp8, %cst_0 : tensor<1024xi64, #blocked> loc(#loc97)
+    %tmp16 = arith.addi %tmp9_22, %tmp9_24 : tensor<1024xi32, #blocked> loc(#loc98)
+    %tmp16_29 = tt.addptr %tmp0, %tmp16 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc99)
+    %tmp16_30 = tt.load %tmp16_29, %tmp13, %cst_4 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc100)
+    %tmp16_31 = arith.extf %tmp16_30 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc101)
+    %tmp17 = arith.select %tmp8_21, %tmp10, %tmp16_31 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked> loc(#loc118)
+    %tmp20 = arith.mulf %tmp17, %tmp19_17 : tensor<1024xf32, #blocked> loc(#loc104)
+    %tmp21 = arith.addf %tmp3, %tmp20 : tensor<1024xf32, #blocked> loc(#loc105)
+    %tmp25 = arith.mulf %tmp23_20, %tmp2_15 : tensor<1024xf32, #blocked> loc(#loc106)
+    %tmp26 = tt.addptr %tmp23, %tmp9_25 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc107)
+    %tmp26_32 = tt.load %tmp26, %tmp8_21, %cst_4 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc108)
+    %tmp26_33 = arith.extf %tmp26_32 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc109)
+    %tmp27 = arith.subf %cst_5, %tmp26_33 : tensor<1024xf32, #blocked> loc(#loc110)
+    %tmp30 = tt.addptr %tmp23, %tmp16 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc111)
+    %tmp30_34 = tt.load %tmp30, %tmp13, %cst_4 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc112)
+    %tmp30_35 = arith.extf %tmp30_34 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc113)
+    %tmp31 = arith.select %tmp8_21, %tmp27, %tmp30_35 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked> loc(#loc119)
+    %tmp33 = arith.mulf %tmp31, %tmp19_17 : tensor<1024xf32, #blocked> loc(#loc116)
+    %tmp34 = arith.addf %tmp25, %tmp33 : tensor<1024xf32, #blocked> loc(#loc117)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc54)
+    %1 = tt.addptr %0, %xindex_8 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc54)
+    %2 = arith.truncf %tmp21 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> loc(#loc55)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc55)
+    %3 = tt.splat %out_ptr1 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc56)
+    %4 = tt.addptr %3, %xindex_8 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc56)
+    %5 = arith.truncf %tmp34 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> loc(#loc57)
+    tt.store %4, %5 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc57)
+    tt.return loc(#loc58)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":24:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":25:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":26:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:30)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:35)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:44)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:39)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:35)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:44)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:31)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:45)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:31)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:36)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:45)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":32:18)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":33:17)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":37:18)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:43)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:37)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:34)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:52)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:48)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:30)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:57)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:107)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":39:13)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":42:20)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:45)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:31)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:54)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:105)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":46:34)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":41:34)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":48:20)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":49:19)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":52:20)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:31)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:58)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:108)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":54:13)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:31)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:54)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:105)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":58:34)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":56:34)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":60:20)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":61:20)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":63:25)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":63:37)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:25)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:37)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:4)
+#loc66 = loc("xoffset"(#loc2))
+#loc67 = loc("xoffset"(#loc3))
+#loc68 = loc("xindex"(#loc4))
+#loc69 = loc("xindex"(#loc5))
+#loc70 = loc("x0"(#loc6))
+#loc71 = loc("x2"(#loc7))
+#loc72 = loc("x4"(#loc8))
+#loc73 = loc("tmp0"(#loc9))
+#loc74 = loc("tmp0"(#loc10))
+#loc75 = loc("tmp0"(#loc11))
+#loc76 = loc("tmp2"(#loc12))
+#loc77 = loc("tmp2"(#loc13))
+#loc78 = loc("tmp2"(#loc14))
+#loc79 = loc("tmp2"(#loc15))
+#loc80 = loc("tmp19"(#loc16))
+#loc81 = loc("tmp19"(#loc17))
+#loc82 = loc("tmp23"(#loc18))
+#loc83 = loc("tmp23"(#loc19))
+#loc84 = loc("tmp23"(#loc20))
+#loc85 = loc("tmp3"(#loc21))
+#loc86 = loc("tmp4"(#loc22))
+#loc87 = loc("tmp8"(#loc23))
+#loc88 = loc("tmp9"(#loc24))
+#loc89 = loc("tmp9"(#loc25))
+#loc90 = loc("tmp9"(#loc26))
+#loc91 = loc("tmp9"(#loc27))
+#loc92 = loc("tmp9"(#loc28))
+#loc93 = loc("tmp9"(#loc29))
+#loc94 = loc("tmp9"(#loc30))
+#loc95 = loc("tmp9"(#loc31))
+#loc96 = loc("tmp10"(#loc32))
+#loc97 = loc("tmp13"(#loc33))
+#loc98 = loc("tmp16"(#loc34))
+#loc99 = loc("tmp16"(#loc35))
+#loc100 = loc("tmp16"(#loc36))
+#loc101 = loc("tmp16"(#loc37))
+#loc102 = loc("tmp17"(#loc38))
+#loc103 = loc("tmp12"(#loc39))
+#loc104 = loc("tmp20"(#loc40))
+#loc105 = loc("tmp21"(#loc41))
+#loc106 = loc("tmp25"(#loc42))
+#loc107 = loc("tmp26"(#loc43))
+#loc108 = loc("tmp26"(#loc44))
+#loc109 = loc("tmp26"(#loc45))
+#loc110 = loc("tmp27"(#loc46))
+#loc111 = loc("tmp30"(#loc47))
+#loc112 = loc("tmp30"(#loc48))
+#loc113 = loc("tmp30"(#loc49))
+#loc114 = loc("tmp31"(#loc50))
+#loc115 = loc("tmp29"(#loc51))
+#loc116 = loc("tmp33"(#loc52))
+#loc117 = loc("tmp34"(#loc53))
+#loc118 = loc(fused[#loc102, #loc103])
+#loc119 = loc(fused[#loc114, #loc115])
diff --git a/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttir b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..cd4a7ecd31b77172537a9e78b74d70737000e896
--- /dev/null
+++ b/triton/PSM7NANFVWEDYUPXKUGOX4GWFVUW6ZQXELVXM5G5LMW6RWIXRCOQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttir
@@ -0,0 +1,197 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":18:0)
+#loc59 = loc("in_ptr0"(#loc))
+#loc60 = loc("in_ptr1"(#loc))
+#loc61 = loc("in_ptr2"(#loc))
+#loc62 = loc("in_ptr3"(#loc))
+#loc63 = loc("out_ptr0"(#loc))
+#loc64 = loc("out_ptr1"(#loc))
+#loc65 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<1024xbf16> loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc1)
+    %cst_1 = arith.constant dense<1> : tensor<1024xi32> loc(#loc1)
+    %cst_2 = arith.constant dense<1> : tensor<1024xi64> loc(#loc1)
+    %cst_3 = arith.constant dense<2> : tensor<1024xi32> loc(#loc1)
+    %x2 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc66)
+    %cst_4 = arith.constant dense<128> : tensor<1024xi32> loc(#loc1)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc67)
+    %xoffset_5 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc68)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc69)
+    %xindex_6 = tt.splat %xoffset_5 : i32 -> tensor<1024xi32> loc(#loc70)
+    %xindex_7 = arith.addi %xindex_6, %xindex : tensor<1024xi32> loc(#loc70)
+    %x0 = arith.remsi %xindex_7, %cst_4 : tensor<1024xi32> loc(#loc71)
+    %x2_8 = arith.divsi %xindex_7, %x2 : tensor<1024xi32> loc(#loc66)
+    %x4 = arith.divsi %xindex_7, %cst_4 : tensor<1024xi32> loc(#loc72)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc73)
+    %tmp0_9 = tt.addptr %tmp0, %xindex_7 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc73)
+    %tmp0_10 = tt.load %tmp0_9 : tensor<1024x!tt.ptr<bf16>> loc(#loc74)
+    %tmp0_11 = arith.extf %tmp0_10 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc75)
+    %tmp2 = arith.muli %x2_8, %cst_4 : tensor<1024xi32> loc(#loc76)
+    %tmp2_12 = arith.addi %x0, %tmp2 : tensor<1024xi32> loc(#loc77)
+    %tmp2_13 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>> loc(#loc78)
+    %tmp2_14 = tt.addptr %tmp2_13, %tmp2_12 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32> loc(#loc78)
+    %tmp2_15 = tt.load %tmp2_14 evictionPolicy = evict_last : tensor<1024x!tt.ptr<f32>> loc(#loc79)
+    %tmp19 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>> loc(#loc80)
+    %tmp19_16 = tt.addptr %tmp19, %tmp2_12 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32> loc(#loc80)
+    %tmp19_17 = tt.load %tmp19_16 evictionPolicy = evict_last : tensor<1024x!tt.ptr<f32>> loc(#loc81)
+    %tmp23 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc82)
+    %tmp23_18 = tt.addptr %tmp23, %xindex_7 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc82)
+    %tmp23_19 = tt.load %tmp23_18 : tensor<1024x!tt.ptr<bf16>> loc(#loc83)
+    %tmp23_20 = arith.extf %tmp23_19 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc84)
+    %tmp3 = arith.mulf %tmp0_11, %tmp2_15 : tensor<1024xf32> loc(#loc85)
+    %tmp4 = arith.remsi %xindex_7, %cst_3 : tensor<1024xi32> loc(#loc86)
+    %tmp8 = arith.extsi %tmp4 : tensor<1024xi32> to tensor<1024xi64> loc(#loc87)
+    %tmp8_21 = arith.cmpi slt, %tmp8, %cst_2 : tensor<1024xi64> loc(#loc87)
+    %tmp9 = arith.divsi %x0, %cst_3 : tensor<1024xi32> loc(#loc88)
+    %tmp9_22 = arith.muli %tmp9, %cst_3 : tensor<1024xi32> loc(#loc89)
+    %tmp9_23 = arith.addi %tmp9_22, %cst_1 : tensor<1024xi32> loc(#loc90)
+    %tmp9_24 = arith.muli %x4, %cst_4 : tensor<1024xi32> loc(#loc91)
+    %tmp9_25 = arith.addi %tmp9_23, %tmp9_24 : tensor<1024xi32> loc(#loc92)
+    %tmp9_26 = tt.addptr %tmp0, %tmp9_25 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc93)
+    %tmp9_27 = tt.load %tmp9_26, %tmp8_21, %cst evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc94)
+    %tmp9_28 = arith.extf %tmp9_27 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc95)
+    %tmp10 = arith.subf %cst_0, %tmp9_28 : tensor<1024xf32> loc(#loc96)
+    %tmp13 = arith.cmpi sge, %tmp8, %cst_2 : tensor<1024xi64> loc(#loc97)
+    %tmp16 = arith.addi %tmp9_22, %tmp9_24 : tensor<1024xi32> loc(#loc98)
+    %tmp16_29 = tt.addptr %tmp0, %tmp16 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc99)
+    %tmp16_30 = tt.load %tmp16_29, %tmp13, %cst evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc100)
+    %tmp16_31 = arith.extf %tmp16_30 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc101)
+    %tmp17 = arith.select %tmp8_21, %tmp10, %tmp16_31 : tensor<1024xi1>, tensor<1024xf32> loc(#loc118)
+    %tmp20 = arith.mulf %tmp17, %tmp19_17 : tensor<1024xf32> loc(#loc104)
+    %tmp21 = arith.addf %tmp3, %tmp20 : tensor<1024xf32> loc(#loc105)
+    %tmp25 = arith.mulf %tmp23_20, %tmp2_15 : tensor<1024xf32> loc(#loc106)
+    %tmp26 = tt.addptr %tmp23, %tmp9_25 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc107)
+    %tmp26_32 = tt.load %tmp26, %tmp8_21, %cst evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc108)
+    %tmp26_33 = arith.extf %tmp26_32 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc109)
+    %tmp27 = arith.subf %cst_0, %tmp26_33 : tensor<1024xf32> loc(#loc110)
+    %tmp30 = tt.addptr %tmp23, %tmp16 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc111)
+    %tmp30_34 = tt.load %tmp30, %tmp13, %cst evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc112)
+    %tmp30_35 = arith.extf %tmp30_34 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc113)
+    %tmp31 = arith.select %tmp8_21, %tmp27, %tmp30_35 : tensor<1024xi1>, tensor<1024xf32> loc(#loc119)
+    %tmp33 = arith.mulf %tmp31, %tmp19_17 : tensor<1024xf32> loc(#loc116)
+    %tmp34 = arith.addf %tmp25, %tmp33 : tensor<1024xf32> loc(#loc117)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc54)
+    %1 = tt.addptr %0, %xindex_7 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc54)
+    %2 = arith.truncf %tmp21 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc55)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>> loc(#loc55)
+    %3 = tt.splat %out_ptr1 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc56)
+    %4 = tt.addptr %3, %xindex_7 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc56)
+    %5 = arith.truncf %tmp34 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc57)
+    tt.store %4, %5 : tensor<1024x!tt.ptr<bf16>> loc(#loc57)
+    tt.return loc(#loc58)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":25:19)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":20:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":20:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":21:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":21:23)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":24:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":26:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:30)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:35)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:44)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:39)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:35)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:44)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:31)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:45)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:31)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:36)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:45)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":32:18)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":33:17)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":37:18)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:43)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:37)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:34)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:52)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:48)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:30)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:57)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:107)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":39:13)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":42:20)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:45)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:31)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:54)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:105)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":46:34)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":41:34)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":48:20)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":49:19)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":52:20)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:31)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:58)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:108)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":54:13)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:31)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:54)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:105)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":58:34)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":56:34)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":60:20)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":61:20)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":63:25)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":63:37)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:25)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:37)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:4)
+#loc66 = loc("x2"(#loc2))
+#loc67 = loc("xoffset"(#loc3))
+#loc68 = loc("xoffset"(#loc4))
+#loc69 = loc("xindex"(#loc5))
+#loc70 = loc("xindex"(#loc6))
+#loc71 = loc("x0"(#loc7))
+#loc72 = loc("x4"(#loc8))
+#loc73 = loc("tmp0"(#loc9))
+#loc74 = loc("tmp0"(#loc10))
+#loc75 = loc("tmp0"(#loc11))
+#loc76 = loc("tmp2"(#loc12))
+#loc77 = loc("tmp2"(#loc13))
+#loc78 = loc("tmp2"(#loc14))
+#loc79 = loc("tmp2"(#loc15))
+#loc80 = loc("tmp19"(#loc16))
+#loc81 = loc("tmp19"(#loc17))
+#loc82 = loc("tmp23"(#loc18))
+#loc83 = loc("tmp23"(#loc19))
+#loc84 = loc("tmp23"(#loc20))
+#loc85 = loc("tmp3"(#loc21))
+#loc86 = loc("tmp4"(#loc22))
+#loc87 = loc("tmp8"(#loc23))
+#loc88 = loc("tmp9"(#loc24))
+#loc89 = loc("tmp9"(#loc25))
+#loc90 = loc("tmp9"(#loc26))
+#loc91 = loc("tmp9"(#loc27))
+#loc92 = loc("tmp9"(#loc28))
+#loc93 = loc("tmp9"(#loc29))
+#loc94 = loc("tmp9"(#loc30))
+#loc95 = loc("tmp9"(#loc31))
+#loc96 = loc("tmp10"(#loc32))
+#loc97 = loc("tmp13"(#loc33))
+#loc98 = loc("tmp16"(#loc34))
+#loc99 = loc("tmp16"(#loc35))
+#loc100 = loc("tmp16"(#loc36))
+#loc101 = loc("tmp16"(#loc37))
+#loc102 = loc("tmp17"(#loc38))
+#loc103 = loc("tmp12"(#loc39))
+#loc104 = loc("tmp20"(#loc40))
+#loc105 = loc("tmp21"(#loc41))
+#loc106 = loc("tmp25"(#loc42))
+#loc107 = loc("tmp26"(#loc43))
+#loc108 = loc("tmp26"(#loc44))
+#loc109 = loc("tmp26"(#loc45))
+#loc110 = loc("tmp27"(#loc46))
+#loc111 = loc("tmp30"(#loc47))
+#loc112 = loc("tmp30"(#loc48))
+#loc113 = loc("tmp30"(#loc49))
+#loc114 = loc("tmp31"(#loc50))
+#loc115 = loc("tmp29"(#loc51))
+#loc116 = loc("tmp33"(#loc52))
+#loc117 = loc("tmp34"(#loc53))
+#loc118 = loc(fused[#loc102, #loc103])
+#loc119 = loc(fused[#loc114, #loc115])
diff --git a/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/__grp__triton_poi_fused_clone_permute_2.json b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/__grp__triton_poi_fused_clone_permute_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..cc46aaaa092b53462e08a66301f0317fdac25dee
--- /dev/null
+++ b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/__grp__triton_poi_fused_clone_permute_2.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused_clone_permute_2.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.source", "triton_poi_fused_clone_permute_2.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.ttir", "triton_poi_fused_clone_permute_2.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.ttgir", "triton_poi_fused_clone_permute_2.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.llir", "triton_poi_fused_clone_permute_2.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.ptx", "triton_poi_fused_clone_permute_2.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.cubin", "triton_poi_fused_clone_permute_2.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.json"}}
\ No newline at end of file
diff --git a/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.cubin b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..acf9275f00eb73587ac18e978fe7d2b8ba03b621
Binary files /dev/null and b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.cubin differ
diff --git a/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.json b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..5266b1dfb39ff9c47ac5a70fca3cc7b5ade5cabc
--- /dev/null
+++ b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.json
@@ -0,0 +1 @@
+{"hash": "7d52522a872e3d81bffc5c7b96ffbdbbe1a6b92e897033bf00204142efef4a53", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_clone_permute_2"}
\ No newline at end of file
diff --git a/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.llir b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.llir
new file mode 100644
index 0000000000000000000000000000000000000000..7a54ef9cf8343e9c92f1b0469c29b2a2bf75c9fb
--- /dev/null
+++ b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.llir
@@ -0,0 +1,67 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused_clone_permute_2(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 {
+  %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %7 = shl i32 %6, 9, !dbg !8
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %9 = shl nuw nsw i32 %8, 1, !dbg !9
+  %10 = and i32 %9, 510, !dbg !9
+  %11 = or disjoint i32 %10, %7, !dbg !10
+  %12 = sdiv i32 %11, 128, !dbg !11
+  %13 = mul i32 %12, 128, !dbg !12
+  %.decomposed = sub i32 %11, %13, !dbg !12
+  %14 = srem i32 %12, 32, !dbg !13
+  %15 = sdiv i32 %11, 4096, !dbg !14
+  %16 = shl nsw i32 %15, 7, !dbg !15
+  %17 = add nsw i32 %16, %.decomposed, !dbg !16
+  %18 = mul nsw i32 %14, 294912, !dbg !17
+  %19 = add nsw i32 %17, %18, !dbg !18
+  %20 = sext i32 %19 to i64, !dbg !19
+  %21 = getelementptr bfloat, ptr addrspace(1) %0, i64 %20, !dbg !19
+  %22 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %21) #2, !dbg !20
+  %23 = sext i32 %11 to i64, !dbg !21
+  %24 = getelementptr bfloat, ptr addrspace(1) %1, i64 %23, !dbg !21
+  tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %22, ptr addrspace(1) %24) #2, !dbg !22
+  ret void, !dbg !23
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+attributes #0 = { nounwind "nvvm.reqntid"="256" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused_clone_permute_2", linkageName: "triton_poi_fused_clone_permute_2", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 20, column: 28, scope: !4)
+!8 = !DILocation(line: 20, column: 33, scope: !4)
+!9 = !DILocation(line: 21, column: 36, scope: !4)
+!10 = !DILocation(line: 21, column: 23, scope: !4)
+!11 = !DILocation(line: 24, column: 21, scope: !4)
+!12 = !DILocation(line: 23, column: 19, scope: !4)
+!13 = !DILocation(line: 24, column: 28, scope: !4)
+!14 = !DILocation(line: 25, column: 19, scope: !4)
+!15 = !DILocation(line: 27, column: 39, scope: !4)
+!16 = !DILocation(line: 27, column: 35, scope: !4)
+!17 = !DILocation(line: 27, column: 51, scope: !4)
+!18 = !DILocation(line: 27, column: 44, scope: !4)
+!19 = !DILocation(line: 27, column: 30, scope: !4)
+!20 = !DILocation(line: 27, column: 56, scope: !4)
+!21 = !DILocation(line: 28, column: 25, scope: !4)
+!22 = !DILocation(line: 28, column: 36, scope: !4)
+!23 = !DILocation(line: 28, column: 4, scope: !4)
diff --git a/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.ptx b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..10a6493b491e7870ac033b725875e9e9bad66953
--- /dev/null
+++ b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.ptx
@@ -0,0 +1,324 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused_clone_permute_2 // -- Begin function triton_poi_fused_clone_permute_2
+                                        // @triton_poi_fused_clone_permute_2
+.visible .entry triton_poi_fused_clone_permute_2(
+	.param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_2_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_2_param_1,
+	.param .u32 triton_poi_fused_clone_permute_2_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_2_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_2_param_4
+)
+.reqntid 256
+{
+	.reg .b32 	%r<24>;
+	.reg .b64 	%rd<5>;
+	.loc	1 18 0                          // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd3, [triton_poi_fused_clone_permute_2_param_0];
+	ld.param.b64 	%rd4, [triton_poi_fused_clone_permute_2_param_1];
+$L__tmp0:
+	.loc	1 20 28                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:20:28
+	mov.u32 	%r2, %ctaid.x;
+	.loc	1 20 33                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:20:33
+	shl.b32 	%r3, %r2, 9;
+	.loc	1 21 36                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:21:36
+	mov.u32 	%r4, %tid.x;
+	shl.b32 	%r5, %r4, 1;
+	and.b32 	%r6, %r5, 510;
+	.loc	1 21 23                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:21:23
+	or.b32 	%r7, %r6, %r3;
+	.loc	1 24 21                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:24:21
+	bfe.s32 	%r8, %r2, 22, 1;
+	shr.u32 	%r9, %r8, 25;
+	add.s32 	%r10, %r7, %r9;
+	shr.s32 	%r11, %r10, 7;
+	.loc	1 23 19                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:23:19
+	and.b32 	%r12, %r10, -128;
+	sub.s32 	%r13, %r7, %r12;
+	.loc	1 24 28                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:24:28
+	shr.u32 	%r14, %r11, 27;
+	add.s32 	%r15, %r11, %r14;
+	and.b32 	%r16, %r15, 131040;
+	sub.s32 	%r17, %r11, %r16;
+	.loc	1 25 19                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:25:19
+	shr.u32 	%r18, %r8, 20;
+	add.s32 	%r19, %r7, %r18;
+	shr.s32 	%r20, %r19, 12;
+	.loc	1 27 39                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:27:39
+	shl.b32 	%r21, %r20, 7;
+	.loc	1 27 35                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:27:35
+	add.s32 	%r22, %r21, %r13;
+	.loc	1 27 44                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:27:44
+	mad.lo.s32 	%r23, %r17, 294912, %r22;
+	.loc	1 27 30                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:27:30
+	mad.wide.s32 	%rd1, %r23, 2, %rd3;
+	.loc	1 27 56                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:27:56
+	// begin inline asm
+	mov.u32 %r1, 0x0;
+	ld.global.b32 { %r1 }, [ %rd1 + 0 ];
+	// end inline asm
+	.loc	1 28 25                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:28:25
+	mad.wide.s32 	%rd2, %r7, 2, %rd4;
+	.loc	1 28 36                         // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:28:36
+	// begin inline asm
+	st.global.b32 [ %rd2 + 0 ], { %r1 };
+	// end inline asm
+	.loc	1 28 4                          // cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py:28:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 106
+.b8 52
+.b8 55
+.b8 118
+.b8 122
+.b8 50
+.b8 117
+.b8 55
+.b8 105
+.b8 51
+.b8 116
+.b8 104
+.b8 53
+.b8 51
+.b8 99
+.b8 102
+.b8 50
+.b8 101
+.b8 108
+.b8 99
+.b8 53
+.b8 102
+.b8 105
+.b8 121
+.b8 108
+.b8 118
+.b8 121
+.b8 107
+.b8 55
+.b8 111
+.b8 51
+.b8 110
+.b8 105
+.b8 50
+.b8 112
+.b8 110
+.b8 52
+.b8 99
+.b8 50
+.b8 98
+.b8 100
+.b8 100
+.b8 114
+.b8 122
+.b8 113
+.b8 53
+.b8 106
+.b8 110
+.b8 117
+.b8 110
+.b8 113
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 106
+.b8 52
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.source b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.source
new file mode 100644
index 0000000000000000000000000000000000000000..fb39d548a310881d8db46da35c1ed42ee766abb4
--- /dev/null
+++ b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.source
@@ -0,0 +1,90 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":18:0)
+#loc21 = loc("in_ptr0"(#loc))
+#loc22 = loc("out_ptr0"(#loc))
+#loc23 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_clone_permute_2(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 9437184 : i32 loc(#loc24)
+    %xoffset = tt.get_program_id x : i32 loc(#loc25)
+    %xoffset_1 = arith.constant 512 : i32 loc(#loc26)
+    %xoffset_2 = arith.constant 512 : i32 loc(#loc26)
+    %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc26)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc27)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc28)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc28)
+    %xmask = arith.constant true loc(#loc29)
+    %xmask_6 = arith.constant dense<true> : tensor<512xi1> loc(#loc29)
+    %x0 = arith.constant 128 : i32 loc(#loc30)
+    %x0_7 = arith.constant 128 : i32 loc(#loc30)
+    %x0_8 = arith.constant dense<128> : tensor<512xi32> loc(#loc30)
+    %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<512xi32> loc(#loc30)
+    %x1 = arith.constant 128 : i32 loc(#loc31)
+    %x1_10 = arith.constant 128 : i32 loc(#loc31)
+    %x1_11 = arith.constant dense<128> : tensor<512xi32> loc(#loc31)
+    %x1_12 = arith.divsi %xindex_5, %x1_11 : tensor<512xi32> loc(#loc31)
+    %x1_13 = arith.constant 32 : i32 loc(#loc32)
+    %x1_14 = arith.constant 32 : i32 loc(#loc32)
+    %x1_15 = arith.constant dense<32> : tensor<512xi32> loc(#loc32)
+    %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<512xi32> loc(#loc32)
+    %x2 = arith.constant 4096 : i32 loc(#loc33)
+    %x2_17 = arith.constant 4096 : i32 loc(#loc33)
+    %x2_18 = arith.constant dense<4096> : tensor<512xi32> loc(#loc33)
+    %x2_19 = arith.divsi %xindex_5, %x2_18 : tensor<512xi32> loc(#loc33)
+    %tmp0 = arith.constant 128 : i32 loc(#loc34)
+    %tmp0_20 = arith.constant 128 : i32 loc(#loc34)
+    %tmp0_21 = arith.constant dense<128> : tensor<512xi32> loc(#loc34)
+    %tmp0_22 = arith.muli %tmp0_21, %x2_19 : tensor<512xi32> loc(#loc34)
+    %tmp0_23 = arith.addi %x0_9, %tmp0_22 : tensor<512xi32> loc(#loc35)
+    %tmp0_24 = arith.constant 294912 : i32 loc(#loc36)
+    %tmp0_25 = arith.constant 294912 : i32 loc(#loc36)
+    %tmp0_26 = arith.constant dense<294912> : tensor<512xi32> loc(#loc36)
+    %tmp0_27 = arith.muli %tmp0_26, %x1_16 : tensor<512xi32> loc(#loc36)
+    %tmp0_28 = arith.addi %tmp0_23, %tmp0_27 : tensor<512xi32> loc(#loc37)
+    %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc38)
+    %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc38)
+    %tmp0_31 = tt.load %tmp0_30 : tensor<512x!tt.ptr<bf16>> loc(#loc39)
+    %tmp0_32 = arith.extf %tmp0_31 : tensor<512xbf16> to tensor<512xf32> loc(#loc40)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc18)
+    %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc18)
+    %2 = arith.truncf %tmp0_32 : tensor<512xf32> to tensor<512xbf16> loc(#loc19)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>> loc(#loc19)
+    tt.return loc(#loc20)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":22:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":23:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":24:21)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":24:28)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":25:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:39)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:35)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:51)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:44)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:30)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:56)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:65)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:25)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:36)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:4)
+#loc24 = loc("xnumel"(#loc1))
+#loc25 = loc("xoffset"(#loc2))
+#loc26 = loc("xoffset"(#loc3))
+#loc27 = loc("xindex"(#loc4))
+#loc28 = loc("xindex"(#loc5))
+#loc29 = loc("xmask"(#loc6))
+#loc30 = loc("x0"(#loc7))
+#loc31 = loc("x1"(#loc8))
+#loc32 = loc("x1"(#loc9))
+#loc33 = loc("x2"(#loc10))
+#loc34 = loc("tmp0"(#loc11))
+#loc35 = loc("tmp0"(#loc12))
+#loc36 = loc("tmp0"(#loc13))
+#loc37 = loc("tmp0"(#loc14))
+#loc38 = loc("tmp0"(#loc15))
+#loc39 = loc("tmp0"(#loc16))
+#loc40 = loc("tmp0"(#loc17))
diff --git a/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.ttgir b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..cea7be2293112cec854e1647a9bf428d763ceef5
--- /dev/null
+++ b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.ttgir
@@ -0,0 +1,66 @@
+#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":18:0)
+#loc19 = loc("in_ptr0"(#loc))
+#loc20 = loc("out_ptr0"(#loc))
+#loc21 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused_clone_permute_2(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<128> : tensor<512xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<32> : tensor<512xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<4096> : tensor<512xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<294912> : tensor<512xi32, #blocked> loc(#loc1)
+    %c512_i32 = arith.constant 512 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc22)
+    %xoffset_3 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc23)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc24)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32, #blocked> loc(#loc25)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32, #blocked> loc(#loc25)
+    %x0 = arith.remsi %xindex_5, %cst : tensor<512xi32, #blocked> loc(#loc26)
+    %x1 = arith.divsi %xindex_5, %cst : tensor<512xi32, #blocked> loc(#loc27)
+    %x1_6 = arith.remsi %x1, %cst_0 : tensor<512xi32, #blocked> loc(#loc28)
+    %x2 = arith.divsi %xindex_5, %cst_1 : tensor<512xi32, #blocked> loc(#loc29)
+    %tmp0 = arith.muli %x2, %cst : tensor<512xi32, #blocked> loc(#loc30)
+    %tmp0_7 = arith.addi %x0, %tmp0 : tensor<512xi32, #blocked> loc(#loc31)
+    %tmp0_8 = arith.muli %x1_6, %cst_2 : tensor<512xi32, #blocked> loc(#loc32)
+    %tmp0_9 = arith.addi %tmp0_7, %tmp0_8 : tensor<512xi32, #blocked> loc(#loc33)
+    %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc34)
+    %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc34)
+    %tmp0_12 = tt.load %tmp0_11 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc35)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc16)
+    %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc16)
+    tt.store %1, %tmp0_12 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc17)
+    tt.return loc(#loc18)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":23:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":24:21)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":24:28)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":25:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:39)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:51)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:44)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:56)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:25)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:36)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:4)
+#loc22 = loc("xoffset"(#loc2))
+#loc23 = loc("xoffset"(#loc3))
+#loc24 = loc("xindex"(#loc4))
+#loc25 = loc("xindex"(#loc5))
+#loc26 = loc("x0"(#loc6))
+#loc27 = loc("x1"(#loc7))
+#loc28 = loc("x1"(#loc8))
+#loc29 = loc("x2"(#loc9))
+#loc30 = loc("tmp0"(#loc10))
+#loc31 = loc("tmp0"(#loc11))
+#loc32 = loc("tmp0"(#loc12))
+#loc33 = loc("tmp0"(#loc13))
+#loc34 = loc("tmp0"(#loc14))
+#loc35 = loc("tmp0"(#loc15))
diff --git a/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.ttir b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..736e658ece89e46da1fc5785e06cb31d4b9ca20a
--- /dev/null
+++ b/triton/PVJFEKUHFY6YDP74LR5ZN755XPQ2NOJORFYDHPYAEBAUF37PJJJQ/triton_poi_fused_clone_permute_2.ttir
@@ -0,0 +1,65 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":18:0)
+#loc19 = loc("in_ptr0"(#loc))
+#loc20 = loc("out_ptr0"(#loc))
+#loc21 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_clone_permute_2(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %tmp0 = arith.constant dense<294912> : tensor<512xi32> loc(#loc22)
+    %x2 = arith.constant dense<4096> : tensor<512xi32> loc(#loc23)
+    %x1 = arith.constant dense<32> : tensor<512xi32> loc(#loc24)
+    %cst = arith.constant dense<128> : tensor<512xi32> loc(#loc4)
+    %c512_i32 = arith.constant 512 : i32 loc(#loc4)
+    %xoffset = tt.get_program_id x : i32 loc(#loc25)
+    %xoffset_0 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc26)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc27)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<512xi32> loc(#loc28)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<512xi32> loc(#loc28)
+    %x0 = arith.remsi %xindex_2, %cst : tensor<512xi32> loc(#loc29)
+    %x1_3 = arith.divsi %xindex_2, %cst : tensor<512xi32> loc(#loc30)
+    %x1_4 = arith.remsi %x1_3, %x1 : tensor<512xi32> loc(#loc24)
+    %x2_5 = arith.divsi %xindex_2, %x2 : tensor<512xi32> loc(#loc23)
+    %tmp0_6 = arith.muli %x2_5, %cst : tensor<512xi32> loc(#loc31)
+    %tmp0_7 = arith.addi %x0, %tmp0_6 : tensor<512xi32> loc(#loc32)
+    %tmp0_8 = arith.muli %x1_4, %tmp0 : tensor<512xi32> loc(#loc22)
+    %tmp0_9 = arith.addi %tmp0_7, %tmp0_8 : tensor<512xi32> loc(#loc33)
+    %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc34)
+    %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc34)
+    %tmp0_12 = tt.load %tmp0_11 : tensor<512x!tt.ptr<bf16>> loc(#loc35)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc16)
+    %1 = tt.addptr %0, %xindex_2 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc16)
+    tt.store %1, %tmp0_12 : tensor<512x!tt.ptr<bf16>> loc(#loc17)
+    tt.return loc(#loc18)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:51)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":25:19)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":24:28)
+#loc4 = loc(unknown)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":20:28)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":20:33)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":21:36)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":21:23)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":23:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":24:21)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:39)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:35)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:44)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":27:56)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:25)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:36)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j4/cj47vz2u7i3th53cf2elc5fiylvyk7o3ni2pn4c2bddrzq5jnunq.py":28:4)
+#loc22 = loc("tmp0"(#loc1))
+#loc23 = loc("x2"(#loc2))
+#loc24 = loc("x1"(#loc3))
+#loc25 = loc("xoffset"(#loc5))
+#loc26 = loc("xoffset"(#loc6))
+#loc27 = loc("xindex"(#loc7))
+#loc28 = loc("xindex"(#loc8))
+#loc29 = loc("x0"(#loc9))
+#loc30 = loc("x1"(#loc10))
+#loc31 = loc("tmp0"(#loc11))
+#loc32 = loc("tmp0"(#loc12))
+#loc33 = loc("tmp0"(#loc13))
+#loc34 = loc("tmp0"(#loc14))
+#loc35 = loc("tmp0"(#loc15))
diff --git a/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/__grp__triton_poi_fused_add_mul_0.json b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/__grp__triton_poi_fused_add_mul_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..8413b80802ab7e798a5db50533dc5fd4a187d9c7
--- /dev/null
+++ b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/__grp__triton_poi_fused_add_mul_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused_add_mul_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.source", "triton_poi_fused_add_mul_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.ttir", "triton_poi_fused_add_mul_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.ttgir", "triton_poi_fused_add_mul_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.llir", "triton_poi_fused_add_mul_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.ptx", "triton_poi_fused_add_mul_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.cubin", "triton_poi_fused_add_mul_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.json"}}
\ No newline at end of file
diff --git a/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.cubin b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..0863a2a23203195e02d18b93847ac8b8854102e6
Binary files /dev/null and b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.cubin differ
diff --git a/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.json b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..99f87dc7c7d91be60abc7f071d39cdb9a512fa18
--- /dev/null
+++ b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.json
@@ -0,0 +1 @@
+{"hash": "87608511e91c4ff29f3e10c2b60fb8e1a66a7b1f402cd2ad284c5a6707f12d79", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_add_mul_0"}
\ No newline at end of file
diff --git a/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.llir b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..e21e9f05be0ab5246db2ad7bf2f4d35695de041e
--- /dev/null
+++ b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.llir
@@ -0,0 +1,118 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused_add_mul_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 {
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %9 = shl i32 %8, 10, !dbg !8
+  %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %11 = shl nuw nsw i32 %10, 3, !dbg !9
+  %12 = and i32 %11, 1016, !dbg !9
+  %13 = or disjoint i32 %12, %9, !dbg !10
+  %14 = srem i32 %13, 4096, !dbg !11
+  %15 = sext i32 %13 to i64, !dbg !12
+  %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !12
+  %17 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %16) #2, !dbg !13
+  %18 = extractvalue { i32, i32, i32, i32 } %17, 0, !dbg !13
+  %19 = bitcast i32 %18 to <2 x bfloat>, !dbg !13
+  %20 = extractvalue { i32, i32, i32, i32 } %17, 1, !dbg !13
+  %21 = bitcast i32 %20 to <2 x bfloat>, !dbg !13
+  %22 = extractvalue { i32, i32, i32, i32 } %17, 2, !dbg !13
+  %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !13
+  %24 = extractvalue { i32, i32, i32, i32 } %17, 3, !dbg !13
+  %25 = bitcast i32 %24 to <2 x bfloat>, !dbg !13
+  %26 = sext i32 %14 to i64, !dbg !14
+  %27 = getelementptr bfloat, ptr addrspace(1) %1, i64 %26, !dbg !14
+  %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !15
+  %29 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ], $5;", "=r,=r,=r,=r,l,l"(ptr addrspace(1) %27, i64 %28) #2, !dbg !15
+  %30 = extractvalue { i32, i32, i32, i32 } %29, 0, !dbg !15
+  %31 = bitcast i32 %30 to <2 x bfloat>, !dbg !15
+  %32 = extractvalue { i32, i32, i32, i32 } %29, 1, !dbg !15
+  %33 = bitcast i32 %32 to <2 x bfloat>, !dbg !15
+  %34 = extractvalue { i32, i32, i32, i32 } %29, 2, !dbg !15
+  %35 = bitcast i32 %34 to <2 x bfloat>, !dbg !15
+  %36 = extractvalue { i32, i32, i32, i32 } %29, 3, !dbg !15
+  %37 = bitcast i32 %36 to <2 x bfloat>, !dbg !15
+  %38 = getelementptr bfloat, ptr addrspace(1) %2, i64 %15, !dbg !16
+  %39 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %38) #2, !dbg !17
+  %40 = extractvalue { i32, i32, i32, i32 } %39, 0, !dbg !17
+  %41 = bitcast i32 %40 to <2 x bfloat>, !dbg !17
+  %42 = extractvalue { i32, i32, i32, i32 } %39, 1, !dbg !17
+  %43 = bitcast i32 %42 to <2 x bfloat>, !dbg !17
+  %44 = extractvalue { i32, i32, i32, i32 } %39, 2, !dbg !17
+  %45 = bitcast i32 %44 to <2 x bfloat>, !dbg !17
+  %46 = extractvalue { i32, i32, i32, i32 } %39, 3, !dbg !17
+  %47 = bitcast i32 %46 to <2 x bfloat>, !dbg !17
+  %48 = getelementptr bfloat, ptr addrspace(1) %3, i64 %15, !dbg !18
+  %49 = fpext <2 x bfloat> %19 to <2 x float>, !dbg !19
+  %50 = fpext <2 x bfloat> %31 to <2 x float>, !dbg !20
+  %51 = fpext <2 x bfloat> %41 to <2 x float>, !dbg !21
+  %52 = fmul <2 x float> %50, %51, !dbg !22
+  %53 = fadd <2 x float> %52, %49, !dbg !23
+  %54 = fptrunc <2 x float> %53 to <2 x bfloat>, !dbg !24
+  %55 = fpext <2 x bfloat> %21 to <2 x float>, !dbg !19
+  %56 = fpext <2 x bfloat> %33 to <2 x float>, !dbg !20
+  %57 = fpext <2 x bfloat> %43 to <2 x float>, !dbg !21
+  %58 = fmul <2 x float> %56, %57, !dbg !22
+  %59 = fadd <2 x float> %58, %55, !dbg !23
+  %60 = fptrunc <2 x float> %59 to <2 x bfloat>, !dbg !24
+  %61 = fpext <2 x bfloat> %23 to <2 x float>, !dbg !19
+  %62 = fpext <2 x bfloat> %35 to <2 x float>, !dbg !20
+  %63 = fpext <2 x bfloat> %45 to <2 x float>, !dbg !21
+  %64 = fmul <2 x float> %62, %63, !dbg !22
+  %65 = fadd <2 x float> %64, %61, !dbg !23
+  %66 = fptrunc <2 x float> %65 to <2 x bfloat>, !dbg !24
+  %67 = fpext <2 x bfloat> %25 to <2 x float>, !dbg !19
+  %68 = fpext <2 x bfloat> %37 to <2 x float>, !dbg !20
+  %69 = fpext <2 x bfloat> %47 to <2 x float>, !dbg !21
+  %70 = fmul <2 x float> %68, %69, !dbg !22
+  %71 = fadd <2 x float> %70, %67, !dbg !23
+  %72 = fptrunc <2 x float> %71 to <2 x bfloat>, !dbg !24
+  %73 = bitcast <2 x bfloat> %54 to i32, !dbg !24
+  %74 = bitcast <2 x bfloat> %60 to i32, !dbg !24
+  %75 = bitcast <2 x bfloat> %66 to i32, !dbg !24
+  %76 = bitcast <2 x bfloat> %72 to i32, !dbg !24
+  tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %73, i32 %74, i32 %75, i32 %76, ptr addrspace(1) %48) #2, !dbg !24
+  ret void, !dbg !25
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused_add_mul_0", linkageName: "triton_poi_fused_add_mul_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 20, column: 28, scope: !4)
+!8 = !DILocation(line: 20, column: 33, scope: !4)
+!9 = !DILocation(line: 21, column: 36, scope: !4)
+!10 = !DILocation(line: 21, column: 23, scope: !4)
+!11 = !DILocation(line: 24, column: 19, scope: !4)
+!12 = !DILocation(line: 25, column: 30, scope: !4)
+!13 = !DILocation(line: 25, column: 35, scope: !4)
+!14 = !DILocation(line: 26, column: 30, scope: !4)
+!15 = !DILocation(line: 26, column: 35, scope: !4)
+!16 = !DILocation(line: 27, column: 30, scope: !4)
+!17 = !DILocation(line: 27, column: 35, scope: !4)
+!18 = !DILocation(line: 30, column: 25, scope: !4)
+!19 = !DILocation(line: 25, column: 44, scope: !4)
+!20 = !DILocation(line: 26, column: 74, scope: !4)
+!21 = !DILocation(line: 27, column: 44, scope: !4)
+!22 = !DILocation(line: 28, column: 18, scope: !4)
+!23 = !DILocation(line: 29, column: 18, scope: !4)
+!24 = !DILocation(line: 30, column: 36, scope: !4)
+!25 = !DILocation(line: 30, column: 4, scope: !4)
diff --git a/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.ptx b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..9fd68f9f34101b13ddf924239071fbcbf373d304
--- /dev/null
+++ b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.ptx
@@ -0,0 +1,407 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused_add_mul_0 // -- Begin function triton_poi_fused_add_mul_0
+                                        // @triton_poi_fused_add_mul_0
+.visible .entry triton_poi_fused_add_mul_0(
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_1,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_3,
+	.param .u32 triton_poi_fused_add_mul_0_param_4,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_5,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_6
+)
+.reqntid 128
+{
+	.reg .b16 	%rs<25>;
+	.reg .b32 	%r<60>;
+	.reg .b64 	%rd<11>;
+	.loc	1 18 0                          // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd6, [triton_poi_fused_add_mul_0_param_0];
+	ld.param.b64 	%rd7, [triton_poi_fused_add_mul_0_param_1];
+$L__tmp0:
+	.loc	1 20 28                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:20:28
+	mov.u32 	%r17, %ctaid.x;
+	.loc	1 20 33                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:20:33
+	shl.b32 	%r18, %r17, 10;
+	ld.param.b64 	%rd8, [triton_poi_fused_add_mul_0_param_2];
+	ld.param.b64 	%rd9, [triton_poi_fused_add_mul_0_param_3];
+	.loc	1 21 36                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:21:36
+	mov.u32 	%r19, %tid.x;
+	shl.b32 	%r20, %r19, 3;
+	and.b32 	%r21, %r20, 1016;
+	.loc	1 21 23                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:21:23
+	or.b32 	%r22, %r21, %r18;
+	.loc	1 24 19                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:24:19
+	bfe.s32 	%r23, %r17, 21, 1;
+	shr.u32 	%r24, %r23, 20;
+	add.s32 	%r25, %r22, %r24;
+	and.b32 	%r26, %r25, -4096;
+	sub.s32 	%r27, %r22, %r26;
+	.loc	1 25 30                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:25:30
+	mul.wide.s32 	%rd10, %r22, 2;
+	add.s64 	%rd1, %rd6, %rd10;
+	.loc	1 25 35                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:25:35
+	// begin inline asm
+	mov.u32 %r1, 0x0;
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	mov.u32 %r4, 0x0;
+	ld.global.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ];
+	// end inline asm
+	.loc	1 26 30                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:26:30
+	mad.wide.s32 	%rd2, %r27, 2, %rd7;
+	.loc	1 26 35                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:26:35
+	// begin inline asm
+	mov.u64 %rd3, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd3, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r5, 0x0;
+	mov.u32 %r6, 0x0;
+	mov.u32 %r7, 0x0;
+	mov.u32 %r8, 0x0;
+	ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r5, %r6, %r7, %r8 }, [ %rd2 + 0 ], %rd3;
+	// end inline asm
+	.loc	1 27 30                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:27:30
+	add.s64 	%rd4, %rd8, %rd10;
+	.loc	1 27 35                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:27:35
+	// begin inline asm
+	mov.u32 %r9, 0x0;
+	mov.u32 %r10, 0x0;
+	mov.u32 %r11, 0x0;
+	mov.u32 %r12, 0x0;
+	ld.global.v4.b32 { %r9, %r10, %r11, %r12 }, [ %rd4 + 0 ];
+	// end inline asm
+	.loc	1 30 25                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:30:25
+	add.s64 	%rd5, %rd9, %rd10;
+	.loc	1 25 44                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:25:44
+	mov.b32 	{%rs1, %rs2}, %r1;
+	cvt.f32.bf16 	%r28, %rs2;
+	cvt.f32.bf16 	%r29, %rs1;
+	.loc	1 26 74                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:26:74
+	mov.b32 	{%rs3, %rs4}, %r5;
+	cvt.f32.bf16 	%r30, %rs4;
+	cvt.f32.bf16 	%r31, %rs3;
+	.loc	1 27 44                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:27:44
+	mov.b32 	{%rs5, %rs6}, %r9;
+	cvt.f32.bf16 	%r32, %rs6;
+	cvt.f32.bf16 	%r33, %rs5;
+	.loc	1 29 18                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:29:18
+	fma.rn.f32 	%r34, %r31, %r33, %r29;
+	fma.rn.f32 	%r35, %r30, %r32, %r28;
+	.loc	1 30 36                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:30:36
+	cvt.rn.bf16x2.f32 	%r13, %r35, %r34;
+	.loc	1 25 44                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:25:44
+	mov.b32 	{%rs7, %rs8}, %r2;
+	cvt.f32.bf16 	%r36, %rs8;
+	cvt.f32.bf16 	%r37, %rs7;
+	.loc	1 26 74                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:26:74
+	mov.b32 	{%rs9, %rs10}, %r6;
+	cvt.f32.bf16 	%r38, %rs10;
+	cvt.f32.bf16 	%r39, %rs9;
+	.loc	1 27 44                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:27:44
+	mov.b32 	{%rs11, %rs12}, %r10;
+	cvt.f32.bf16 	%r40, %rs12;
+	cvt.f32.bf16 	%r41, %rs11;
+	.loc	1 29 18                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:29:18
+	fma.rn.f32 	%r42, %r39, %r41, %r37;
+	fma.rn.f32 	%r43, %r38, %r40, %r36;
+	.loc	1 30 36                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:30:36
+	cvt.rn.bf16x2.f32 	%r14, %r43, %r42;
+	.loc	1 25 44                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:25:44
+	mov.b32 	{%rs13, %rs14}, %r3;
+	cvt.f32.bf16 	%r44, %rs14;
+	cvt.f32.bf16 	%r45, %rs13;
+	.loc	1 26 74                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:26:74
+	mov.b32 	{%rs15, %rs16}, %r7;
+	cvt.f32.bf16 	%r46, %rs16;
+	cvt.f32.bf16 	%r47, %rs15;
+	.loc	1 27 44                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:27:44
+	mov.b32 	{%rs17, %rs18}, %r11;
+	cvt.f32.bf16 	%r48, %rs18;
+	cvt.f32.bf16 	%r49, %rs17;
+	.loc	1 29 18                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:29:18
+	fma.rn.f32 	%r50, %r47, %r49, %r45;
+	fma.rn.f32 	%r51, %r46, %r48, %r44;
+	.loc	1 30 36                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:30:36
+	cvt.rn.bf16x2.f32 	%r15, %r51, %r50;
+	.loc	1 25 44                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:25:44
+	mov.b32 	{%rs19, %rs20}, %r4;
+	cvt.f32.bf16 	%r52, %rs20;
+	cvt.f32.bf16 	%r53, %rs19;
+	.loc	1 26 74                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:26:74
+	mov.b32 	{%rs21, %rs22}, %r8;
+	cvt.f32.bf16 	%r54, %rs22;
+	cvt.f32.bf16 	%r55, %rs21;
+	.loc	1 27 44                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:27:44
+	mov.b32 	{%rs23, %rs24}, %r12;
+	cvt.f32.bf16 	%r56, %rs24;
+	cvt.f32.bf16 	%r57, %rs23;
+	.loc	1 29 18                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:29:18
+	fma.rn.f32 	%r58, %r55, %r57, %r53;
+	fma.rn.f32 	%r59, %r54, %r56, %r52;
+	.loc	1 30 36                         // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:30:36
+	cvt.rn.bf16x2.f32 	%r16, %r59, %r58;
+	// begin inline asm
+	st.global.v4.b32 [ %rd5 + 0 ], { %r13, %r14, %r15, %r16 };
+	// end inline asm
+	.loc	1 30 4                          // c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py:30:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 54
+.b8 107
+.b8 97
+.b8 116
+.b8 53
+.b8 103
+.b8 55
+.b8 110
+.b8 51
+.b8 117
+.b8 117
+.b8 107
+.b8 107
+.b8 102
+.b8 119
+.b8 103
+.b8 100
+.b8 120
+.b8 102
+.b8 119
+.b8 116
+.b8 109
+.b8 120
+.b8 98
+.b8 108
+.b8 99
+.b8 109
+.b8 113
+.b8 122
+.b8 104
+.b8 98
+.b8 105
+.b8 102
+.b8 111
+.b8 53
+.b8 103
+.b8 51
+.b8 114
+.b8 98
+.b8 97
+.b8 122
+.b8 51
+.b8 100
+.b8 106
+.b8 120
+.b8 105
+.b8 105
+.b8 51
+.b8 53
+.b8 103
+.b8 105
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 54
+.b8 107
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.source b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..3e7bbcc0cc0df0c77b070bdcbdfe4f4907a1a357
--- /dev/null
+++ b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.source
@@ -0,0 +1,82 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":18:0)
+#loc22 = loc("in_ptr0"(#loc))
+#loc23 = loc("in_ptr1"(#loc))
+#loc24 = loc("in_ptr2"(#loc))
+#loc25 = loc("out_ptr0"(#loc))
+#loc26 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_add_mul_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 1048576 : i32 loc(#loc27)
+    %xoffset = tt.get_program_id x : i32 loc(#loc28)
+    %xoffset_1 = arith.constant 1024 : i32 loc(#loc29)
+    %xoffset_2 = arith.constant 1024 : i32 loc(#loc29)
+    %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc29)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc30)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc31)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc31)
+    %xmask = arith.constant true loc(#loc32)
+    %xmask_6 = arith.constant dense<true> : tensor<1024xi1> loc(#loc32)
+    %x0 = arith.constant 4096 : i32 loc(#loc33)
+    %x0_7 = arith.constant 4096 : i32 loc(#loc33)
+    %x0_8 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc33)
+    %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<1024xi32> loc(#loc33)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc34)
+    %tmp0_10 = tt.addptr %tmp0, %xindex_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc34)
+    %tmp0_11 = tt.load %tmp0_10 : tensor<1024x!tt.ptr<bf16>> loc(#loc35)
+    %tmp0_12 = arith.extf %tmp0_11 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc36)
+    %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc37)
+    %tmp1_13 = tt.addptr %tmp1, %x0_9 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc37)
+    %tmp1_14 = tt.load %tmp1_13 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc38)
+    %tmp1_15 = arith.extf %tmp1_14 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc39)
+    %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc40)
+    %tmp2_16 = tt.addptr %tmp2, %xindex_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc40)
+    %tmp2_17 = tt.load %tmp2_16 : tensor<1024x!tt.ptr<bf16>> loc(#loc41)
+    %tmp2_18 = arith.extf %tmp2_17 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc42)
+    %tmp3 = arith.mulf %tmp1_15, %tmp2_18 : tensor<1024xf32> loc(#loc43)
+    %tmp4 = arith.addf %tmp0_12, %tmp3 : tensor<1024xf32> loc(#loc44)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc19)
+    %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc19)
+    %2 = arith.truncf %tmp4 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc20)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>> loc(#loc20)
+    tt.return loc(#loc21)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":22:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":24:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:30)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:35)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:44)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:30)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:35)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:74)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:35)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:44)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":28:18)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":29:18)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:25)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:36)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:4)
+#loc27 = loc("xnumel"(#loc1))
+#loc28 = loc("xoffset"(#loc2))
+#loc29 = loc("xoffset"(#loc3))
+#loc30 = loc("xindex"(#loc4))
+#loc31 = loc("xindex"(#loc5))
+#loc32 = loc("xmask"(#loc6))
+#loc33 = loc("x0"(#loc7))
+#loc34 = loc("tmp0"(#loc8))
+#loc35 = loc("tmp0"(#loc9))
+#loc36 = loc("tmp0"(#loc10))
+#loc37 = loc("tmp1"(#loc11))
+#loc38 = loc("tmp1"(#loc12))
+#loc39 = loc("tmp1"(#loc13))
+#loc40 = loc("tmp2"(#loc14))
+#loc41 = loc("tmp2"(#loc15))
+#loc42 = loc("tmp2"(#loc16))
+#loc43 = loc("tmp3"(#loc17))
+#loc44 = loc("tmp4"(#loc18))
diff --git a/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.ttgir b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..c149c43803247cd01a3d873dc50a8fa2c6dc9d16
--- /dev/null
+++ b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.ttgir
@@ -0,0 +1,74 @@
+#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":18:0)
+#loc21 = loc("in_ptr0"(#loc))
+#loc22 = loc("in_ptr1"(#loc))
+#loc23 = loc("in_ptr2"(#loc))
+#loc24 = loc("out_ptr0"(#loc))
+#loc25 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused_add_mul_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4096> : tensor<1024xi32, #blocked> loc(#loc1)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc26)
+    %xoffset_0 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc27)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc28)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<1024xi32, #blocked> loc(#loc29)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<1024xi32, #blocked> loc(#loc29)
+    %x0 = arith.remsi %xindex_2, %cst : tensor<1024xi32, #blocked> loc(#loc30)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc31)
+    %tmp0_3 = tt.addptr %tmp0, %xindex_2 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc31)
+    %tmp0_4 = tt.load %tmp0_3 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc32)
+    %tmp0_5 = arith.extf %tmp0_4 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc33)
+    %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc34)
+    %tmp1_6 = tt.addptr %tmp1, %x0 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc34)
+    %tmp1_7 = tt.load %tmp1_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc35)
+    %tmp1_8 = arith.extf %tmp1_7 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc36)
+    %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc37)
+    %tmp2_9 = tt.addptr %tmp2, %xindex_2 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc37)
+    %tmp2_10 = tt.load %tmp2_9 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc38)
+    %tmp2_11 = arith.extf %tmp2_10 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc39)
+    %tmp3 = arith.mulf %tmp1_8, %tmp2_11 : tensor<1024xf32, #blocked> loc(#loc40)
+    %tmp4 = arith.addf %tmp0_5, %tmp3 : tensor<1024xf32, #blocked> loc(#loc41)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc18)
+    %1 = tt.addptr %0, %xindex_2 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc18)
+    %2 = arith.truncf %tmp4 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> loc(#loc19)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc19)
+    tt.return loc(#loc20)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":24:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:30)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:35)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:44)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:30)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:74)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:30)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:35)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:44)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":28:18)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":29:18)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:25)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:36)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:4)
+#loc26 = loc("xoffset"(#loc2))
+#loc27 = loc("xoffset"(#loc3))
+#loc28 = loc("xindex"(#loc4))
+#loc29 = loc("xindex"(#loc5))
+#loc30 = loc("x0"(#loc6))
+#loc31 = loc("tmp0"(#loc7))
+#loc32 = loc("tmp0"(#loc8))
+#loc33 = loc("tmp0"(#loc9))
+#loc34 = loc("tmp1"(#loc10))
+#loc35 = loc("tmp1"(#loc11))
+#loc36 = loc("tmp1"(#loc12))
+#loc37 = loc("tmp2"(#loc13))
+#loc38 = loc("tmp2"(#loc14))
+#loc39 = loc("tmp2"(#loc15))
+#loc40 = loc("tmp3"(#loc16))
+#loc41 = loc("tmp4"(#loc17))
diff --git a/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.ttir b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..87847c2d4162a9a076fe808d5eba56a06e2e5cef
--- /dev/null
+++ b/triton/Q5QIKEPJDRH7FHZ6CDBLMD5Y4GTGU6Y7IAWNFLJIJRNGOB7RFV4Q/triton_poi_fused_add_mul_0.ttir
@@ -0,0 +1,73 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":18:0)
+#loc21 = loc("in_ptr0"(#loc))
+#loc22 = loc("in_ptr1"(#loc))
+#loc23 = loc("in_ptr2"(#loc))
+#loc24 = loc("out_ptr0"(#loc))
+#loc25 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_add_mul_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %x0 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc26)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc2)
+    %xoffset = tt.get_program_id x : i32 loc(#loc27)
+    %xoffset_0 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc28)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc29)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<1024xi32> loc(#loc30)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<1024xi32> loc(#loc30)
+    %x0_3 = arith.remsi %xindex_2, %x0 : tensor<1024xi32> loc(#loc26)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc31)
+    %tmp0_4 = tt.addptr %tmp0, %xindex_2 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc31)
+    %tmp0_5 = tt.load %tmp0_4 : tensor<1024x!tt.ptr<bf16>> loc(#loc32)
+    %tmp0_6 = arith.extf %tmp0_5 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc33)
+    %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc34)
+    %tmp1_7 = tt.addptr %tmp1, %x0_3 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc34)
+    %tmp1_8 = tt.load %tmp1_7 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc35)
+    %tmp1_9 = arith.extf %tmp1_8 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc36)
+    %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc37)
+    %tmp2_10 = tt.addptr %tmp2, %xindex_2 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc37)
+    %tmp2_11 = tt.load %tmp2_10 : tensor<1024x!tt.ptr<bf16>> loc(#loc38)
+    %tmp2_12 = arith.extf %tmp2_11 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc39)
+    %tmp3 = arith.mulf %tmp1_9, %tmp2_12 : tensor<1024xf32> loc(#loc40)
+    %tmp4 = arith.addf %tmp0_6, %tmp3 : tensor<1024xf32> loc(#loc41)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc18)
+    %1 = tt.addptr %0, %xindex_2 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc18)
+    %2 = arith.truncf %tmp4 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc19)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>> loc(#loc19)
+    tt.return loc(#loc20)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":24:19)
+#loc2 = loc(unknown)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":20:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":20:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":21:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":21:23)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:30)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:35)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":25:44)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:30)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":26:74)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:30)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:35)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":27:44)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":28:18)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":29:18)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:25)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:36)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6k/c6kat5g7n3uukkfwgdxfwtmxblcmqzhbifo5g3rbaz3djxii35gi.py":30:4)
+#loc26 = loc("x0"(#loc1))
+#loc27 = loc("xoffset"(#loc3))
+#loc28 = loc("xoffset"(#loc4))
+#loc29 = loc("xindex"(#loc5))
+#loc30 = loc("xindex"(#loc6))
+#loc31 = loc("tmp0"(#loc7))
+#loc32 = loc("tmp0"(#loc8))
+#loc33 = loc("tmp0"(#loc9))
+#loc34 = loc("tmp1"(#loc10))
+#loc35 = loc("tmp1"(#loc11))
+#loc36 = loc("tmp1"(#loc12))
+#loc37 = loc("tmp2"(#loc13))
+#loc38 = loc("tmp2"(#loc14))
+#loc39 = loc("tmp2"(#loc15))
+#loc40 = loc("tmp3"(#loc16))
+#loc41 = loc("tmp4"(#loc17))
diff --git a/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/__grp__triton_red_fused__fused_rms_norm_view_1.json b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/__grp__triton_red_fused__fused_rms_norm_view_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..9716ac7f9aad2578a333078f8934c90f7a3dd29b
--- /dev/null
+++ b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/__grp__triton_red_fused__fused_rms_norm_view_1.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused__fused_rms_norm_view_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.source", "triton_red_fused__fused_rms_norm_view_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.ttir", "triton_red_fused__fused_rms_norm_view_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.ttgir", "triton_red_fused__fused_rms_norm_view_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.llir", "triton_red_fused__fused_rms_norm_view_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.ptx", "triton_red_fused__fused_rms_norm_view_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.cubin", "triton_red_fused__fused_rms_norm_view_1.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.json"}}
\ No newline at end of file
diff --git a/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.cubin b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..dede282231445e1180b670527f652c18516baedf
Binary files /dev/null and b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.cubin differ
diff --git a/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.json b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..af94cda9a80836f03ab87953ac57ba68d1683503
--- /dev/null
+++ b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.json
@@ -0,0 +1 @@
+{"hash": "8eb0e13f8d9abdd03dfdb6d2c29622061c8e0466fe03e2d95171b0862bb0ab68", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm_view_1"}
\ No newline at end of file
diff --git a/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.llir b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.llir
new file mode 100644
index 0000000000000000000000000000000000000000..d946385b57a31921e8e0863d5e3fef64e2034104
--- /dev/null
+++ b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.llir
@@ -0,0 +1,120 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused__fused_rms_norm_view_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 {
+  %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %8 = shl i32 %7, 6, !dbg !8
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %10 = and i32 %9, 252, !dbg !9
+  %11 = lshr exact i32 %10, 2, !dbg !9
+  %12 = or disjoint i32 %11, %8, !dbg !10
+  %13 = and i32 %9, 3, !dbg !11
+  %14 = sdiv i32 %12, 32, !dbg !12
+  %15 = mul i32 %14, 32, !dbg !13
+  %.decomposed = sub i32 %12, %15, !dbg !13
+  %16 = shl nsw i32 %.decomposed, 7, !dbg !14
+  %17 = mul i32 %14, 12288, !dbg !15
+  %18 = or disjoint i32 %16, %13
+  %19 = add i32 %18, %17
+  br label %20, !dbg !16
+
+20:                                               ; preds = %6, %20
+  %indvars.iv = phi i64 [ 0, %6 ], [ %indvars.iv.next, %20 ]
+  %21 = phi float [ 0.000000e+00, %6 ], [ %31, %20 ]
+  %22 = trunc nuw nsw i64 %indvars.iv to i32, !dbg !17
+  %23 = add i32 %19, %22, !dbg !17
+  %24 = sext i32 %23 to i64, !dbg !18
+  %25 = getelementptr bfloat, ptr addrspace(1) %0, i64 %24, !dbg !18
+  %26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !19
+  %27 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %25, i64 %26, i1 true) #4, !dbg !19
+  %28 = bitcast i16 %27 to bfloat, !dbg !19
+  %29 = fpext bfloat %28 to float, !dbg !20
+  %30 = fmul float %29, %29, !dbg !21
+  %31 = fadd float %21, %30, !dbg !22
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4, !dbg !16
+  %32 = icmp samesign ult i64 %indvars.iv, 124, !dbg !16
+  br i1 %32, label %20, label %33, !dbg !16
+
+33:                                               ; preds = %20
+  %34 = and i32 %9, 63, !dbg !9
+  %35 = or disjoint i32 %8, %34, !dbg !10
+  %36 = bitcast float %31 to i32, !dbg !23
+  %37 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %36, i32 2, i32 31), !dbg !23
+  %38 = bitcast i32 %37 to float, !dbg !23
+  %39 = fadd float %31, %38, !dbg !28
+  %40 = bitcast float %39 to i32, !dbg !23
+  %41 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %40, i32 1, i32 31), !dbg !23
+  %42 = bitcast i32 %41 to float, !dbg !23
+  %43 = fadd float %39, %42, !dbg !28
+  %44 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %10, !dbg !29
+  store float %43, ptr addrspace(3) %44, align 4, !dbg !29
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !29
+  %45 = shl nuw nsw i32 %34, 2, !dbg !29
+  %46 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %45, !dbg !29
+  %47 = load i32, ptr addrspace(3) %46, align 4, !dbg !29
+  %48 = sext i32 %35 to i64, !dbg !30
+  %49 = getelementptr float, ptr addrspace(1) %1, i64 %48, !dbg !30
+  %50 = and i32 %9, 192, !dbg !31
+  %51 = icmp eq i32 %50, 0, !dbg !31
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %47, ptr addrspace(1) %49, i1 %51) #4, !dbg !31
+  ret void, !dbg !32
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
+
+attributes #0 = { nounwind "nvvm.reqntid"="256" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { convergent nocallback nounwind }
+attributes #4 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm_view_1", linkageName: "triton_red_fused__fused_rms_norm_view_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 23, column: 28, scope: !4)
+!8 = !DILocation(line: 23, column: 33, scope: !4)
+!9 = !DILocation(line: 24, column: 44, scope: !4)
+!10 = !DILocation(line: 24, column: 23, scope: !4)
+!11 = !DILocation(line: 26, column: 37, scope: !4)
+!12 = !DILocation(line: 29, column: 19, scope: !4)
+!13 = !DILocation(line: 28, column: 19, scope: !4)
+!14 = !DILocation(line: 38, column: 45, scope: !4)
+!15 = !DILocation(line: 38, column: 56, scope: !4)
+!16 = !DILocation(line: 32, column: 43, scope: !4)
+!17 = !DILocation(line: 38, column: 50, scope: !4)
+!18 = !DILocation(line: 38, column: 34, scope: !4)
+!19 = !DILocation(line: 38, column: 61, scope: !4)
+!20 = !DILocation(line: 38, column: 115, scope: !4)
+!21 = !DILocation(line: 40, column: 22, scope: !4)
+!22 = !DILocation(line: 42, column: 23, scope: !4)
+!23 = !DILocation(line: 293, column: 36, scope: !24, inlinedAt: !26)
+!24 = distinct !DILexicalBlockFile(scope: !4, file: !25, discriminator: 0)
+!25 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!26 = !DILocation(line: 44, column: 25, scope: !27)
+!27 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0)
+!28 = !DILocation(line: 263, column: 15, scope: !24, inlinedAt: !23)
+!29 = !DILocation(line: 44, column: 28, scope: !4)
+!30 = !DILocation(line: 45, column: 25, scope: !4)
+!31 = !DILocation(line: 45, column: 36, scope: !4)
+!32 = !DILocation(line: 45, column: 4, scope: !4)
diff --git a/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.ptx b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..79cacab3f6a802e4ef9e061c648ccead776423a7
--- /dev/null
+++ b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.ptx
@@ -0,0 +1,486 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused__fused_rms_norm_view_1 // -- Begin function triton_red_fused__fused_rms_norm_view_1
+.extern .shared .align 16 .b8 global_smem[];
+                                        // @triton_red_fused__fused_rms_norm_view_1
+.visible .entry triton_red_fused__fused_rms_norm_view_1(
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_1,
+	.param .u32 triton_red_fused__fused_rms_norm_view_1_param_2,
+	.param .u32 triton_red_fused__fused_rms_norm_view_1_param_3,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_4,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_5
+)
+.reqntid 256
+{
+	.reg .pred 	%p<4>;
+	.reg .b16 	%rs<3>;
+	.reg .b32 	%r<33>;
+	.reg .b64 	%rd<9>;
+	.loc	1 18 0                          // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd3, [triton_red_fused__fused_rms_norm_view_1_param_1];
+	ld.param.b64 	%rd2, [triton_red_fused__fused_rms_norm_view_1_param_0];
+$L__tmp0:
+	.loc	1 23 28                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:23:28
+	mov.u32 	%r4, %ctaid.x;
+	.loc	1 23 33                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:23:33
+	shl.b32 	%r1, %r4, 6;
+	.loc	1 24 44                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:44
+	mov.u32 	%r2, %tid.x;
+	and.b32 	%r3, %r2, 252;
+	bfe.u32 	%r5, %r2, 2, 6;
+	.loc	1 24 23                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:23
+	or.b32 	%r6, %r5, %r1;
+	.loc	1 26 37                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:26:37
+	and.b32 	%r7, %r2, 3;
+	.loc	1 29 19                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:29:19
+	bfe.s32 	%r8, %r4, 25, 1;
+	shr.u32 	%r9, %r8, 27;
+	add.s32 	%r10, %r6, %r9;
+	shr.u32 	%r11, %r10, 5;
+	.loc	1 32 43                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:32:43
+	add.s32 	%r12, %r4, %r11;
+	shl.b32 	%r13, %r12, 13;
+	shl.b32 	%r14, %r5, 7;
+	or.b32 	%r15, %r13, %r14;
+	or.b32 	%r16, %r15, %r7;
+	cvt.u64.u32 	%rd1, %r16;
+	mov.b32 	%r32, 0f00000000;
+	mov.b64 	%rd8, -4;
+$L__BB0_1:                              // =>This Inner Loop Header: Depth=1
+	.loc	1 38 34                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:34
+	add.s64 	%rd6, %rd1, %rd8;
+	cvt.u32.u64 	%r17, %rd6;
+	add.s32 	%r18, %r17, 4;
+	mad.wide.s32 	%rd5, %r18, 2, %rd2;
+	.loc	1 38 61                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:61
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd4, 1.0;
+	// end inline asm
+	mov.b16 	%rs2, 0;
+	mov.pred 	%p1, -1;
+	// begin inline asm
+	mov.u16 %rs1, %rs2;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs1 }, [ %rd5 + 0 ], %rd4;
+	// end inline asm
+	.loc	1 38 115                        // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:115
+	cvt.f32.bf16 	%r19, %rs1;
+	.loc	1 42 23                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:42:23
+	fma.rn.f32 	%r32, %r19, %r19, %r32;
+	.loc	1 32 43                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:32:43
+	add.s64 	%rd8, %rd8, 4;
+	setp.lt.u64 	%p2, %rd8, 124;
+	@%p2 bra 	$L__BB0_1;
+// %bb.2:
+	.loc	1 24 44                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:44
+	and.b32 	%r21, %r2, 63;
+	.loc	1 24 23                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:23
+	or.b32 	%r22, %r1, %r21;
+$L__tmp1:
+	.loc	2 293 36                        // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ]
+	shfl.sync.bfly.b32 	%r23, %r32, 2, 31, -1;
+$L__tmp2:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	add.f32 	%r24, %r32, %r23;
+$L__tmp3:
+	.loc	2 293 36                        // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ]
+	shfl.sync.bfly.b32 	%r25, %r24, 1, 31, -1;
+$L__tmp4:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	add.f32 	%r26, %r24, %r25;
+$L__tmp5:
+	.loc	1 44 28                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:28
+	mov.b32 	%r27, global_smem;
+	add.s32 	%r28, %r27, %r3;
+	st.shared.b32 	[%r28], %r26;
+	bar.sync 	0;
+	shl.b32 	%r29, %r21, 2;
+	add.s32 	%r30, %r27, %r29;
+	ld.shared.b32 	%r20, [%r30];
+	.loc	1 45 25                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:25
+	mad.wide.s32 	%rd7, %r22, 4, %rd3;
+	.loc	1 45 36                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:36
+	and.b32 	%r31, %r2, 192;
+	setp.eq.b32 	%p3, %r31, 0;
+	// begin inline asm
+	@%p3 st.global.b32 [ %rd7 + 0 ], { %r20 };
+	// end inline asm
+	.loc	1 45 4                          // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:4
+	ret;
+$L__tmp6:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 5                                   // DW_FORM_data2
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 339                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x14c DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 113
+.b8 105
+.b8 116
+.b8 120
+.b8 53
+.b8 104
+.b8 119
+.b8 117
+.b8 112
+.b8 107
+.b8 98
+.b8 106
+.b8 109
+.b8 99
+.b8 115
+.b8 111
+.b8 121
+.b8 107
+.b8 113
+.b8 101
+.b8 112
+.b8 122
+.b8 113
+.b8 99
+.b8 55
+.b8 122
+.b8 99
+.b8 120
+.b8 106
+.b8 99
+.b8 98
+.b8 53
+.b8 97
+.b8 99
+.b8 113
+.b8 107
+.b8 105
+.b8 55
+.b8 122
+.b8 99
+.b8 115
+.b8 106
+.b8 105
+.b8 102
+.b8 114
+.b8 110
+.b8 114
+.b8 122
+.b8 99
+.b8 114
+.b8 114
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 113
+.b8 105
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x2a DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 114
+.b8 109
+.b8 115
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 118
+.b8 105
+.b8 101
+.b8 119
+.b8 95
+.b8 49
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x10e:0x48 DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x123:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp5                           // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 44                                  // DW_AT_call_line
+.b8 25                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x13b:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp2                           // DW_AT_low_pc
+.b64 $L__tmp5                           // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.source b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.source
new file mode 100644
index 0000000000000000000000000000000000000000..668c29aaa5214a312af10fa56f2b302a3dedf3b5
--- /dev/null
+++ b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.source
@@ -0,0 +1,167 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0)
+#loc35 = loc(unknown)
+#loc38 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0)
+#loc42 = loc("in_ptr0"(#loc))
+#loc43 = loc("out_ptr0"(#loc))
+#loc44 = loc("xnumel"(#loc))
+#loc45 = loc("r0_numel"(#loc))
+#loc74 = loc("input"(#loc33))
+#loc75 = loc("a"(#loc38))
+#loc76 = loc("b"(#loc38))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 65536 : i32 loc(#loc46)
+    %r0_numel_1 = arith.constant 128 : i32 loc(#loc47)
+    %xoffset = tt.get_program_id x : i32 loc(#loc48)
+    %xoffset_2 = arith.constant 64 : i32 loc(#loc49)
+    %xoffset_3 = arith.constant 64 : i32 loc(#loc49)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc49)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc50)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc51)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc52)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc52)
+    %xmask = arith.constant true loc(#loc53)
+    %xmask_8 = arith.constant dense<true> : tensor<64x4xi1> loc(#loc53)
+    %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc54)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc55)
+    %x0 = arith.constant 32 : i32 loc(#loc56)
+    %x0_10 = arith.constant 32 : i32 loc(#loc56)
+    %x0_11 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc56)
+    %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc56)
+    %x1 = arith.constant 32 : i32 loc(#loc57)
+    %x1_13 = arith.constant 32 : i32 loc(#loc57)
+    %x1_14 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc57)
+    %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc57)
+    %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc58)
+    %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc58)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc14)
+    %c4_i32 = arith.constant 4 : i32 loc(#loc14)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14)
+    %2 = arith.bitcast %c4_i32 : i32 to i32 loc(#loc14)
+    %3 = ub.poison : i32 loc(#loc14)
+    %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_19 = %_tmp4_16) -> (tensor<64x4xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc60)
+      %r0_index_20 = arith.addi %r0_index, %r0_base_9 : tensor<1x4xi32> loc(#loc60)
+      %r0_mask = arith.constant dense<128> : tensor<1x4xi32> loc(#loc61)
+      %r0_mask_21 = arith.cmpi slt, %r0_index_20, %r0_mask : tensor<1x4xi32> loc(#loc61)
+      %tmp0 = arith.constant 128 : i32 loc(#loc62)
+      %tmp0_22 = arith.constant 128 : i32 loc(#loc62)
+      %tmp0_23 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc62)
+      %tmp0_24 = arith.muli %tmp0_23, %x0_12 : tensor<64x1xi32> loc(#loc62)
+      %tmp0_25 = tt.broadcast %r0_index_20 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc63)
+      %tmp0_26 = tt.broadcast %tmp0_24 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc63)
+      %tmp0_27 = arith.addi %tmp0_25, %tmp0_26 : tensor<64x4xi32> loc(#loc63)
+      %tmp0_28 = arith.constant 12288 : i32 loc(#loc64)
+      %tmp0_29 = arith.constant 12288 : i32 loc(#loc64)
+      %tmp0_30 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc64)
+      %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<64x1xi32> loc(#loc64)
+      %tmp0_32 = tt.broadcast %tmp0_31 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc65)
+      %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<64x4xi32> loc(#loc65)
+      %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc66)
+      %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc66)
+      %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc67)
+      %tmp0_37 = tt.broadcast %r0_mask_21 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc67)
+      %tmp0_38 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc67)
+      %tmp0_39 = arith.truncf %tmp0_38 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc67)
+      %tmp0_40 = tt.load %tmp0_35, %tmp0_37, %tmp0_39 evictionPolicy = evict_first : tensor<64x4x!tt.ptr<bf16>> loc(#loc67)
+      %tmp0_41 = arith.extf %tmp0_40 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc68)
+      %tmp2 = arith.mulf %tmp0_41, %tmp0_41 : tensor<64x4xf32> loc(#loc69)
+      %tmp5 = arith.addf %_tmp4_19, %tmp2 : tensor<64x4xf32> loc(#loc70)
+      %_tmp4_42 = tt.broadcast %r0_mask_21 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc71)
+      %_tmp4_43 = arith.select %_tmp4_42, %tmp5, %_tmp4_19 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc71)
+      scf.yield %_tmp4_43 : tensor<64x4xf32> loc(#loc27)
+    } loc(#loc59)
+    %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc72)
+    %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc73)
+    %4 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc30)
+    %5 = tt.addptr %4, %xindex_7 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc30)
+    tt.store %5, %tmp4_18 : tensor<64x1x!tt.ptr<f32>> loc(#loc31)
+    tt.return loc(#loc32)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.sum__fp32S64_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x4xf32> loc("input"(#loc33))) -> tensor<64xf32> attributes {noinline = false} {
+    %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc34)
+      tt.reduce.return %2 : f32 loc(#loc34)
+    }) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc34)
+    tt.return %0 : tensor<64xf32> loc(#loc36)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<64xf32> loc(#loc37)
+    tt.return %1 : tensor<64xf32> loc(#loc37)
+  } loc(#loc33)
+  tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc38)), %b: f32 loc("b"(#loc38))) -> f32 attributes {noinline = false} {
+    %0 = arith.addf %a, %b : f32 loc(#loc39)
+    tt.return %0 : f32 loc(#loc40)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : f32 loc(#loc41)
+    tt.return %1 : f32 loc(#loc41)
+  } loc(#loc38)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":25:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":30:43)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:8)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11)
+#loc37 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4)
+#loc39 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc40 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11)
+#loc41 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4)
+#loc46 = loc("xnumel"(#loc1))
+#loc47 = loc("r0_numel"(#loc2))
+#loc48 = loc("xoffset"(#loc3))
+#loc49 = loc("xoffset"(#loc4))
+#loc50 = loc("xindex"(#loc5))
+#loc51 = loc("xindex"(#loc6))
+#loc52 = loc("xindex"(#loc7))
+#loc53 = loc("xmask"(#loc8))
+#loc54 = loc("r0_base"(#loc9))
+#loc55 = loc("r0_base"(#loc10))
+#loc56 = loc("x0"(#loc11))
+#loc57 = loc("x1"(#loc12))
+#loc58 = loc("_tmp4"(#loc13))
+#loc59 = loc("_tmp4"(#loc14))
+#loc60 = loc("r0_index"(#loc15))
+#loc61 = loc("r0_mask"(#loc16))
+#loc62 = loc("tmp0"(#loc17))
+#loc63 = loc("tmp0"(#loc18))
+#loc64 = loc("tmp0"(#loc19))
+#loc65 = loc("tmp0"(#loc20))
+#loc66 = loc("tmp0"(#loc21))
+#loc67 = loc("tmp0"(#loc22))
+#loc68 = loc("tmp0"(#loc23))
+#loc69 = loc("tmp2"(#loc24))
+#loc70 = loc("tmp5"(#loc25))
+#loc71 = loc("_tmp4"(#loc26))
+#loc72 = loc("tmp4"(#loc28))
+#loc73 = loc("tmp4"(#loc29))
diff --git a/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.ttgir b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..f03f4df9b0f1c9928db9a993cc2d4e1f3aef89eb
--- /dev/null
+++ b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.ttgir
@@ -0,0 +1,121 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 4], order = [0, 1]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0)
+#loc1 = loc(unknown)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25)
+#loc30 = loc("in_ptr0"(#loc))
+#loc31 = loc("out_ptr0"(#loc))
+#loc32 = loc("xnumel"(#loc))
+#loc33 = loc("r0_numel"(#loc))
+#loc54 = loc("tmp4"(#loc24))
+#loc57 = loc(callsite(#loc1 at #loc54))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<12288> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %c4_i32 = arith.constant 4 : i32 loc(#loc1)
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x4xbf16, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<128> : tensor<1x4xi32, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc34)
+    %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc35)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc36)
+    %xindex_6 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc36)
+    %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc36)
+    %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc36)
+    %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked> loc(#loc37)
+    %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc37)
+    %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<64x1xi32, #blocked> loc(#loc37)
+    %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<64x1xi32, #blocked1> loc(#loc37)
+    %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc38)
+    %r0_base_13 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4xi32, #blocked> loc(#loc38)
+    %x0 = arith.remsi %xindex_11, %cst : tensor<64x1xi32, #blocked> loc(#loc39)
+    %x1 = arith.divsi %xindex_11, %cst : tensor<64x1xi32, #blocked> loc(#loc40)
+    %tmp0 = arith.muli %x0, %cst_0 : tensor<64x1xi32, #blocked> loc(#loc41)
+    %tmp0_14 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc42)
+    %tmp0_15 = arith.muli %x1, %cst_1 : tensor<64x1xi32, #blocked> loc(#loc43)
+    %tmp0_16 = tt.broadcast %tmp0_15 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc44)
+    %tmp0_17 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>, #blocked> loc(#loc45)
+    %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32 iter_args(%_tmp4_20 = %cst_4) -> (tensor<64x4xf32, #blocked>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32, #blocked> loc(#loc47)
+      %r0_index_21 = arith.addi %r0_index, %r0_base_13 : tensor<1x4xi32, #blocked> loc(#loc47)
+      %r0_mask = arith.cmpi slt, %r0_index_21, %cst_3 : tensor<1x4xi32, #blocked> loc(#loc48)
+      %tmp0_22 = tt.broadcast %r0_index_21 : tensor<1x4xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc42)
+      %tmp0_23 = arith.addi %tmp0_22, %tmp0_14 : tensor<64x4xi32, #blocked> loc(#loc42)
+      %tmp0_24 = arith.addi %tmp0_23, %tmp0_16 : tensor<64x4xi32, #blocked> loc(#loc44)
+      %tmp0_25 = tt.addptr %tmp0_17, %tmp0_24 : tensor<64x4x!tt.ptr<bf16>, #blocked>, tensor<64x4xi32, #blocked> loc(#loc45)
+      %tmp0_26 = tt.broadcast %r0_mask : tensor<1x4xi1, #blocked> -> tensor<64x4xi1, #blocked> loc(#loc49)
+      %tmp0_27 = tt.load %tmp0_25, %tmp0_26, %cst_2 evictionPolicy = evict_first : tensor<64x4x!tt.ptr<bf16>, #blocked> loc(#loc49)
+      %tmp0_28 = arith.extf %tmp0_27 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc50)
+      %tmp2 = arith.mulf %tmp0_28, %tmp0_28 : tensor<64x4xf32, #blocked> loc(#loc51)
+      %tmp5 = arith.addf %_tmp4_20, %tmp2 : tensor<64x4xf32, #blocked> loc(#loc52)
+      %_tmp4_29 = arith.select %tmp0_26, %tmp5, %_tmp4_20 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> loc(#loc53)
+      scf.yield %_tmp4_29 : tensor<64x4xf32, #blocked> loc(#loc22)
+    } loc(#loc46)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_20: f32 loc(callsite(#loc1 at #loc54)), %tmp4_21: f32 loc(callsite(#loc1 at #loc54))):
+      %tmp4_22 = arith.addf %tmp4_20, %tmp4_21 : f32 loc(#loc58)
+      tt.reduce.return %tmp4_22 : f32 loc(#loc56)
+    }) : (tensor<64x4xf32, #blocked>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc56)
+    %tmp4_18 = ttg.convert_layout %tmp4 : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc55)
+    %tmp4_19 = tt.expand_dims %tmp4_18 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc55)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>, #blocked1> loc(#loc27)
+    %1 = tt.addptr %0, %xindex_12 : tensor<64x1x!tt.ptr<f32>, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc27)
+    tt.store %1, %tmp4_19 : tensor<64x1x!tt.ptr<f32>, #blocked1> loc(#loc28)
+    tt.return loc(#loc29)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:8)
+#loc23 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4)
+#loc34 = loc("xoffset"(#loc2))
+#loc35 = loc("xoffset"(#loc3))
+#loc36 = loc("xindex"(#loc4))
+#loc37 = loc("xindex"(#loc5))
+#loc38 = loc("r0_base"(#loc6))
+#loc39 = loc("x0"(#loc7))
+#loc40 = loc("x1"(#loc8))
+#loc41 = loc("tmp0"(#loc9))
+#loc42 = loc("tmp0"(#loc10))
+#loc43 = loc("tmp0"(#loc11))
+#loc44 = loc("tmp0"(#loc12))
+#loc45 = loc("tmp0"(#loc13))
+#loc46 = loc("_tmp4"(#loc14))
+#loc47 = loc("r0_index"(#loc15))
+#loc48 = loc("r0_mask"(#loc16))
+#loc49 = loc("tmp0"(#loc17))
+#loc50 = loc("tmp0"(#loc18))
+#loc51 = loc("tmp2"(#loc19))
+#loc52 = loc("tmp5"(#loc20))
+#loc53 = loc("_tmp4"(#loc21))
+#loc55 = loc("tmp4"(#loc26))
+#loc56 = loc(callsite(#loc23 at #loc54))
+#loc58 = loc(callsite(#loc25 at #loc56))
diff --git a/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.ttir b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..a17336b6cd6c0e1646710a7e673b9676c2869e0b
--- /dev/null
+++ b/triton/R2YOCP4NTK65APP5W3JMFFRCAYOI4BDG7YB6FWKROGYIMK5QVNUA/triton_red_fused__fused_rms_norm_view_1.ttir
@@ -0,0 +1,118 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0)
+#loc1 = loc(unknown)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25)
+#loc32 = loc("in_ptr0"(#loc))
+#loc33 = loc("out_ptr0"(#loc))
+#loc34 = loc("xnumel"(#loc))
+#loc35 = loc("r0_numel"(#loc))
+#loc58 = loc("tmp4"(#loc26))
+#loc61 = loc(callsite(#loc1 at #loc58))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<64x4xbf16> loc(#loc1)
+    %c4_i32 = arith.constant 4 : i32 loc(#loc2)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc2)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc2)
+    %cst_0 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc1)
+    %cst_1 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1)
+    %cst_2 = arith.constant dense<128> : tensor<1x4xi32> loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc36)
+    %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc37)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc38)
+    %xindex_6 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc39)
+    %xindex_7 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32> loc(#loc40)
+    %xindex_8 = arith.addi %xindex_7, %xindex_6 : tensor<64x1xi32> loc(#loc40)
+    %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc41)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc42)
+    %x0 = arith.remsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc43)
+    %x1 = arith.divsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc44)
+    %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32 iter_args(%_tmp4_11 = %cst_3) -> (tensor<64x4xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc46)
+      %r0_index_12 = arith.addi %r0_index, %r0_base_9 : tensor<1x4xi32> loc(#loc46)
+      %r0_mask = arith.cmpi slt, %r0_index_12, %cst_2 : tensor<1x4xi32> loc(#loc47)
+      %tmp0 = arith.muli %x0, %cst_1 : tensor<64x1xi32> loc(#loc48)
+      %tmp0_13 = tt.broadcast %r0_index_12 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc49)
+      %tmp0_14 = tt.broadcast %tmp0 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc49)
+      %tmp0_15 = arith.addi %tmp0_13, %tmp0_14 : tensor<64x4xi32> loc(#loc49)
+      %tmp0_16 = arith.muli %x1, %cst_0 : tensor<64x1xi32> loc(#loc50)
+      %tmp0_17 = tt.broadcast %tmp0_16 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc51)
+      %tmp0_18 = arith.addi %tmp0_15, %tmp0_17 : tensor<64x4xi32> loc(#loc51)
+      %tmp0_19 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x4x!tt.ptr<bf16>> loc(#loc52)
+      %tmp0_20 = tt.addptr %tmp0_19, %tmp0_18 : tensor<64x4x!tt.ptr<bf16>>, tensor<64x4xi32> loc(#loc52)
+      %tmp0_21 = tt.broadcast %r0_mask : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc53)
+      %tmp0_22 = tt.load %tmp0_20, %tmp0_21, %cst evictionPolicy = evict_first : tensor<64x4x!tt.ptr<bf16>> loc(#loc53)
+      %tmp0_23 = arith.extf %tmp0_22 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc54)
+      %tmp2 = arith.mulf %tmp0_23, %tmp0_23 : tensor<64x4xf32> loc(#loc55)
+      %tmp5 = arith.addf %_tmp4_11, %tmp2 : tensor<64x4xf32> loc(#loc56)
+      %_tmp4_24 = arith.select %tmp0_21, %tmp5, %_tmp4_11 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc57)
+      scf.yield %_tmp4_24 : tensor<64x4xf32> loc(#loc24)
+    } loc(#loc45)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_11: f32 loc(callsite(#loc1 at #loc58)), %tmp4_12: f32 loc(callsite(#loc1 at #loc58))):
+      %tmp4_13 = arith.addf %tmp4_11, %tmp4_12 : f32 loc(#loc62)
+      tt.reduce.return %tmp4_13 : f32 loc(#loc60)
+    }) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc60)
+    %tmp4_10 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc59)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc29)
+    %1 = tt.addptr %0, %xindex_8 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc29)
+    tt.store %1, %tmp4_10 : tensor<64x1x!tt.ptr<f32>> loc(#loc30)
+    tt.return loc(#loc31)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":32:43)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:27)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":33:31)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:8)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc27 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4)
+#loc36 = loc("xoffset"(#loc3))
+#loc37 = loc("xoffset"(#loc4))
+#loc38 = loc("xindex"(#loc5))
+#loc39 = loc("xindex"(#loc6))
+#loc40 = loc("xindex"(#loc7))
+#loc41 = loc("r0_base"(#loc8))
+#loc42 = loc("r0_base"(#loc9))
+#loc43 = loc("x0"(#loc10))
+#loc44 = loc("x1"(#loc11))
+#loc45 = loc("_tmp4"(#loc2))
+#loc46 = loc("r0_index"(#loc12))
+#loc47 = loc("r0_mask"(#loc13))
+#loc48 = loc("tmp0"(#loc14))
+#loc49 = loc("tmp0"(#loc15))
+#loc50 = loc("tmp0"(#loc16))
+#loc51 = loc("tmp0"(#loc17))
+#loc52 = loc("tmp0"(#loc18))
+#loc53 = loc("tmp0"(#loc19))
+#loc54 = loc("tmp0"(#loc20))
+#loc55 = loc("tmp2"(#loc21))
+#loc56 = loc("tmp5"(#loc22))
+#loc57 = loc("_tmp4"(#loc23))
+#loc59 = loc("tmp4"(#loc28))
+#loc60 = loc(callsite(#loc25 at #loc58))
+#loc62 = loc(callsite(#loc27 at #loc60))
diff --git a/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..437a9fa9e3e5b6ec89ecc05c10df6e70ac93c49c
--- /dev/null
+++ b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json"}}
\ No newline at end of file
diff --git a/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..fc08911e17a8343c307df373a327389a91fde45d
Binary files /dev/null and b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin differ
diff --git a/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..a6b79ea1c064c00215e5996dbbb2d9fa83ce132e
--- /dev/null
+++ b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
@@ -0,0 +1 @@
+{"hash": "8a233f1e86193a5516e1f31dfb239201f7e06d42a2e572f891a8dd47d6ac6ee0", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 2048, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0"}
\ No newline at end of file
diff --git a/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..be548701fb837c24d0c458c7bfa86b08fd3b470c
--- /dev/null
+++ b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir
@@ -0,0 +1,891 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 {
+  %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8
+  %13 = shl i32 %12, 2, !dbg !9
+  %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %15 = and i32 %14, 127, !dbg !10
+  %16 = and i32 %14, 96, !dbg !10
+  %17 = lshr exact i32 %16, 5, !dbg !10
+  %18 = and i32 %14, 3, !dbg !10
+  %19 = or disjoint i32 %17, %13, !dbg !11
+  %20 = or disjoint i32 %13, %18, !dbg !11
+  %21 = shl nuw nsw i32 %14, 2, !dbg !12
+  %22 = and i32 %21, 124, !dbg !12
+  %23 = and i32 %14, 124, !dbg !12
+  %24 = lshr i32 %14, 2, !dbg !12
+  %25 = sdiv i32 %19, 32, !dbg !13
+  %26 = mul i32 %25, 32, !dbg !14
+  %.decomposed = sub i32 %19, %26, !dbg !14
+  %27 = sdiv i32 %20, 32, !dbg !13
+  %28 = or disjoint i32 %22, 4096, !dbg !15
+  %29 = shl nsw i32 %.decomposed, 7, !dbg !16
+  %30 = add nsw i32 %28, %29, !dbg !17
+  %31 = mul i32 %25, 36864, !dbg !18
+  %32 = add i32 %30, %31, !dbg !19
+  %33 = sext i32 %32 to i64, !dbg !20
+  %34 = getelementptr bfloat, ptr addrspace(1) %2, i64 %33, !dbg !20
+  %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !21
+  %36 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %34, i64 %35, i1 true) #6, !dbg !21
+  %37 = extractvalue { i32, i32 } %36, 0, !dbg !21
+  %38 = bitcast i32 %37 to <2 x bfloat>, !dbg !21
+  %39 = extractvalue { i32, i32 } %36, 1, !dbg !21
+  %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !21
+  %41 = extractelement <2 x bfloat> %38, i64 0, !dbg !21
+  %42 = extractelement <2 x bfloat> %38, i64 1, !dbg !21
+  %43 = extractelement <2 x bfloat> %40, i64 0, !dbg !21
+  %44 = extractelement <2 x bfloat> %40, i64 1, !dbg !21
+  %45 = fpext bfloat %41 to float, !dbg !22
+  %46 = fpext bfloat %42 to float, !dbg !22
+  %47 = fpext bfloat %43 to float, !dbg !22
+  %48 = fpext bfloat %44 to float, !dbg !22
+  %49 = or disjoint i32 %29, %22, !dbg !23
+  %50 = add i32 %49, %31, !dbg !24
+  %51 = sext i32 %50 to i64, !dbg !25
+  %52 = getelementptr bfloat, ptr addrspace(1) %2, i64 %51, !dbg !25
+  %53 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !26
+  %54 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %52, i64 %53, i1 true) #6, !dbg !26
+  %55 = extractvalue { i32, i32 } %54, 0, !dbg !26
+  %56 = bitcast i32 %55 to <2 x bfloat>, !dbg !26
+  %57 = extractvalue { i32, i32 } %54, 1, !dbg !26
+  %58 = bitcast i32 %57 to <2 x bfloat>, !dbg !26
+  %59 = extractelement <2 x bfloat> %56, i64 0, !dbg !26
+  %60 = extractelement <2 x bfloat> %56, i64 1, !dbg !26
+  %61 = extractelement <2 x bfloat> %58, i64 0, !dbg !26
+  %62 = extractelement <2 x bfloat> %58, i64 1, !dbg !26
+  %63 = fpext bfloat %59 to float, !dbg !27
+  %64 = fpext bfloat %60 to float, !dbg !27
+  %65 = fpext bfloat %61 to float, !dbg !27
+  %66 = fpext bfloat %62 to float, !dbg !27
+  %67 = fmul float %45, %45, !dbg !28
+  %68 = fmul float %46, %46, !dbg !28
+  %69 = fmul float %47, %47, !dbg !28
+  %70 = fmul float %48, %48, !dbg !28
+  %71 = fmul float %63, %63, !dbg !29
+  %72 = fmul float %64, %64, !dbg !29
+  %73 = fmul float %65, %65, !dbg !29
+  %74 = fmul float %66, %66, !dbg !29
+  %75 = fadd float %67, %68, !dbg !30
+  %76 = fadd float %69, %75, !dbg !30
+  %77 = fadd float %70, %76, !dbg !30
+  %78 = bitcast float %77 to i32, !dbg !33
+  %79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 16, i32 31), !dbg !33
+  %80 = bitcast i32 %79 to float, !dbg !33
+  %81 = fadd float %77, %80, !dbg !30
+  %82 = bitcast float %81 to i32, !dbg !33
+  %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 8, i32 31), !dbg !33
+  %84 = bitcast i32 %83 to float, !dbg !33
+  %85 = fadd float %81, %84, !dbg !30
+  %86 = bitcast float %85 to i32, !dbg !33
+  %87 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 4, i32 31), !dbg !33
+  %88 = bitcast i32 %87 to float, !dbg !33
+  %89 = fadd float %85, %88, !dbg !30
+  %90 = bitcast float %89 to i32, !dbg !33
+  %91 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %90, i32 2, i32 31), !dbg !33
+  %92 = bitcast i32 %91 to float, !dbg !33
+  %93 = fadd float %89, %92, !dbg !30
+  %94 = bitcast float %93 to i32, !dbg !33
+  %95 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %94, i32 1, i32 31), !dbg !33
+  %96 = bitcast i32 %95 to float, !dbg !33
+  %97 = fadd float %93, %96, !dbg !30
+  %98 = fadd float %71, %72, !dbg !36
+  %99 = fadd float %73, %98, !dbg !36
+  %100 = fadd float %74, %99, !dbg !36
+  %101 = bitcast float %100 to i32, !dbg !37
+  %102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 16, i32 31), !dbg !37
+  %103 = bitcast i32 %102 to float, !dbg !37
+  %104 = fadd float %100, %103, !dbg !36
+  %105 = bitcast float %104 to i32, !dbg !37
+  %106 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %105, i32 8, i32 31), !dbg !37
+  %107 = bitcast i32 %106 to float, !dbg !37
+  %108 = fadd float %104, %107, !dbg !36
+  %109 = bitcast float %108 to i32, !dbg !37
+  %110 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %109, i32 4, i32 31), !dbg !37
+  %111 = bitcast i32 %110 to float, !dbg !37
+  %112 = fadd float %108, %111, !dbg !36
+  %113 = bitcast float %112 to i32, !dbg !37
+  %114 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %113, i32 2, i32 31), !dbg !37
+  %115 = bitcast i32 %114 to float, !dbg !37
+  %116 = fadd float %112, %115, !dbg !36
+  %117 = bitcast float %116 to i32, !dbg !37
+  %118 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %117, i32 1, i32 31), !dbg !37
+  %119 = bitcast i32 %118 to float, !dbg !37
+  %120 = fadd float %116, %119, !dbg !36
+  %121 = and i32 %24, 1, !dbg !39
+  %122 = zext nneg i32 %15 to i64, !dbg !40
+  %123 = getelementptr bfloat, ptr addrspace(1) %3, i64 %122, !dbg !40
+  %124 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !41
+  %125 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %123, i64 %124, i1 true) #6, !dbg !41
+  %126 = bitcast i16 %125 to bfloat, !dbg !41
+  %127 = fpext bfloat %126 to float, !dbg !42
+  %128 = shl i32 %25, 7, !dbg !43
+  %129 = or disjoint i32 %128, %22, !dbg !44
+  %130 = sext i32 %129 to i64, !dbg !45
+  %131 = getelementptr float, ptr addrspace(1) %4, i64 %130, !dbg !45
+  %132 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !46
+  %133 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %131, i64 %132, i1 true) #6, !dbg !46
+  %134 = extractvalue { i32, i32, i32, i32 } %133, 0, !dbg !46
+  %135 = extractvalue { i32, i32, i32, i32 } %133, 1, !dbg !46
+  %136 = extractvalue { i32, i32, i32, i32 } %133, 2, !dbg !46
+  %137 = extractvalue { i32, i32, i32, i32 } %133, 3, !dbg !46
+  %138 = bitcast i32 %134 to float, !dbg !46
+  %139 = bitcast i32 %135 to float, !dbg !46
+  %140 = bitcast i32 %136 to float, !dbg !46
+  %141 = bitcast i32 %137 to float, !dbg !46
+  %142 = and i32 %14, 7, !dbg !46
+  %143 = shl nuw nsw i32 %142, 4, !dbg !46
+  %144 = shl nuw nsw i32 %16, 2, !dbg !46
+  %145 = lshr i32 %14, 1, !dbg !46
+  %146 = and i32 %145, 12, !dbg !46
+  %147 = or disjoint i32 %143, %144, !dbg !46
+  %148 = or disjoint i32 %146, %16, !dbg !46
+  %149 = xor i32 %147, %148, !dbg !46
+  %150 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %149, !dbg !46
+  %151 = insertelement <1 x i32> poison, i32 %134, i64 0, !dbg !46
+  store <1 x i32> %151, ptr addrspace(3) %150, align 4, !dbg !46
+  %152 = xor i32 %149, 516, !dbg !46
+  %153 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %152, !dbg !46
+  %154 = insertelement <1 x i32> poison, i32 %135, i64 0, !dbg !46
+  store <1 x i32> %154, ptr addrspace(3) %153, align 4, !dbg !46
+  %155 = xor i32 %149, 1032, !dbg !46
+  %156 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %155, !dbg !46
+  %157 = insertelement <1 x i32> poison, i32 %136, i64 0, !dbg !46
+  store <1 x i32> %157, ptr addrspace(3) %156, align 4, !dbg !46
+  %158 = xor i32 %149, 1548, !dbg !46
+  %159 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %158, !dbg !46
+  %160 = insertelement <1 x i32> poison, i32 %137, i64 0, !dbg !46
+  store <1 x i32> %160, ptr addrspace(3) %159, align 4, !dbg !46
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46
+  %161 = shl nuw nsw i32 %14, 7, !dbg !46
+  %162 = and i32 %161, 1920, !dbg !46
+  %163 = shl nuw nsw i32 %18, 5, !dbg !46
+  %164 = xor i32 %163, %23, !dbg !46
+  %165 = or disjoint i32 %164, %162, !dbg !46
+  %166 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %165, !dbg !46
+  %167 = load float, ptr addrspace(3) %166, align 4, !dbg !46
+  %168 = xor i32 %165, 4, !dbg !46
+  %169 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %168, !dbg !46
+  %170 = load float, ptr addrspace(3) %169, align 4, !dbg !46
+  %171 = xor i32 %165, 8, !dbg !46
+  %172 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %171, !dbg !46
+  %173 = load float, ptr addrspace(3) %172, align 4, !dbg !46
+  %174 = xor i32 %165, 12, !dbg !46
+  %175 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %174, !dbg !46
+  %176 = load float, ptr addrspace(3) %175, align 4, !dbg !46
+  %177 = getelementptr float, ptr addrspace(1) %5, i64 %130, !dbg !47
+  %178 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !48
+  %179 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %177, i64 %178, i1 true) #6, !dbg !48
+  %180 = extractvalue { i32, i32, i32, i32 } %179, 0, !dbg !48
+  %181 = extractvalue { i32, i32, i32, i32 } %179, 1, !dbg !48
+  %182 = extractvalue { i32, i32, i32, i32 } %179, 2, !dbg !48
+  %183 = extractvalue { i32, i32, i32, i32 } %179, 3, !dbg !48
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48
+  %184 = insertelement <1 x i32> poison, i32 %180, i64 0, !dbg !48
+  store <1 x i32> %184, ptr addrspace(3) %150, align 4, !dbg !48
+  %185 = insertelement <1 x i32> poison, i32 %181, i64 0, !dbg !48
+  store <1 x i32> %185, ptr addrspace(3) %153, align 4, !dbg !48
+  %186 = insertelement <1 x i32> poison, i32 %182, i64 0, !dbg !48
+  store <1 x i32> %186, ptr addrspace(3) %156, align 4, !dbg !48
+  %187 = insertelement <1 x i32> poison, i32 %183, i64 0, !dbg !48
+  store <1 x i32> %187, ptr addrspace(3) %159, align 4, !dbg !48
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48
+  %188 = load float, ptr addrspace(3) %166, align 4, !dbg !48
+  %189 = load float, ptr addrspace(3) %169, align 4, !dbg !48
+  %190 = load float, ptr addrspace(3) %172, align 4, !dbg !48
+  %191 = load float, ptr addrspace(3) %175, align 4, !dbg !48
+  %192 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !49
+  %193 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %34, i64 %192, i1 true) #6, !dbg !49
+  %194 = getelementptr bfloat, ptr addrspace(1) %6, i64 %122, !dbg !50
+  %195 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !51
+  %196 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %194, i64 %195, i1 true) #6, !dbg !51
+  %197 = icmp eq i32 %121, 0, !dbg !52
+  %198 = and i32 %24, 30, !dbg !53
+  %199 = or disjoint i32 %198, 32, !dbg !53
+  %200 = or disjoint i32 %198, 64, !dbg !53
+  %201 = or disjoint i32 %198, 96, !dbg !53
+  %202 = or disjoint i32 %198, 1, !dbg !54
+  %203 = or disjoint i32 %198, 33, !dbg !54
+  %204 = or disjoint i32 %198, 65, !dbg !54
+  %205 = or disjoint i32 %198, 97, !dbg !54
+  %206 = shl i32 %20, 7, !dbg !55
+  %207 = shl i32 %27, 15, !dbg !55
+  %208 = add i32 %207, %206, !dbg !55
+  %209 = or disjoint i32 %208, %202, !dbg !56
+  %210 = or disjoint i32 %208, %203, !dbg !56
+  %211 = or disjoint i32 %208, %204, !dbg !56
+  %212 = or disjoint i32 %208, %205, !dbg !56
+  %213 = sext i32 %209 to i64, !dbg !57
+  %214 = getelementptr bfloat, ptr addrspace(1) %2, i64 %213, !dbg !57
+  %215 = sext i32 %210 to i64, !dbg !57
+  %216 = getelementptr bfloat, ptr addrspace(1) %2, i64 %215, !dbg !57
+  %217 = sext i32 %211 to i64, !dbg !57
+  %218 = getelementptr bfloat, ptr addrspace(1) %2, i64 %217, !dbg !57
+  %219 = sext i32 %212 to i64, !dbg !57
+  %220 = getelementptr bfloat, ptr addrspace(1) %2, i64 %219, !dbg !57
+  %221 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58
+  %222 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %214, i64 %221, i1 %197) #6, !dbg !58
+  %223 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58
+  %224 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %216, i64 %223, i1 %197) #6, !dbg !58
+  %225 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58
+  %226 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %218, i64 %225, i1 %197) #6, !dbg !58
+  %227 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !58
+  %228 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %220, i64 %227, i1 %197) #6, !dbg !58
+  %229 = tail call float @llvm.nvvm.div.full(float %120, float 1.280000e+02), !dbg !59
+  %230 = fadd float %229, 0x3EB0C6F7A0000000, !dbg !60
+  %231 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61
+  %.not.i = icmp eq i32 %231, 0, !dbg !61
+  br i1 %.not.i, label %234, label %232, !dbg !61
+
+232:                                              ; preds = %11
+  %233 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %230), !dbg !61
+  br label %__nv_rsqrtf.exit, !dbg !61
+
+234:                                              ; preds = %11
+  %235 = tail call float @llvm.nvvm.rsqrt.approx.f(float %230), !dbg !61
+  br label %__nv_rsqrtf.exit, !dbg !61
+
+__nv_rsqrtf.exit:                                 ; preds = %232, %234
+  %.0.i = phi float [ %233, %232 ], [ %235, %234 ], !dbg !61
+  %236 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61
+  %237 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61
+  %238 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61
+  %.not.i7 = icmp eq i32 %238, 0, !dbg !61
+  br i1 %.not.i7, label %241, label %239, !dbg !61
+
+239:                                              ; preds = %__nv_rsqrtf.exit
+  %240 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %230), !dbg !61
+  br label %__nv_rsqrtf.exit9, !dbg !61
+
+241:                                              ; preds = %__nv_rsqrtf.exit
+  %242 = tail call float @llvm.nvvm.rsqrt.approx.f(float %230), !dbg !61
+  br label %__nv_rsqrtf.exit9, !dbg !61
+
+__nv_rsqrtf.exit9:                                ; preds = %239, %241
+  %.0.i8 = phi float [ %240, %239 ], [ %242, %241 ], !dbg !61
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !62
+  %243 = lshr exact i32 %16, 3, !dbg !62
+  %244 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %243, !dbg !62
+  store float %.0.i, ptr addrspace(3) %244, align 4, !dbg !62
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !62
+  %245 = shl nuw nsw i32 %18, 2, !dbg !62
+  %246 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %245, !dbg !62
+  %247 = load float, ptr addrspace(3) %246, align 4, !dbg !62
+  %248 = zext nneg i32 %202 to i64, !dbg !63
+  %249 = getelementptr bfloat, ptr addrspace(1) %3, i64 %248, !dbg !63
+  %250 = zext nneg i32 %203 to i64, !dbg !63
+  %251 = getelementptr bfloat, ptr addrspace(1) %3, i64 %250, !dbg !63
+  %252 = zext nneg i32 %204 to i64, !dbg !63
+  %253 = getelementptr bfloat, ptr addrspace(1) %3, i64 %252, !dbg !63
+  %254 = zext nneg i32 %205 to i64, !dbg !63
+  %255 = getelementptr bfloat, ptr addrspace(1) %3, i64 %254, !dbg !63
+  %256 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !64
+  %257 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %249, i64 %256, i1 %197) #6, !dbg !64
+  %258 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !64
+  %259 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %251, i64 %258, i1 %197) #6, !dbg !64
+  %260 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !64
+  %261 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %253, i64 %260, i1 %197) #6, !dbg !64
+  %262 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !64
+  %263 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %255, i64 %262, i1 %197) #6, !dbg !64
+  %264 = icmp ne i32 %121, 0, !dbg !65
+  %265 = or disjoint i32 %208, %198, !dbg !66
+  %266 = or disjoint i32 %208, %199, !dbg !66
+  %267 = or disjoint i32 %208, %200, !dbg !66
+  %268 = or disjoint i32 %208, %201, !dbg !66
+  %269 = sext i32 %265 to i64, !dbg !67
+  %270 = getelementptr bfloat, ptr addrspace(1) %2, i64 %269, !dbg !67
+  %271 = sext i32 %266 to i64, !dbg !67
+  %272 = getelementptr bfloat, ptr addrspace(1) %2, i64 %271, !dbg !67
+  %273 = sext i32 %267 to i64, !dbg !67
+  %274 = getelementptr bfloat, ptr addrspace(1) %2, i64 %273, !dbg !67
+  %275 = sext i32 %268 to i64, !dbg !67
+  %276 = getelementptr bfloat, ptr addrspace(1) %2, i64 %275, !dbg !67
+  %277 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !68
+  %278 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %270, i64 %277, i1 %264) #6, !dbg !68
+  %279 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !68
+  %280 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %272, i64 %279, i1 %264) #6, !dbg !68
+  %281 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !68
+  %282 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %274, i64 %281, i1 %264) #6, !dbg !68
+  %283 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !68
+  %284 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %276, i64 %283, i1 %264) #6, !dbg !68
+  %285 = zext nneg i32 %198 to i64, !dbg !69
+  %286 = getelementptr bfloat, ptr addrspace(1) %3, i64 %285, !dbg !69
+  %287 = zext nneg i32 %199 to i64, !dbg !69
+  %288 = getelementptr bfloat, ptr addrspace(1) %3, i64 %287, !dbg !69
+  %289 = zext nneg i32 %200 to i64, !dbg !69
+  %290 = getelementptr bfloat, ptr addrspace(1) %3, i64 %289, !dbg !69
+  %291 = zext nneg i32 %201 to i64, !dbg !69
+  %292 = getelementptr bfloat, ptr addrspace(1) %3, i64 %291, !dbg !69
+  %293 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !70
+  %294 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %286, i64 %293, i1 %264) #6, !dbg !70
+  %295 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !70
+  %296 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %288, i64 %295, i1 %264) #6, !dbg !70
+  %297 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !70
+  %298 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %290, i64 %297, i1 %264) #6, !dbg !70
+  %299 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !70
+  %300 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %292, i64 %299, i1 %264) #6, !dbg !70
+  %301 = fmul float %.0.i8, %63, !dbg !71
+  %302 = fmul float %.0.i8, %64, !dbg !71
+  %303 = fmul float %.0.i8, %65, !dbg !71
+  %304 = fmul float %.0.i8, %66, !dbg !71
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !72
+  %305 = shl nuw nsw i32 %18, 7, !dbg !72
+  %306 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %305, !dbg !72
+  %307 = getelementptr inbounds nuw i8, ptr addrspace(3) %306, i32 %164, !dbg !72
+  store float %127, ptr addrspace(3) %307, align 4, !dbg !72
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !72
+  %308 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %22, !dbg !72
+  %309 = load float, ptr addrspace(3) %308, align 4, !dbg !72
+  %310 = xor i32 %22, 160, !dbg !72
+  %311 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %310, !dbg !72
+  %312 = load float, ptr addrspace(3) %311, align 4, !dbg !72
+  %313 = xor i32 %22, 320, !dbg !72
+  %314 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %313, !dbg !72
+  %315 = load float, ptr addrspace(3) %314, align 4, !dbg !72
+  %316 = xor i32 %22, 480, !dbg !72
+  %317 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %316, !dbg !72
+  %318 = load float, ptr addrspace(3) %317, align 4, !dbg !72
+  %319 = fmul float %301, %309, !dbg !72
+  %320 = fmul float %302, %312, !dbg !72
+  %321 = fmul float %303, %315, !dbg !72
+  %322 = fmul float %304, %318, !dbg !72
+  %323 = fmul float %319, %138, !dbg !73
+  %324 = fmul float %320, %139, !dbg !73
+  %325 = fmul float %321, %140, !dbg !73
+  %326 = fmul float %322, %141, !dbg !73
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !73
+  store float %323, ptr addrspace(3) %150, align 4, !dbg !73
+  store float %324, ptr addrspace(3) %153, align 4, !dbg !73
+  store float %325, ptr addrspace(3) %156, align 4, !dbg !73
+  store float %326, ptr addrspace(3) %159, align 4, !dbg !73
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !73
+  %327 = load float, ptr addrspace(3) %166, align 4, !dbg !73
+  %328 = load float, ptr addrspace(3) %169, align 4, !dbg !73
+  %329 = load float, ptr addrspace(3) %172, align 4, !dbg !73
+  %330 = load float, ptr addrspace(3) %175, align 4, !dbg !73
+  %331 = add i32 %208, 4097, !dbg !74
+  %332 = or disjoint i32 %331, %198, !dbg !75
+  %333 = add i32 %208, 4129, !dbg !74
+  %334 = or disjoint i32 %333, %198, !dbg !75
+  %335 = add i32 %208, 4161, !dbg !74
+  %336 = or disjoint i32 %335, %198, !dbg !75
+  %337 = add i32 %208, 4193, !dbg !74
+  %338 = or disjoint i32 %337, %198, !dbg !75
+  %339 = sext i32 %332 to i64, !dbg !76
+  %340 = getelementptr bfloat, ptr addrspace(1) %2, i64 %339, !dbg !76
+  %341 = sext i32 %334 to i64, !dbg !76
+  %342 = getelementptr bfloat, ptr addrspace(1) %2, i64 %341, !dbg !76
+  %343 = sext i32 %336 to i64, !dbg !76
+  %344 = getelementptr bfloat, ptr addrspace(1) %2, i64 %343, !dbg !76
+  %345 = sext i32 %338 to i64, !dbg !76
+  %346 = getelementptr bfloat, ptr addrspace(1) %2, i64 %345, !dbg !76
+  %347 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !77
+  %348 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %340, i64 %347, i1 %197) #6, !dbg !77
+  %349 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !77
+  %350 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %342, i64 %349, i1 %197) #6, !dbg !77
+  %351 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !77
+  %352 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %344, i64 %351, i1 %197) #6, !dbg !77
+  %353 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !77
+  %354 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %346, i64 %353, i1 %197) #6, !dbg !77
+  %355 = tail call float @llvm.nvvm.div.full(float %97, float 1.280000e+02), !dbg !78
+  %356 = fadd float %355, 0x3EB0C6F7A0000000, !dbg !79
+  %357 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !80
+  %.not.i10 = icmp eq i32 %357, 0, !dbg !80
+  br i1 %.not.i10, label %360, label %358, !dbg !80
+
+358:                                              ; preds = %__nv_rsqrtf.exit9
+  %359 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %356), !dbg !80
+  br label %__nv_rsqrtf.exit12, !dbg !80
+
+360:                                              ; preds = %__nv_rsqrtf.exit9
+  %361 = tail call float @llvm.nvvm.rsqrt.approx.f(float %356), !dbg !80
+  br label %__nv_rsqrtf.exit12, !dbg !80
+
+__nv_rsqrtf.exit12:                               ; preds = %358, %360
+  %.0.i11 = phi float [ %359, %358 ], [ %361, %360 ], !dbg !80
+  %362 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !80
+  %363 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !80
+  %364 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !80
+  %.not.i19 = icmp eq i32 %364, 0, !dbg !80
+  br i1 %.not.i19, label %367, label %365, !dbg !80
+
+365:                                              ; preds = %__nv_rsqrtf.exit12
+  %366 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %356), !dbg !80
+  br label %__nv_rsqrtf.exit21, !dbg !80
+
+367:                                              ; preds = %__nv_rsqrtf.exit12
+  %368 = tail call float @llvm.nvvm.rsqrt.approx.f(float %356), !dbg !80
+  br label %__nv_rsqrtf.exit21, !dbg !80
+
+__nv_rsqrtf.exit21:                               ; preds = %365, %367
+  %.0.i20 = phi float [ %366, %365 ], [ %368, %367 ], !dbg !80
+  %369 = bitcast i16 %354 to bfloat, !dbg !77
+  %370 = fpext bfloat %369 to float, !dbg !81
+  %371 = bitcast i16 %352 to bfloat, !dbg !77
+  %372 = fpext bfloat %371 to float, !dbg !81
+  %373 = bitcast i16 %350 to bfloat, !dbg !77
+  %374 = fpext bfloat %373 to float, !dbg !81
+  %375 = bitcast i16 %348 to bfloat, !dbg !77
+  %376 = fpext bfloat %375 to float, !dbg !81
+  %377 = bitcast i16 %228 to bfloat, !dbg !58
+  %378 = fpext bfloat %377 to float, !dbg !82
+  %379 = fmul float %247, %378, !dbg !62
+  %380 = bitcast i16 %263 to bfloat, !dbg !64
+  %381 = fpext bfloat %380 to float, !dbg !83
+  %382 = fmul float %379, %381, !dbg !84
+  %383 = fsub float 0.000000e+00, %382, !dbg !85
+  %384 = bitcast i16 %284 to bfloat, !dbg !68
+  %385 = fpext bfloat %384 to float, !dbg !86
+  %386 = fmul float %247, %385, !dbg !87
+  %387 = bitcast i16 %300 to bfloat, !dbg !70
+  %388 = fpext bfloat %387 to float, !dbg !88
+  %389 = fmul float %386, %388, !dbg !89
+  %390 = select i1 %197, float %383, float %389, !dbg !90
+  %391 = fmul float %191, %390, !dbg !91
+  %392 = fadd float %391, %330, !dbg !92
+  %393 = bitcast i16 %226 to bfloat, !dbg !58
+  %394 = fpext bfloat %393 to float, !dbg !82
+  %395 = fmul float %247, %394, !dbg !62
+  %396 = bitcast i16 %261 to bfloat, !dbg !64
+  %397 = fpext bfloat %396 to float, !dbg !83
+  %398 = fmul float %395, %397, !dbg !84
+  %399 = fsub float 0.000000e+00, %398, !dbg !85
+  %400 = bitcast i16 %282 to bfloat, !dbg !68
+  %401 = fpext bfloat %400 to float, !dbg !86
+  %402 = fmul float %247, %401, !dbg !87
+  %403 = bitcast i16 %298 to bfloat, !dbg !70
+  %404 = fpext bfloat %403 to float, !dbg !88
+  %405 = fmul float %402, %404, !dbg !89
+  %406 = select i1 %197, float %399, float %405, !dbg !90
+  %407 = fmul float %190, %406, !dbg !91
+  %408 = fadd float %407, %329, !dbg !92
+  %409 = bitcast i16 %224 to bfloat, !dbg !58
+  %410 = fpext bfloat %409 to float, !dbg !82
+  %411 = fmul float %247, %410, !dbg !62
+  %412 = bitcast i16 %259 to bfloat, !dbg !64
+  %413 = fpext bfloat %412 to float, !dbg !83
+  %414 = fmul float %411, %413, !dbg !84
+  %415 = fsub float 0.000000e+00, %414, !dbg !85
+  %416 = bitcast i16 %280 to bfloat, !dbg !68
+  %417 = fpext bfloat %416 to float, !dbg !86
+  %418 = fmul float %247, %417, !dbg !87
+  %419 = bitcast i16 %296 to bfloat, !dbg !70
+  %420 = fpext bfloat %419 to float, !dbg !88
+  %421 = fmul float %418, %420, !dbg !89
+  %422 = select i1 %197, float %415, float %421, !dbg !90
+  %423 = fmul float %189, %422, !dbg !91
+  %424 = fadd float %423, %328, !dbg !92
+  %425 = bitcast i16 %222 to bfloat, !dbg !58
+  %426 = fpext bfloat %425 to float, !dbg !82
+  %427 = fmul float %247, %426, !dbg !62
+  %428 = bitcast i16 %257 to bfloat, !dbg !64
+  %429 = fpext bfloat %428 to float, !dbg !83
+  %430 = fmul float %427, %429, !dbg !84
+  %431 = fsub float 0.000000e+00, %430, !dbg !85
+  %432 = bitcast i16 %278 to bfloat, !dbg !68
+  %433 = fpext bfloat %432 to float, !dbg !86
+  %434 = fmul float %247, %433, !dbg !87
+  %435 = bitcast i16 %294 to bfloat, !dbg !70
+  %436 = fpext bfloat %435 to float, !dbg !88
+  %437 = fmul float %434, %436, !dbg !89
+  %438 = select i1 %197, float %431, float %437, !dbg !90
+  %439 = fmul float %188, %438, !dbg !91
+  %440 = fadd float %439, %327, !dbg !92
+  %441 = bitcast i16 %196 to bfloat, !dbg !51
+  %442 = fpext bfloat %441 to float, !dbg !93
+  %443 = extractvalue { i32, i32 } %193, 1, !dbg !49
+  %444 = bitcast i32 %443 to <2 x bfloat>, !dbg !49
+  %445 = extractelement <2 x bfloat> %444, i64 1, !dbg !49
+  %446 = fpext bfloat %445 to float, !dbg !94
+  %447 = extractelement <2 x bfloat> %444, i64 0, !dbg !49
+  %448 = fpext bfloat %447 to float, !dbg !94
+  %449 = extractvalue { i32, i32 } %193, 0, !dbg !49
+  %450 = bitcast i32 %449 to <2 x bfloat>, !dbg !49
+  %451 = extractelement <2 x bfloat> %450, i64 1, !dbg !49
+  %452 = fpext bfloat %451 to float, !dbg !94
+  %453 = extractelement <2 x bfloat> %450, i64 0, !dbg !49
+  %454 = fpext bfloat %453 to float, !dbg !94
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !95
+  store float %.0.i11, ptr addrspace(3) %244, align 4, !dbg !95
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !95
+  %455 = load float, ptr addrspace(3) %246, align 4, !dbg !95
+  %456 = fmul float %455, %376, !dbg !95
+  %457 = fmul float %455, %374, !dbg !95
+  %458 = fmul float %455, %372, !dbg !95
+  %459 = fmul float %455, %370, !dbg !95
+  %460 = getelementptr bfloat, ptr addrspace(1) %6, i64 %248, !dbg !96
+  %461 = getelementptr bfloat, ptr addrspace(1) %6, i64 %250, !dbg !96
+  %462 = getelementptr bfloat, ptr addrspace(1) %6, i64 %252, !dbg !96
+  %463 = getelementptr bfloat, ptr addrspace(1) %6, i64 %254, !dbg !96
+  %464 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !97
+  %465 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %460, i64 %464, i1 %197) #6, !dbg !97
+  %466 = bitcast i16 %465 to bfloat, !dbg !97
+  %467 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !97
+  %468 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %461, i64 %467, i1 %197) #6, !dbg !97
+  %469 = bitcast i16 %468 to bfloat, !dbg !97
+  %470 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !97
+  %471 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %462, i64 %470, i1 %197) #6, !dbg !97
+  %472 = bitcast i16 %471 to bfloat, !dbg !97
+  %473 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !97
+  %474 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %463, i64 %473, i1 %197) #6, !dbg !97
+  %475 = bitcast i16 %474 to bfloat, !dbg !97
+  %476 = fpext bfloat %466 to float, !dbg !98
+  %477 = fpext bfloat %469 to float, !dbg !98
+  %478 = fpext bfloat %472 to float, !dbg !98
+  %479 = fpext bfloat %475 to float, !dbg !98
+  %480 = fmul float %456, %476, !dbg !99
+  %481 = fmul float %457, %477, !dbg !99
+  %482 = fmul float %458, %478, !dbg !99
+  %483 = fmul float %459, %479, !dbg !99
+  %484 = fsub float 0.000000e+00, %480, !dbg !100
+  %485 = fsub float 0.000000e+00, %481, !dbg !100
+  %486 = fsub float 0.000000e+00, %482, !dbg !100
+  %487 = fsub float 0.000000e+00, %483, !dbg !100
+  %488 = add i32 %208, 4096, !dbg !101
+  %489 = or disjoint i32 %488, %198, !dbg !102
+  %490 = add i32 %208, 4128, !dbg !101
+  %491 = or disjoint i32 %490, %198, !dbg !102
+  %492 = add i32 %208, 4160, !dbg !101
+  %493 = or disjoint i32 %492, %198, !dbg !102
+  %494 = add i32 %208, 4192, !dbg !101
+  %495 = or disjoint i32 %494, %198, !dbg !102
+  %496 = sext i32 %489 to i64, !dbg !103
+  %497 = getelementptr bfloat, ptr addrspace(1) %2, i64 %496, !dbg !103
+  %498 = sext i32 %491 to i64, !dbg !103
+  %499 = getelementptr bfloat, ptr addrspace(1) %2, i64 %498, !dbg !103
+  %500 = sext i32 %493 to i64, !dbg !103
+  %501 = getelementptr bfloat, ptr addrspace(1) %2, i64 %500, !dbg !103
+  %502 = sext i32 %495 to i64, !dbg !103
+  %503 = getelementptr bfloat, ptr addrspace(1) %2, i64 %502, !dbg !103
+  %504 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !104
+  %505 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %497, i64 %504, i1 %264) #6, !dbg !104
+  %506 = bitcast i16 %505 to bfloat, !dbg !104
+  %507 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !104
+  %508 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %499, i64 %507, i1 %264) #6, !dbg !104
+  %509 = bitcast i16 %508 to bfloat, !dbg !104
+  %510 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !104
+  %511 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %501, i64 %510, i1 %264) #6, !dbg !104
+  %512 = bitcast i16 %511 to bfloat, !dbg !104
+  %513 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !104
+  %514 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %503, i64 %513, i1 %264) #6, !dbg !104
+  %515 = bitcast i16 %514 to bfloat, !dbg !104
+  %516 = fpext bfloat %506 to float, !dbg !105
+  %517 = fpext bfloat %509 to float, !dbg !105
+  %518 = fpext bfloat %512 to float, !dbg !105
+  %519 = fpext bfloat %515 to float, !dbg !105
+  %520 = fmul float %455, %516, !dbg !106
+  %521 = fmul float %455, %517, !dbg !106
+  %522 = fmul float %455, %518, !dbg !106
+  %523 = fmul float %455, %519, !dbg !106
+  %524 = getelementptr bfloat, ptr addrspace(1) %6, i64 %285, !dbg !107
+  %525 = getelementptr bfloat, ptr addrspace(1) %6, i64 %287, !dbg !107
+  %526 = getelementptr bfloat, ptr addrspace(1) %6, i64 %289, !dbg !107
+  %527 = getelementptr bfloat, ptr addrspace(1) %6, i64 %291, !dbg !107
+  %528 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !108
+  %529 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %524, i64 %528, i1 %264) #6, !dbg !108
+  %530 = bitcast i16 %529 to bfloat, !dbg !108
+  %531 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !108
+  %532 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %525, i64 %531, i1 %264) #6, !dbg !108
+  %533 = bitcast i16 %532 to bfloat, !dbg !108
+  %534 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !108
+  %535 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %526, i64 %534, i1 %264) #6, !dbg !108
+  %536 = bitcast i16 %535 to bfloat, !dbg !108
+  %537 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !108
+  %538 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %527, i64 %537, i1 %264) #6, !dbg !108
+  %539 = bitcast i16 %538 to bfloat, !dbg !108
+  %540 = fpext bfloat %530 to float, !dbg !109
+  %541 = fpext bfloat %533 to float, !dbg !109
+  %542 = fpext bfloat %536 to float, !dbg !109
+  %543 = fpext bfloat %539 to float, !dbg !109
+  %544 = fmul float %520, %540, !dbg !110
+  %545 = fmul float %521, %541, !dbg !110
+  %546 = fmul float %522, %542, !dbg !110
+  %547 = fmul float %523, %543, !dbg !110
+  %548 = select i1 %197, float %484, float %544, !dbg !90
+  %549 = select i1 %197, float %485, float %545, !dbg !90
+  %550 = select i1 %197, float %486, float %546, !dbg !90
+  %551 = select i1 %197, float %487, float %547, !dbg !90
+  %552 = fmul float %.0.i20, %454, !dbg !111
+  %553 = fmul float %.0.i20, %452, !dbg !111
+  %554 = fmul float %.0.i20, %448, !dbg !111
+  %555 = fmul float %.0.i20, %446, !dbg !111
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !111
+  store float %552, ptr addrspace(3) %150, align 4, !dbg !111
+  store float %553, ptr addrspace(3) %153, align 4, !dbg !111
+  store float %554, ptr addrspace(3) %156, align 4, !dbg !111
+  store float %555, ptr addrspace(3) %159, align 4, !dbg !111
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !111
+  %556 = load float, ptr addrspace(3) %166, align 4, !dbg !111
+  %557 = load float, ptr addrspace(3) %169, align 4, !dbg !111
+  %558 = load float, ptr addrspace(3) %172, align 4, !dbg !111
+  %559 = load float, ptr addrspace(3) %175, align 4, !dbg !111
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !112
+  %560 = shl nuw nsw i32 %15, 2, !dbg !112
+  %561 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %560, !dbg !112
+  store float %442, ptr addrspace(3) %561, align 4, !dbg !112
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !112
+  %562 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %23, !dbg !112
+  %563 = load float, ptr addrspace(3) %562, align 4, !dbg !112
+  %564 = getelementptr inbounds nuw i8, ptr addrspace(3) %562, i32 128, !dbg !112
+  %565 = load float, ptr addrspace(3) %564, align 4, !dbg !112
+  %566 = getelementptr inbounds nuw i8, ptr addrspace(3) %562, i32 256, !dbg !112
+  %567 = load float, ptr addrspace(3) %566, align 4, !dbg !112
+  %568 = getelementptr inbounds nuw i8, ptr addrspace(3) %562, i32 384, !dbg !112
+  %569 = load float, ptr addrspace(3) %568, align 4, !dbg !112
+  %570 = fmul float %556, %563, !dbg !113
+  %571 = fmul float %557, %565, !dbg !113
+  %572 = fmul float %558, %567, !dbg !113
+  %573 = fmul float %559, %569, !dbg !113
+  %574 = fmul float %167, %570, !dbg !112
+  %575 = fmul float %170, %571, !dbg !112
+  %576 = fmul float %173, %572, !dbg !112
+  %577 = fmul float %176, %573, !dbg !112
+  %578 = fmul float %188, %548, !dbg !114
+  %579 = fmul float %189, %549, !dbg !114
+  %580 = fmul float %190, %550, !dbg !114
+  %581 = fmul float %191, %551, !dbg !114
+  %582 = fadd float %578, %574, !dbg !115
+  %583 = fadd float %579, %575, !dbg !115
+  %584 = fadd float %580, %576, !dbg !115
+  %585 = fadd float %581, %577, !dbg !115
+  %586 = shl i32 %19, 7, !dbg !116
+  %587 = or disjoint i32 %586, %22, !dbg !117
+  %588 = sext i32 %587 to i64, !dbg !118
+  %589 = getelementptr bfloat, ptr addrspace(1) %0, i64 %588, !dbg !118
+  %590 = fptrunc float %440 to bfloat, !dbg !119
+  %591 = fptrunc float %424 to bfloat, !dbg !119
+  %592 = fptrunc float %408 to bfloat, !dbg !119
+  %593 = fptrunc float %392 to bfloat, !dbg !119
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119
+  %594 = shl nuw nsw i32 %142, 7, !dbg !119
+  %595 = lshr i32 %14, 4, !dbg !119
+  %596 = and i32 %595, 2, !dbg !119
+  %597 = and i32 %24, 16, !dbg !119
+  %598 = or disjoint i32 %594, %596, !dbg !119
+  %599 = or disjoint i32 %143, %146, !dbg !119
+  %600 = xor i32 %599, %597, !dbg !119
+  %601 = or disjoint i32 %600, %598, !dbg !119
+  %602 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %601, !dbg !119
+  store bfloat %590, ptr addrspace(3) %602, align 2, !dbg !119
+  %603 = xor i32 %601, 32, !dbg !119
+  %604 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %603, !dbg !119
+  store bfloat %591, ptr addrspace(3) %604, align 2, !dbg !119
+  %605 = xor i32 %601, 64, !dbg !119
+  %606 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %605, !dbg !119
+  store bfloat %592, ptr addrspace(3) %606, align 2, !dbg !119
+  %607 = xor i32 %601, 96, !dbg !119
+  %608 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %607, !dbg !119
+  store bfloat %593, ptr addrspace(3) %608, align 2, !dbg !119
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119
+  %609 = shl nuw nsw i32 %23, 2, !dbg !119
+  %610 = lshr exact i32 %16, 1, !dbg !119
+  %611 = shl nuw nsw i32 %14, 3, !dbg !119
+  %612 = and i32 %611, 8, !dbg !119
+  %613 = and i32 %14, 2, !dbg !119
+  %614 = or disjoint i32 %612, %613, !dbg !119
+  %615 = xor i32 %609, %610, !dbg !119
+  %616 = or disjoint i32 %614, %615, !dbg !119
+  %617 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %616, !dbg !119
+  %618 = load bfloat, ptr addrspace(3) %617, align 2, !dbg !119
+  %619 = getelementptr inbounds nuw i8, ptr addrspace(3) %617, i32 4, !dbg !119
+  %620 = load bfloat, ptr addrspace(3) %619, align 2, !dbg !119
+  %621 = xor i32 %616, 576, !dbg !119
+  %622 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %621, !dbg !119
+  %623 = load bfloat, ptr addrspace(3) %622, align 2, !dbg !119
+  %624 = getelementptr inbounds nuw i8, ptr addrspace(3) %622, i32 4, !dbg !119
+  %625 = load bfloat, ptr addrspace(3) %624, align 2, !dbg !119
+  %626 = insertelement <2 x bfloat> poison, bfloat %618, i64 0, !dbg !119
+  %627 = insertelement <2 x bfloat> %626, bfloat %623, i64 1, !dbg !119
+  %628 = bitcast <2 x bfloat> %627 to i32, !dbg !119
+  %629 = insertelement <2 x bfloat> poison, bfloat %620, i64 0, !dbg !119
+  %630 = insertelement <2 x bfloat> %629, bfloat %625, i64 1, !dbg !119
+  %631 = bitcast <2 x bfloat> %630 to i32, !dbg !119
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %628, i32 %631, ptr addrspace(1) %589, i1 true) #6, !dbg !119
+  %632 = getelementptr bfloat, ptr addrspace(1) %1, i64 %588, !dbg !120
+  %633 = fptrunc float %582 to bfloat, !dbg !121
+  %634 = fptrunc float %583 to bfloat, !dbg !121
+  %635 = fptrunc float %584 to bfloat, !dbg !121
+  %636 = fptrunc float %585 to bfloat, !dbg !121
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !121
+  store bfloat %633, ptr addrspace(3) %602, align 2, !dbg !121
+  store bfloat %634, ptr addrspace(3) %604, align 2, !dbg !121
+  store bfloat %635, ptr addrspace(3) %606, align 2, !dbg !121
+  store bfloat %636, ptr addrspace(3) %608, align 2, !dbg !121
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !121
+  %637 = load bfloat, ptr addrspace(3) %617, align 2, !dbg !121
+  %638 = load bfloat, ptr addrspace(3) %619, align 2, !dbg !121
+  %639 = load bfloat, ptr addrspace(3) %622, align 2, !dbg !121
+  %640 = load bfloat, ptr addrspace(3) %624, align 2, !dbg !121
+  %641 = insertelement <2 x bfloat> poison, bfloat %637, i64 0, !dbg !121
+  %642 = insertelement <2 x bfloat> %641, bfloat %639, i64 1, !dbg !121
+  %643 = bitcast <2 x bfloat> %642 to i32, !dbg !121
+  %644 = insertelement <2 x bfloat> poison, bfloat %638, i64 0, !dbg !121
+  %645 = insertelement <2 x bfloat> %644, bfloat %640, i64 1, !dbg !121
+  %646 = bitcast <2 x bfloat> %645 to i32, !dbg !121
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %643, i32 %646, ptr addrspace(1) %632, i1 true) #6, !dbg !121
+  ret void, !dbg !122
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #4
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #4
+
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { convergent nocallback nounwind }
+attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!5 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", linkageName: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 23, column: 28, scope: !5)
+!9 = !DILocation(line: 23, column: 33, scope: !5)
+!10 = !DILocation(line: 24, column: 44, scope: !5)
+!11 = !DILocation(line: 24, column: 23, scope: !5)
+!12 = !DILocation(line: 26, column: 37, scope: !5)
+!13 = !DILocation(line: 29, column: 19, scope: !5)
+!14 = !DILocation(line: 28, column: 19, scope: !5)
+!15 = !DILocation(line: 39, column: 41, scope: !5)
+!16 = !DILocation(line: 39, column: 52, scope: !5)
+!17 = !DILocation(line: 39, column: 48, scope: !5)
+!18 = !DILocation(line: 39, column: 63, scope: !5)
+!19 = !DILocation(line: 39, column: 57, scope: !5)
+!20 = !DILocation(line: 39, column: 34, scope: !5)
+!21 = !DILocation(line: 39, column: 68, scope: !5)
+!22 = !DILocation(line: 39, column: 121, scope: !5)
+!23 = !DILocation(line: 40, column: 41, scope: !5)
+!24 = !DILocation(line: 40, column: 50, scope: !5)
+!25 = !DILocation(line: 40, column: 34, scope: !5)
+!26 = !DILocation(line: 40, column: 61, scope: !5)
+!27 = !DILocation(line: 40, column: 114, scope: !5)
+!28 = !DILocation(line: 42, column: 22, scope: !5)
+!29 = !DILocation(line: 47, column: 22, scope: !5)
+!30 = !DILocation(line: 263, column: 15, scope: !31, inlinedAt: !33)
+!31 = distinct !DILexicalBlockFile(scope: !5, file: !32, discriminator: 0)
+!32 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!33 = !DILocation(line: 293, column: 36, scope: !31, inlinedAt: !34)
+!34 = !DILocation(line: 51, column: 25, scope: !35)
+!35 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0)
+!36 = !DILocation(line: 263, column: 15, scope: !31, inlinedAt: !37)
+!37 = !DILocation(line: 293, column: 36, scope: !31, inlinedAt: !38)
+!38 = !DILocation(line: 52, column: 27, scope: !35)
+!39 = !DILocation(line: 58, column: 27, scope: !5)
+!40 = !DILocation(line: 62, column: 35, scope: !5)
+!41 = !DILocation(line: 62, column: 42, scope: !5)
+!42 = !DILocation(line: 62, column: 95, scope: !5)
+!43 = !DILocation(line: 63, column: 46, scope: !5)
+!44 = !DILocation(line: 63, column: 42, scope: !5)
+!45 = !DILocation(line: 63, column: 35, scope: !5)
+!46 = !DILocation(line: 63, column: 51, scope: !5)
+!47 = !DILocation(line: 64, column: 35, scope: !5)
+!48 = !DILocation(line: 64, column: 51, scope: !5)
+!49 = !DILocation(line: 65, column: 69, scope: !5)
+!50 = !DILocation(line: 66, column: 36, scope: !5)
+!51 = !DILocation(line: 66, column: 43, scope: !5)
+!52 = !DILocation(line: 71, column: 24, scope: !5)
+!53 = !DILocation(line: 72, column: 41, scope: !5)
+!54 = !DILocation(line: 72, column: 39, scope: !5)
+!55 = !DILocation(line: 72, column: 48, scope: !5)
+!56 = !DILocation(line: 72, column: 57, scope: !5)
+!57 = !DILocation(line: 72, column: 35, scope: !5)
+!58 = !DILocation(line: 72, column: 68, scope: !5)
+!59 = !DILocation(line: 75, column: 25, scope: !5)
+!60 = !DILocation(line: 77, column: 24, scope: !5)
+!61 = !DILocation(line: 78, column: 32, scope: !5)
+!62 = !DILocation(line: 79, column: 24, scope: !5)
+!63 = !DILocation(line: 80, column: 35, scope: !5)
+!64 = !DILocation(line: 80, column: 85, scope: !5)
+!65 = !DILocation(line: 87, column: 25, scope: !5)
+!66 = !DILocation(line: 90, column: 53, scope: !5)
+!67 = !DILocation(line: 90, column: 35, scope: !5)
+!68 = !DILocation(line: 90, column: 64, scope: !5)
+!69 = !DILocation(line: 98, column: 35, scope: !5)
+!70 = !DILocation(line: 98, column: 81, scope: !5)
+!71 = !DILocation(line: 111, column: 24, scope: !5)
+!72 = !DILocation(line: 113, column: 24, scope: !5)
+!73 = !DILocation(line: 116, column: 24, scope: !5)
+!74 = !DILocation(line: 121, column: 51, scope: !5)
+!75 = !DILocation(line: 121, column: 60, scope: !5)
+!76 = !DILocation(line: 121, column: 35, scope: !5)
+!77 = !DILocation(line: 121, column: 71, scope: !5)
+!78 = !DILocation(line: 123, column: 24, scope: !5)
+!79 = !DILocation(line: 124, column: 24, scope: !5)
+!80 = !DILocation(line: 125, column: 32, scope: !5)
+!81 = !DILocation(line: 121, column: 132, scope: !5)
+!82 = !DILocation(line: 72, column: 129, scope: !5)
+!83 = !DILocation(line: 80, column: 146, scope: !5)
+!84 = !DILocation(line: 82, column: 24, scope: !5)
+!85 = !DILocation(line: 84, column: 17, scope: !5)
+!86 = !DILocation(line: 90, column: 125, scope: !5)
+!87 = !DILocation(line: 97, column: 24, scope: !5)
+!88 = !DILocation(line: 98, column: 142, scope: !5)
+!89 = !DILocation(line: 100, column: 24, scope: !5)
+!90 = !DILocation(line: 0, scope: !5)
+!91 = !DILocation(line: 118, column: 24, scope: !5)
+!92 = !DILocation(line: 119, column: 24, scope: !5)
+!93 = !DILocation(line: 66, column: 96, scope: !5)
+!94 = !DILocation(line: 65, column: 123, scope: !5)
+!95 = !DILocation(line: 126, column: 24, scope: !5)
+!96 = !DILocation(line: 127, column: 35, scope: !5)
+!97 = !DILocation(line: 127, column: 85, scope: !5)
+!98 = !DILocation(line: 127, column: 146, scope: !5)
+!99 = !DILocation(line: 129, column: 24, scope: !5)
+!100 = !DILocation(line: 131, column: 17, scope: !5)
+!101 = !DILocation(line: 134, column: 51, scope: !5)
+!102 = !DILocation(line: 134, column: 60, scope: !5)
+!103 = !DILocation(line: 134, column: 35, scope: !5)
+!104 = !DILocation(line: 134, column: 71, scope: !5)
+!105 = !DILocation(line: 134, column: 132, scope: !5)
+!106 = !DILocation(line: 139, column: 24, scope: !5)
+!107 = !DILocation(line: 140, column: 35, scope: !5)
+!108 = !DILocation(line: 140, column: 81, scope: !5)
+!109 = !DILocation(line: 140, column: 142, scope: !5)
+!110 = !DILocation(line: 142, column: 24, scope: !5)
+!111 = !DILocation(line: 151, column: 25, scope: !5)
+!112 = !DILocation(line: 156, column: 26, scope: !5)
+!113 = !DILocation(line: 153, column: 26, scope: !5)
+!114 = !DILocation(line: 158, column: 26, scope: !5)
+!115 = !DILocation(line: 159, column: 26, scope: !5)
+!116 = !DILocation(line: 161, column: 43, scope: !5)
+!117 = !DILocation(line: 161, column: 39, scope: !5)
+!118 = !DILocation(line: 161, column: 32, scope: !5)
+!119 = !DILocation(line: 161, column: 55, scope: !5)
+!120 = !DILocation(line: 162, column: 32, scope: !5)
+!121 = !DILocation(line: 162, column: 56, scope: !5)
+!122 = !DILocation(line: 53, column: 4, scope: !5)
diff --git a/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..16e9f5aa3056e5a206c9254072fb8f955811fff4
--- /dev/null
+++ b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx
@@ -0,0 +1,1422 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 // -- Begin function triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0
+.extern .shared .align 16 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
+                                        // @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0
+.visible .entry triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6,
+	.param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_7,
+	.param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_8,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_9,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_10
+)
+.reqntid 128
+{
+	.reg .pred 	%p<4>;
+	.reg .b16 	%rs<64>;
+	.reg .b32 	%r<320>;
+	.reg .b64 	%rd<96>;
+	.loc	1 18 0                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0
+
+// %bb.0:                               // %__nv_rsqrtf.exit
+	ld.param.b64 	%rd80, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0];
+	ld.param.b64 	%rd81, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1];
+$L__tmp0:
+	.loc	1 23 28                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:28
+	mov.u32 	%r20, %ctaid.x;
+	.loc	1 23 33                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:33
+	shl.b32 	%r21, %r20, 2;
+	ld.param.b64 	%rd82, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2];
+	ld.param.b64 	%rd83, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3];
+	.loc	1 24 44                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44
+	mov.u32 	%r22, %tid.x;
+	and.b32 	%r23, %r22, 127;
+	ld.param.b64 	%rd84, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4];
+	and.b32 	%r24, %r22, 96;
+	ld.param.b64 	%rd85, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5];
+	bfe.u32 	%r25, %r22, 5, 2;
+	ld.param.b64 	%rd86, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6];
+	and.b32 	%r26, %r22, 3;
+	.loc	1 24 23                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23
+	or.b32 	%r27, %r25, %r21;
+	or.b32 	%r28, %r21, %r26;
+	.loc	1 26 37                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37
+	shl.b32 	%r29, %r22, 2;
+	and.b32 	%r30, %r29, 124;
+	and.b32 	%r31, %r22, 124;
+	shr.u32 	%r32, %r22, 2;
+	.loc	1 29 19                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19
+	bfe.s32 	%r33, %r20, 29, 1;
+	shr.u32 	%r34, %r33, 27;
+	add.s32 	%r35, %r27, %r34;
+	shr.s32 	%r36, %r35, 5;
+	.loc	1 28 19                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:28:19
+	and.b32 	%r37, %r35, 33554400;
+	sub.s32 	%r38, %r27, %r37;
+	.loc	1 29 19                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19
+	add.s32 	%r39, %r28, %r34;
+	.loc	1 39 52                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:52
+	shl.b32 	%r40, %r38, 7;
+	.loc	1 39 48                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:48
+	or.b32 	%r41, %r40, %r30;
+	mad.lo.s32 	%r42, %r36, 36864, %r41;
+	.loc	1 39 57                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:57
+	add.s32 	%r43, %r42, 4096;
+	.loc	1 39 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34
+	mad.wide.s32 	%rd1, %r43, 2, %rd82;
+	.loc	1 39 68                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r3, 0;
+	mov.pred 	%p1, -1;
+	// begin inline asm
+	mov.u32 %r1, %r3;
+	mov.u32 %r2, %r3;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	mov.b32 	{%rs36, %rs37}, %r1;
+	mov.b32 	{%rs38, %rs39}, %r2;
+	.loc	1 39 121                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:121
+	cvt.f32.bf16 	%r44, %rs36;
+	cvt.f32.bf16 	%r45, %rs37;
+	cvt.f32.bf16 	%r46, %rs38;
+	cvt.f32.bf16 	%r47, %rs39;
+	.loc	1 40 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34
+	mad.wide.s32 	%rd3, %r42, 2, %rd82;
+	.loc	1 40 61                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r4, %r3;
+	mov.u32 %r5, %r3;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r4, %r5 }, [ %rd3 + 0 ], %rd4;
+	// end inline asm
+	mov.b32 	{%rs40, %rs41}, %r4;
+	mov.b32 	{%rs42, %rs43}, %r5;
+	.loc	1 40 114                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114
+	cvt.f32.bf16 	%r48, %rs40;
+	cvt.f32.bf16 	%r49, %rs41;
+	cvt.f32.bf16 	%r50, %rs42;
+	cvt.f32.bf16 	%r51, %rs43;
+	.loc	1 42 22                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:42:22
+	mul.f32 	%r52, %r45, %r45;
+	.loc	1 47 22                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:47:22
+	mul.f32 	%r53, %r49, %r49;
+$L__tmp1:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	fma.rn.f32 	%r54, %r44, %r44, %r52;
+	fma.rn.f32 	%r55, %r46, %r46, %r54;
+	fma.rn.f32 	%r56, %r47, %r47, %r55;
+$L__tmp2:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r57, %r56, 16, 31, -1;
+$L__tmp3:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r58, %r56, %r57;
+$L__tmp4:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r59, %r58, 8, 31, -1;
+$L__tmp5:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r60, %r58, %r59;
+$L__tmp6:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r61, %r60, 4, 31, -1;
+$L__tmp7:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r62, %r60, %r61;
+$L__tmp8:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r63, %r62, 2, 31, -1;
+$L__tmp9:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r64, %r62, %r63;
+$L__tmp10:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r65, %r64, 1, 31, -1;
+$L__tmp11:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r66, %r64, %r65;
+$L__tmp12:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	fma.rn.f32 	%r67, %r48, %r48, %r53;
+	fma.rn.f32 	%r68, %r50, %r50, %r67;
+	fma.rn.f32 	%r69, %r51, %r51, %r68;
+$L__tmp13:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r70, %r69, 16, 31, -1;
+$L__tmp14:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r71, %r69, %r70;
+$L__tmp15:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r72, %r71, 8, 31, -1;
+$L__tmp16:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r73, %r71, %r72;
+$L__tmp17:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r74, %r73, 4, 31, -1;
+$L__tmp18:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r75, %r73, %r74;
+$L__tmp19:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r76, %r75, 2, 31, -1;
+$L__tmp20:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r77, %r75, %r76;
+$L__tmp21:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r78, %r77, 1, 31, -1;
+$L__tmp22:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r79, %r77, %r78;
+$L__tmp23:
+	.loc	1 62 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:35
+	mul.wide.u32 	%rd87, %r23, 2;
+	add.s64 	%rd5, %rd83, %rd87;
+	.loc	1 62 42                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:42
+	// begin inline asm
+	mov.u64 %rd6, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0;
+	// end inline asm
+	mov.b16 	%rs2, 0;
+	// begin inline asm
+	mov.u16 %rs1, %rs2;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs1 }, [ %rd5 + 0 ], %rd6;
+	// end inline asm
+	.loc	1 62 95                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:95
+	cvt.f32.bf16 	%r80, %rs1;
+	.loc	1 63 46                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:46
+	shl.b32 	%r81, %r36, 7;
+	.loc	1 63 42                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:42
+	or.b32 	%r82, %r81, %r30;
+	.loc	1 63 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:35
+	mul.wide.s32 	%rd88, %r82, 4;
+	add.s64 	%rd7, %rd84, %rd88;
+	.loc	1 63 51                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:51
+	// begin inline asm
+	mov.u64 %rd8, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd8, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r6, %r3;
+	mov.u32 %r7, %r3;
+	mov.u32 %r8, %r3;
+	mov.u32 %r9, %r3;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd7 + 0 ], %rd8;
+	// end inline asm
+	and.b32 	%r83, %r22, 7;
+	shl.b32 	%r84, %r83, 4;
+	shl.b32 	%r85, %r24, 2;
+	shr.u32 	%r86, %r22, 1;
+	and.b32 	%r87, %r86, 12;
+	or.b32 	%r88, %r84, %r85;
+	or.b32 	%r89, %r87, %r24;
+	xor.b32 	%r90, %r88, %r89;
+	mov.b32 	%r91, global_smem;
+	add.s32 	%r92, %r91, %r90;
+	st.shared.b32 	[%r92], %r6;
+	xor.b32 	%r93, %r90, 4;
+	add.s32 	%r94, %r91, %r93;
+	st.shared.b32 	[%r94+512], %r7;
+	xor.b32 	%r95, %r90, 8;
+	add.s32 	%r96, %r91, %r95;
+	st.shared.b32 	[%r96+1024], %r8;
+	xor.b32 	%r97, %r90, 12;
+	add.s32 	%r98, %r91, %r97;
+	st.shared.b32 	[%r98+1536], %r9;
+	bar.sync 	0;
+	shl.b32 	%r99, %r22, 7;
+	and.b32 	%r100, %r99, 1920;
+	shl.b32 	%r101, %r26, 5;
+	xor.b32 	%r102, %r101, %r31;
+	or.b32 	%r103, %r102, %r100;
+	add.s32 	%r104, %r91, %r103;
+	ld.shared.b32 	%r105, [%r104];
+	xor.b32 	%r106, %r103, 4;
+	add.s32 	%r107, %r91, %r106;
+	ld.shared.b32 	%r108, [%r107];
+	xor.b32 	%r109, %r103, 8;
+	add.s32 	%r110, %r91, %r109;
+	ld.shared.b32 	%r111, [%r110];
+	xor.b32 	%r112, %r103, 12;
+	add.s32 	%r113, %r91, %r112;
+	ld.shared.b32 	%r114, [%r113];
+	.loc	1 64 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:35
+	add.s64 	%rd9, %rd85, %rd88;
+	.loc	1 64 51                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:51
+	// begin inline asm
+	mov.u64 %rd10, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd10, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r10, %r3;
+	mov.u32 %r11, %r3;
+	mov.u32 %r12, %r3;
+	mov.u32 %r13, %r3;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd9 + 0 ], %rd10;
+	// end inline asm
+	bar.sync 	0;
+	st.shared.b32 	[%r92], %r10;
+	st.shared.b32 	[%r94+512], %r11;
+	st.shared.b32 	[%r96+1024], %r12;
+	st.shared.b32 	[%r98+1536], %r13;
+	bar.sync 	0;
+	ld.shared.b32 	%r115, [%r104];
+	ld.shared.b32 	%r116, [%r107];
+	ld.shared.b32 	%r117, [%r110];
+	ld.shared.b32 	%r118, [%r113];
+	.loc	1 65 69                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69
+	// begin inline asm
+	mov.u64 %rd11, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd11, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r14, %r3;
+	mov.u32 %r15, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r14, %r15 }, [ %rd1 + 0 ], %rd11;
+	// end inline asm
+	.loc	1 66 36                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:36
+	add.s64 	%rd12, %rd86, %rd87;
+	.loc	1 66 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:43
+	// begin inline asm
+	mov.u64 %rd13, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd13, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs3, %rs2;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs3 }, [ %rd12 + 0 ], %rd13;
+	// end inline asm
+	.loc	1 71 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:71:24
+	and.b32 	%r119, %r32, 1;
+	setp.ne.b32 	%p3, %r119, 0;
+	not.pred 	%p2, %p3;
+	.loc	1 72 41                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:41
+	and.b32 	%r120, %r32, 30;
+	.loc	1 72 48                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:48
+	shl.b32 	%r121, %r28, 7;
+	shl.b32 	%r122, %r39, 10;
+	and.b32 	%r123, %r122, -32768;
+	add.s32 	%r124, %r123, %r121;
+	.loc	1 72 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35
+	cvt.s64.s32 	%rd89, %r124;
+	cvt.u64.u32 	%rd90, %r120;
+	or.b64 	%rd91, %rd89, %rd90;
+	shl.b64 	%rd92, %rd91, 1;
+	add.s64 	%rd93, %rd82, %rd92;
+	add.s64 	%rd14, %rd93, 2;
+	add.s64 	%rd16, %rd93, 66;
+	add.s64 	%rd18, %rd93, 130;
+	add.s64 	%rd20, %rd93, 194;
+	.loc	1 72 68                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:68
+	// begin inline asm
+	mov.u64 %rd15, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd15, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs4, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs4 }, [ %rd14 + 0 ], %rd15;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd17, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs5, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs5 }, [ %rd16 + 0 ], %rd17;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd19, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd19, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs6, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs6 }, [ %rd18 + 0 ], %rd19;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd21, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd21, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs7, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs7 }, [ %rd20 + 0 ], %rd21;
+	// end inline asm
+	mov.b32 	%r125, 0f43000000;
+	.loc	1 75 25                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:75:25
+	div.full.f32 	%r126, %r79, %r125;
+	.loc	1 77 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:77:24
+	add.f32 	%r127, %r126, 0f358637BD;
+	.loc	1 78 32                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:78:32
+	rsqrt.approx.ftz.f32 	%r128, %r127;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	bar.sync 	0;
+	shr.u32 	%r129, %r24, 3;
+	add.s32 	%r130, %r91, %r129;
+	st.shared.b32 	[%r130], %r128;
+	bar.sync 	0;
+	shl.b32 	%r131, %r26, 2;
+	add.s32 	%r132, %r91, %r131;
+	ld.shared.b32 	%r133, [%r132];
+	.loc	1 80 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:35
+	mul.wide.u32 	%rd94, %r120, 2;
+	add.s64 	%rd38, %rd83, %rd94;
+	add.s64 	%rd22, %rd38, 2;
+	add.s64 	%rd24, %rd38, 66;
+	add.s64 	%rd26, %rd38, 130;
+	add.s64 	%rd28, %rd38, 194;
+	.loc	1 80 85                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:85
+	// begin inline asm
+	mov.u64 %rd23, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd23, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs8, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs8 }, [ %rd22 + 0 ], %rd23;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd25, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd25, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs9, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd24 + 0 ], %rd25;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd27, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd27, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs10, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs10 }, [ %rd26 + 0 ], %rd27;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd29, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd29, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs11, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd28 + 0 ], %rd29;
+	// end inline asm
+	.loc	1 90 53                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:53
+	or.b32 	%r134, %r124, %r120;
+	.loc	1 90 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:35
+	mad.wide.s32 	%rd30, %r134, 2, %rd82;
+	add.s64 	%rd32, %rd93, 64;
+	add.s64 	%rd34, %rd93, 128;
+	add.s64 	%rd36, %rd93, 192;
+	.loc	1 90 64                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:64
+	// begin inline asm
+	mov.u64 %rd31, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd31, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs12, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd30 + 0 ], %rd31;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd33, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd33, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs13, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd32 + 0 ], %rd33;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd35, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd35, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs14, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs14 }, [ %rd34 + 0 ], %rd35;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd37, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd37, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs15, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd36 + 0 ], %rd37;
+	// end inline asm
+	.loc	1 98 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:35
+	add.s64 	%rd40, %rd38, 64;
+	add.s64 	%rd42, %rd38, 128;
+	add.s64 	%rd44, %rd38, 192;
+	.loc	1 98 81                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:81
+	// begin inline asm
+	mov.u64 %rd39, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd39, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs16, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs16 }, [ %rd38 + 0 ], %rd39;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd41, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd41, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs17, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd40 + 0 ], %rd41;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd43, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd43, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs18, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs18 }, [ %rd42 + 0 ], %rd43;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd45, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd45, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs19, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd44 + 0 ], %rd45;
+	// end inline asm
+	.loc	1 111 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:111:24
+	mul.f32 	%r135, %r128, %r48;
+	mul.f32 	%r136, %r128, %r49;
+	mul.f32 	%r137, %r128, %r50;
+	mul.f32 	%r138, %r128, %r51;
+	.loc	1 113 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:113:24
+	bar.sync 	0;
+	mad.lo.s32 	%r139, %r26, 124, %r132;
+	add.s32 	%r140, %r139, %r102;
+	st.shared.b32 	[%r140], %r80;
+	bar.sync 	0;
+	add.s32 	%r141, %r91, %r30;
+	ld.shared.b32 	%r142, [%r141];
+	xor.b32 	%r143, %r30, 32;
+	add.s32 	%r144, %r91, %r143;
+	ld.shared.b32 	%r145, [%r144+128];
+	xor.b32 	%r146, %r30, 64;
+	add.s32 	%r147, %r91, %r146;
+	ld.shared.b32 	%r148, [%r147+256];
+	xor.b32 	%r149, %r30, 96;
+	add.s32 	%r150, %r91, %r149;
+	ld.shared.b32 	%r151, [%r150+384];
+	mul.f32 	%r152, %r135, %r142;
+	mul.f32 	%r153, %r136, %r145;
+	mul.f32 	%r154, %r137, %r148;
+	mul.f32 	%r155, %r138, %r151;
+	.loc	1 116 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:116:24
+	mul.f32 	%r156, %r152, %r6;
+	mul.f32 	%r157, %r153, %r7;
+	mul.f32 	%r158, %r154, %r8;
+	mul.f32 	%r159, %r155, %r9;
+	bar.sync 	0;
+	st.shared.b32 	[%r92], %r156;
+	st.shared.b32 	[%r94+512], %r157;
+	st.shared.b32 	[%r96+1024], %r158;
+	st.shared.b32 	[%r98+1536], %r159;
+	bar.sync 	0;
+	ld.shared.b32 	%r160, [%r104];
+	ld.shared.b32 	%r161, [%r107];
+	ld.shared.b32 	%r162, [%r110];
+	ld.shared.b32 	%r163, [%r113];
+	.loc	1 121 60                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:60
+	add.s32 	%r164, %r134, 4097;
+	add.s32 	%r165, %r134, 4129;
+	add.s32 	%r166, %r134, 4161;
+	add.s32 	%r167, %r134, 4193;
+	.loc	1 121 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:35
+	mad.wide.s32 	%rd46, %r164, 2, %rd82;
+	mad.wide.s32 	%rd48, %r165, 2, %rd82;
+	mad.wide.s32 	%rd50, %r166, 2, %rd82;
+	mad.wide.s32 	%rd52, %r167, 2, %rd82;
+	.loc	1 121 71                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:71
+	// begin inline asm
+	mov.u64 %rd47, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd47, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs20, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs20 }, [ %rd46 + 0 ], %rd47;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd49, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd49, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs21, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs21 }, [ %rd48 + 0 ], %rd49;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd51, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd51, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs22, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs22 }, [ %rd50 + 0 ], %rd51;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd53, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd53, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs23, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs23 }, [ %rd52 + 0 ], %rd53;
+	// end inline asm
+	.loc	1 123 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:123:24
+	div.full.f32 	%r168, %r66, %r125;
+	.loc	1 124 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:124:24
+	add.f32 	%r169, %r168, 0f358637BD;
+	.loc	1 125 32                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:125:32
+	rsqrt.approx.ftz.f32 	%r170, %r169;
+	.loc	1 121 132                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:132
+	cvt.f32.bf16 	%r171, %rs23;
+	cvt.f32.bf16 	%r172, %rs22;
+	cvt.f32.bf16 	%r173, %rs21;
+	cvt.f32.bf16 	%r174, %rs20;
+	.loc	1 72 129                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129
+	cvt.f32.bf16 	%r175, %rs7;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	mul.f32 	%r176, %r133, %r175;
+	.loc	1 80 146                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146
+	cvt.f32.bf16 	%r177, %rs11;
+	.loc	1 84 17                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17
+	neg.f32 	%r178, %r176;
+	fma.rn.f32 	%r179, %r178, %r177, 0f00000000;
+	.loc	1 90 125                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125
+	cvt.f32.bf16 	%r180, %rs15;
+	.loc	1 97 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24
+	mul.f32 	%r181, %r133, %r180;
+	.loc	1 98 142                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142
+	cvt.f32.bf16 	%r182, %rs19;
+	.loc	1 100 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24
+	mul.f32 	%r183, %r181, %r182;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r184, %r183, %r179, %p3;
+	.loc	1 119 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24
+	fma.rn.f32 	%r185, %r118, %r184, %r163;
+	.loc	1 72 129                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129
+	cvt.f32.bf16 	%r186, %rs6;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	mul.f32 	%r187, %r133, %r186;
+	.loc	1 80 146                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146
+	cvt.f32.bf16 	%r188, %rs10;
+	.loc	1 84 17                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17
+	neg.f32 	%r189, %r187;
+	fma.rn.f32 	%r190, %r189, %r188, 0f00000000;
+	.loc	1 90 125                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125
+	cvt.f32.bf16 	%r191, %rs14;
+	.loc	1 97 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24
+	mul.f32 	%r192, %r133, %r191;
+	.loc	1 98 142                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142
+	cvt.f32.bf16 	%r193, %rs18;
+	.loc	1 100 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24
+	mul.f32 	%r194, %r192, %r193;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r195, %r194, %r190, %p3;
+	.loc	1 119 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24
+	fma.rn.f32 	%r196, %r117, %r195, %r162;
+	.loc	1 72 129                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129
+	cvt.f32.bf16 	%r197, %rs5;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	mul.f32 	%r198, %r133, %r197;
+	.loc	1 80 146                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146
+	cvt.f32.bf16 	%r199, %rs9;
+	.loc	1 84 17                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17
+	neg.f32 	%r200, %r198;
+	fma.rn.f32 	%r201, %r200, %r199, 0f00000000;
+	.loc	1 90 125                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125
+	cvt.f32.bf16 	%r202, %rs13;
+	.loc	1 97 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24
+	mul.f32 	%r203, %r133, %r202;
+	.loc	1 98 142                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142
+	cvt.f32.bf16 	%r204, %rs17;
+	.loc	1 100 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24
+	mul.f32 	%r205, %r203, %r204;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r206, %r205, %r201, %p3;
+	.loc	1 119 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24
+	fma.rn.f32 	%r207, %r116, %r206, %r161;
+	.loc	1 72 129                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129
+	cvt.f32.bf16 	%r208, %rs4;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	mul.f32 	%r209, %r133, %r208;
+	.loc	1 80 146                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146
+	cvt.f32.bf16 	%r210, %rs8;
+	.loc	1 84 17                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17
+	neg.f32 	%r211, %r209;
+	fma.rn.f32 	%r212, %r211, %r210, 0f00000000;
+	.loc	1 90 125                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125
+	cvt.f32.bf16 	%r213, %rs12;
+	.loc	1 97 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24
+	mul.f32 	%r214, %r133, %r213;
+	.loc	1 98 142                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142
+	cvt.f32.bf16 	%r215, %rs16;
+	.loc	1 100 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24
+	mul.f32 	%r216, %r214, %r215;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r217, %r216, %r212, %p3;
+	.loc	1 119 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24
+	fma.rn.f32 	%r218, %r115, %r217, %r160;
+	.loc	1 66 96                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:96
+	cvt.f32.bf16 	%r219, %rs3;
+	.loc	1 65 69                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69
+	mov.b32 	{%rs44, %rs45}, %r15;
+	.loc	1 65 123                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:123
+	cvt.f32.bf16 	%r220, %rs45;
+	cvt.f32.bf16 	%r221, %rs44;
+	.loc	1 65 69                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69
+	mov.b32 	{%rs46, %rs47}, %r14;
+	.loc	1 65 123                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:123
+	cvt.f32.bf16 	%r222, %rs47;
+	cvt.f32.bf16 	%r223, %rs46;
+	.loc	1 126 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24
+	bar.sync 	0;
+	st.shared.b32 	[%r130], %r170;
+	bar.sync 	0;
+	ld.shared.b32 	%r224, [%r132];
+	mul.f32 	%r225, %r224, %r174;
+	mul.f32 	%r226, %r224, %r173;
+	mul.f32 	%r227, %r224, %r172;
+	mul.f32 	%r228, %r224, %r171;
+	.loc	1 127 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:35
+	add.s64 	%rd70, %rd86, %rd94;
+	add.s64 	%rd54, %rd70, 2;
+	add.s64 	%rd56, %rd70, 66;
+	add.s64 	%rd58, %rd70, 130;
+	add.s64 	%rd60, %rd70, 194;
+	.loc	1 127 85                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:85
+	// begin inline asm
+	mov.u64 %rd55, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd55, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs24, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs24 }, [ %rd54 + 0 ], %rd55;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd57, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd57, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs25, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs25 }, [ %rd56 + 0 ], %rd57;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd59, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd59, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs26, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs26 }, [ %rd58 + 0 ], %rd59;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd61, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd61, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs27, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs27 }, [ %rd60 + 0 ], %rd61;
+	// end inline asm
+	.loc	1 127 146                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:146
+	cvt.f32.bf16 	%r229, %rs24;
+	cvt.f32.bf16 	%r230, %rs25;
+	cvt.f32.bf16 	%r231, %rs26;
+	cvt.f32.bf16 	%r232, %rs27;
+	.loc	1 131 17                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:131:17
+	neg.f32 	%r233, %r225;
+	fma.rn.f32 	%r234, %r233, %r229, 0f00000000;
+	neg.f32 	%r235, %r226;
+	fma.rn.f32 	%r236, %r235, %r230, 0f00000000;
+	neg.f32 	%r237, %r227;
+	fma.rn.f32 	%r238, %r237, %r231, 0f00000000;
+	neg.f32 	%r239, %r228;
+	fma.rn.f32 	%r240, %r239, %r232, 0f00000000;
+	.loc	1 134 60                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:60
+	add.s32 	%r241, %r134, 4096;
+	add.s32 	%r242, %r134, 4128;
+	add.s32 	%r243, %r134, 4160;
+	add.s32 	%r244, %r134, 4192;
+	.loc	1 134 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:35
+	mad.wide.s32 	%rd62, %r241, 2, %rd82;
+	mad.wide.s32 	%rd64, %r242, 2, %rd82;
+	mad.wide.s32 	%rd66, %r243, 2, %rd82;
+	mad.wide.s32 	%rd68, %r244, 2, %rd82;
+	.loc	1 134 71                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:71
+	// begin inline asm
+	mov.u64 %rd63, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd63, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs28, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs28 }, [ %rd62 + 0 ], %rd63;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd65, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd65, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs29, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs29 }, [ %rd64 + 0 ], %rd65;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd67, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd67, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs30, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs30 }, [ %rd66 + 0 ], %rd67;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd69, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd69, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs31, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs31 }, [ %rd68 + 0 ], %rd69;
+	// end inline asm
+	.loc	1 134 132                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:132
+	cvt.f32.bf16 	%r245, %rs28;
+	cvt.f32.bf16 	%r246, %rs29;
+	cvt.f32.bf16 	%r247, %rs30;
+	cvt.f32.bf16 	%r248, %rs31;
+	.loc	1 139 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:139:24
+	mul.f32 	%r249, %r224, %r245;
+	mul.f32 	%r250, %r224, %r246;
+	mul.f32 	%r251, %r224, %r247;
+	mul.f32 	%r252, %r224, %r248;
+	.loc	1 140 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:35
+	add.s64 	%rd72, %rd70, 64;
+	add.s64 	%rd74, %rd70, 128;
+	add.s64 	%rd76, %rd70, 192;
+	.loc	1 140 81                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:81
+	// begin inline asm
+	mov.u64 %rd71, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd71, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs32, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs32 }, [ %rd70 + 0 ], %rd71;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd73, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd73, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs33, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs33 }, [ %rd72 + 0 ], %rd73;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd75, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd75, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs34, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs34 }, [ %rd74 + 0 ], %rd75;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd77, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd77, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs35, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs35 }, [ %rd76 + 0 ], %rd77;
+	// end inline asm
+	.loc	1 140 142                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:142
+	cvt.f32.bf16 	%r253, %rs32;
+	cvt.f32.bf16 	%r254, %rs33;
+	cvt.f32.bf16 	%r255, %rs34;
+	cvt.f32.bf16 	%r256, %rs35;
+	.loc	1 142 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:142:24
+	mul.f32 	%r257, %r249, %r253;
+	mul.f32 	%r258, %r250, %r254;
+	mul.f32 	%r259, %r251, %r255;
+	mul.f32 	%r260, %r252, %r256;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r261, %r257, %r234, %p3;
+	selp.f32 	%r262, %r258, %r236, %p3;
+	selp.f32 	%r263, %r259, %r238, %p3;
+	selp.f32 	%r264, %r260, %r240, %p3;
+	.loc	1 151 25                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:151:25
+	mul.f32 	%r265, %r170, %r223;
+	mul.f32 	%r266, %r170, %r222;
+	mul.f32 	%r267, %r170, %r221;
+	mul.f32 	%r268, %r170, %r220;
+	bar.sync 	0;
+	st.shared.b32 	[%r92], %r265;
+	st.shared.b32 	[%r94+512], %r266;
+	st.shared.b32 	[%r96+1024], %r267;
+	st.shared.b32 	[%r98+1536], %r268;
+	bar.sync 	0;
+	ld.shared.b32 	%r269, [%r104];
+	ld.shared.b32 	%r270, [%r107];
+	ld.shared.b32 	%r271, [%r110];
+	ld.shared.b32 	%r272, [%r113];
+	.loc	1 156 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26
+	bar.sync 	0;
+	shl.b32 	%r273, %r23, 2;
+	add.s32 	%r274, %r91, %r273;
+	st.shared.b32 	[%r274], %r219;
+	bar.sync 	0;
+	add.s32 	%r275, %r91, %r31;
+	ld.shared.b32 	%r276, [%r275];
+	ld.shared.b32 	%r277, [%r275+128];
+	ld.shared.b32 	%r278, [%r275+256];
+	ld.shared.b32 	%r279, [%r275+384];
+	.loc	1 153 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:153:26
+	mul.f32 	%r280, %r269, %r276;
+	mul.f32 	%r281, %r270, %r277;
+	mul.f32 	%r282, %r271, %r278;
+	mul.f32 	%r283, %r272, %r279;
+	.loc	1 156 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26
+	mul.f32 	%r284, %r105, %r280;
+	mul.f32 	%r285, %r108, %r281;
+	mul.f32 	%r286, %r111, %r282;
+	mul.f32 	%r287, %r114, %r283;
+	.loc	1 159 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:159:26
+	fma.rn.f32 	%r288, %r115, %r261, %r284;
+	fma.rn.f32 	%r289, %r116, %r262, %r285;
+	fma.rn.f32 	%r290, %r117, %r263, %r286;
+	fma.rn.f32 	%r291, %r118, %r264, %r287;
+	.loc	1 161 43                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:43
+	shl.b32 	%r292, %r27, 7;
+	.loc	1 161 39                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:39
+	or.b32 	%r293, %r292, %r30;
+	.loc	1 161 32                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:32
+	mul.wide.s32 	%rd95, %r293, 2;
+	add.s64 	%rd78, %rd80, %rd95;
+	.loc	1 161 55                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:55
+	cvt.rn.bf16.f32 	%rs48, %r218;
+	cvt.rn.bf16.f32 	%rs49, %r207;
+	cvt.rn.bf16.f32 	%rs50, %r196;
+	cvt.rn.bf16.f32 	%rs51, %r185;
+	bar.sync 	0;
+	shl.b32 	%r294, %r83, 7;
+	shr.u32 	%r295, %r22, 4;
+	and.b32 	%r296, %r295, 2;
+	and.b32 	%r297, %r32, 16;
+	or.b32 	%r298, %r294, %r296;
+	or.b32 	%r299, %r84, %r87;
+	xor.b32 	%r300, %r299, %r297;
+	or.b32 	%r301, %r300, %r298;
+	add.s32 	%r302, %r91, %r301;
+	st.shared.b16 	[%r302], %rs48;
+	xor.b32 	%r303, %r301, 32;
+	add.s32 	%r304, %r91, %r303;
+	st.shared.b16 	[%r304], %rs49;
+	xor.b32 	%r305, %r301, 64;
+	add.s32 	%r306, %r91, %r305;
+	st.shared.b16 	[%r306], %rs50;
+	xor.b32 	%r307, %r301, 96;
+	add.s32 	%r308, %r91, %r307;
+	st.shared.b16 	[%r308], %rs51;
+	bar.sync 	0;
+	shl.b32 	%r309, %r31, 2;
+	shr.u32 	%r310, %r24, 1;
+	shl.b32 	%r311, %r22, 3;
+	and.b32 	%r312, %r311, 8;
+	and.b32 	%r313, %r22, 2;
+	or.b32 	%r314, %r312, %r313;
+	xor.b32 	%r315, %r309, %r310;
+	or.b32 	%r316, %r314, %r315;
+	add.s32 	%r317, %r91, %r316;
+	ld.shared.b16 	%rs52, [%r317];
+	ld.shared.b16 	%rs53, [%r317+4];
+	xor.b32 	%r318, %r316, 64;
+	add.s32 	%r319, %r91, %r318;
+	ld.shared.b16 	%rs54, [%r319+512];
+	ld.shared.b16 	%rs55, [%r319+516];
+	mov.b32 	%r16, {%rs52, %rs54};
+	mov.b32 	%r17, {%rs53, %rs55};
+	// begin inline asm
+	@%p1 st.global.v2.b32 [ %rd78 + 0 ], { %r16, %r17 };
+	// end inline asm
+	.loc	1 162 32                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:32
+	add.s64 	%rd79, %rd81, %rd95;
+	.loc	1 162 56                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:56
+	cvt.rn.bf16.f32 	%rs56, %r288;
+	cvt.rn.bf16.f32 	%rs57, %r289;
+	cvt.rn.bf16.f32 	%rs58, %r290;
+	cvt.rn.bf16.f32 	%rs59, %r291;
+	bar.sync 	0;
+	st.shared.b16 	[%r302], %rs56;
+	st.shared.b16 	[%r304], %rs57;
+	st.shared.b16 	[%r306], %rs58;
+	st.shared.b16 	[%r308], %rs59;
+	bar.sync 	0;
+	ld.shared.b16 	%rs60, [%r317];
+	ld.shared.b16 	%rs61, [%r317+4];
+	ld.shared.b16 	%rs62, [%r319+512];
+	ld.shared.b16 	%rs63, [%r319+516];
+	mov.b32 	%r18, {%rs60, %rs62};
+	mov.b32 	%r19, {%rs61, %rs63};
+	// begin inline asm
+	@%p1 st.global.v2.b32 [ %rd79 + 0 ], { %r18, %r19 };
+	// end inline asm
+	.loc	1 53 4                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:4
+	ret;
+$L__tmp24:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 5                                   // DW_FORM_data2
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 456                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x1c1 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 98
+.b8 118
+.b8 113
+.b8 104
+.b8 106
+.b8 116
+.b8 121
+.b8 103
+.b8 55
+.b8 102
+.b8 118
+.b8 120
+.b8 122
+.b8 119
+.b8 116
+.b8 98
+.b8 116
+.b8 116
+.b8 52
+.b8 118
+.b8 114
+.b8 100
+.b8 107
+.b8 98
+.b8 110
+.b8 98
+.b8 54
+.b8 110
+.b8 51
+.b8 50
+.b8 102
+.b8 110
+.b8 114
+.b8 105
+.b8 106
+.b8 106
+.b8 112
+.b8 108
+.b8 51
+.b8 118
+.b8 118
+.b8 52
+.b8 99
+.b8 102
+.b8 113
+.b8 100
+.b8 52
+.b8 109
+.b8 122
+.b8 110
+.b8 114
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 98
+.b8 118
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x6d DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 114
+.b8 109
+.b8 115
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 95
+.b8 116
+.b8 111
+.b8 95
+.b8 99
+.b8 111
+.b8 112
+.b8 121
+.b8 95
+.b8 97
+.b8 100
+.b8 100
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 110
+.b8 101
+.b8 103
+.b8 95
+.b8 115
+.b8 112
+.b8 108
+.b8 105
+.b8 116
+.b8 95
+.b8 115
+.b8 112
+.b8 108
+.b8 105
+.b8 116
+.b8 95
+.b8 119
+.b8 105
+.b8 116
+.b8 104
+.b8 95
+.b8 115
+.b8 105
+.b8 122
+.b8 101
+.b8 115
+.b8 95
+.b8 115
+.b8 116
+.b8 97
+.b8 99
+.b8 107
+.b8 95
+.b8 117
+.b8 110
+.b8 98
+.b8 105
+.b8 110
+.b8 100
+.b8 95
+.b8 117
+.b8 110
+.b8 115
+.b8 113
+.b8 117
+.b8 101
+.b8 101
+.b8 122
+.b8 101
+.b8 95
+.b8 118
+.b8 105
+.b8 101
+.b8 119
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x151:0x7a DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x166:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp12                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 51                                  // DW_AT_call_line
+.b8 25                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x17e:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp12                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 4                                   // Abbrev [4] 0x198:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp12                          // DW_AT_low_pc
+.b64 $L__tmp23                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 52                                  // DW_AT_call_line
+.b8 27                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x1b0:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp12                          // DW_AT_low_pc
+.b64 $L__tmp23                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..10fae842eec251858f3c2a3d2e8882ca13ea4e71
--- /dev/null
+++ b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source
@@ -0,0 +1,972 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc213 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0)
+#loc215 = loc(unknown)
+#loc218 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0)
+#loc222 = loc("in_out_ptr0"(#loc))
+#loc223 = loc("in_out_ptr1"(#loc))
+#loc224 = loc("in_ptr0"(#loc))
+#loc225 = loc("in_ptr1"(#loc))
+#loc226 = loc("in_ptr2"(#loc))
+#loc227 = loc("in_ptr3"(#loc))
+#loc228 = loc("in_ptr4"(#loc))
+#loc229 = loc("xnumel"(#loc))
+#loc230 = loc("r0_numel"(#loc))
+#loc432 = loc("input"(#loc213))
+#loc433 = loc("a"(#loc218))
+#loc434 = loc("b"(#loc218))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 73728 : i32 loc(#loc231)
+    %r0_numel_1 = arith.constant 128 : i32 loc(#loc232)
+    %xoffset = tt.get_program_id x : i32 loc(#loc233)
+    %xoffset_2 = arith.constant 4 : i32 loc(#loc234)
+    %xoffset_3 = arith.constant 4 : i32 loc(#loc234)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc234)
+    %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc235)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc236)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<4x1xi32> loc(#loc237)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<4x1xi32> loc(#loc237)
+    %xmask = arith.constant true loc(#loc238)
+    %xmask_8 = arith.constant dense<true> : tensor<4x128xi1> loc(#loc238)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc239)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc240)
+    %x0 = arith.constant 32 : i32 loc(#loc241)
+    %x0_10 = arith.constant 32 : i32 loc(#loc241)
+    %x0_11 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc241)
+    %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<4x1xi32> loc(#loc241)
+    %x1 = arith.constant 32 : i32 loc(#loc242)
+    %x1_13 = arith.constant 32 : i32 loc(#loc242)
+    %x1_14 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc242)
+    %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<4x1xi32> loc(#loc242)
+    %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc243)
+    %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc243)
+    %_tmp10 = arith.constant 0.000000e+00 : f32 loc(#loc244)
+    %_tmp10_17 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc244)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc15)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc15)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15)
+    %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc15)
+    %3 = ub.poison : i32 loc(#loc15)
+    %_tmp10_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_16, %_tmp10_24 = %_tmp10_17) -> (tensor<4x128xf32>, tensor<4x128xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc246)
+      %r0_index_25 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc246)
+      %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc247)
+      %r0_mask_26 = arith.cmpi slt, %r0_index_25, %r0_mask : tensor<1x128xi32> loc(#loc247)
+      %tmp0 = arith.constant 4096 : i32 loc(#loc248)
+      %tmp0_27 = arith.constant 4096 : i32 loc(#loc248)
+      %tmp0_28 = arith.constant dense<4096> : tensor<1x128xi32> loc(#loc248)
+      %tmp0_29 = arith.addi %tmp0_28, %r0_index_25 : tensor<1x128xi32> loc(#loc248)
+      %tmp0_30 = arith.constant 128 : i32 loc(#loc249)
+      %tmp0_31 = arith.constant 128 : i32 loc(#loc249)
+      %tmp0_32 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc249)
+      %tmp0_33 = arith.muli %tmp0_32, %x0_12 : tensor<4x1xi32> loc(#loc249)
+      %tmp0_34 = tt.broadcast %tmp0_29 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc250)
+      %tmp0_35 = tt.broadcast %tmp0_33 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc250)
+      %tmp0_36 = arith.addi %tmp0_34, %tmp0_35 : tensor<4x128xi32> loc(#loc250)
+      %tmp0_37 = arith.constant 36864 : i32 loc(#loc251)
+      %tmp0_38 = arith.constant 36864 : i32 loc(#loc251)
+      %tmp0_39 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc251)
+      %tmp0_40 = arith.muli %tmp0_39, %x1_15 : tensor<4x1xi32> loc(#loc251)
+      %tmp0_41 = tt.broadcast %tmp0_40 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc252)
+      %tmp0_42 = arith.addi %tmp0_36, %tmp0_41 : tensor<4x128xi32> loc(#loc252)
+      %tmp0_43 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc253)
+      %tmp0_44 = tt.addptr %tmp0_43, %tmp0_42 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc253)
+      %tmp0_45 = arith.constant 0.000000e+00 : f32 loc(#loc254)
+      %tmp0_46 = tt.broadcast %r0_mask_26 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc254)
+      %tmp0_47 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc254)
+      %tmp0_48 = arith.truncf %tmp0_47 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc254)
+      %tmp0_49 = tt.load %tmp0_44, %tmp0_46, %tmp0_48 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>> loc(#loc254)
+      %tmp0_50 = arith.extf %tmp0_49 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc255)
+      %tmp6 = arith.constant 128 : i32 loc(#loc256)
+      %tmp6_51 = arith.constant 128 : i32 loc(#loc256)
+      %tmp6_52 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc256)
+      %tmp6_53 = arith.muli %tmp6_52, %x0_12 : tensor<4x1xi32> loc(#loc256)
+      %tmp6_54 = tt.broadcast %r0_index_25 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc257)
+      %tmp6_55 = tt.broadcast %tmp6_53 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc257)
+      %tmp6_56 = arith.addi %tmp6_54, %tmp6_55 : tensor<4x128xi32> loc(#loc257)
+      %tmp6_57 = arith.constant 36864 : i32 loc(#loc258)
+      %tmp6_58 = arith.constant 36864 : i32 loc(#loc258)
+      %tmp6_59 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc258)
+      %tmp6_60 = arith.muli %tmp6_59, %x1_15 : tensor<4x1xi32> loc(#loc258)
+      %tmp6_61 = tt.broadcast %tmp6_60 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc259)
+      %tmp6_62 = arith.addi %tmp6_56, %tmp6_61 : tensor<4x128xi32> loc(#loc259)
+      %tmp6_63 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc260)
+      %tmp6_64 = tt.addptr %tmp6_63, %tmp6_62 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc260)
+      %tmp6_65 = arith.constant 0.000000e+00 : f32 loc(#loc261)
+      %tmp6_66 = tt.broadcast %r0_mask_26 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc261)
+      %tmp6_67 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc261)
+      %tmp6_68 = arith.truncf %tmp6_67 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc261)
+      %tmp6_69 = tt.load %tmp6_64, %tmp6_66, %tmp6_68 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>> loc(#loc261)
+      %tmp6_70 = arith.extf %tmp6_69 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc262)
+      %tmp2 = arith.mulf %tmp0_50, %tmp0_50 : tensor<4x128xf32> loc(#loc263)
+      %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<4x128xf32> loc(#loc264)
+      %_tmp4_71 = tt.broadcast %r0_mask_26 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc265)
+      %_tmp4_72 = arith.select %_tmp4_71, %tmp5, %_tmp4_23 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc265)
+      %tmp8 = arith.mulf %tmp6_70, %tmp6_70 : tensor<4x128xf32> loc(#loc266)
+      %tmp11 = arith.addf %_tmp10_24, %tmp8 : tensor<4x128xf32> loc(#loc267)
+      %_tmp10_73 = tt.broadcast %r0_mask_26 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc268)
+      %_tmp10_74 = arith.select %_tmp10_73, %tmp11, %_tmp10_24 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc268)
+      scf.yield %_tmp4_72, %_tmp10_74 : tensor<4x128xf32>, tensor<4x128xf32> loc(#loc39)
+    } loc(#loc435)
+    %tmp4 = tt.call @"triton.language.standard.sum__fp32S4_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#0) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc269)
+    %tmp4_19 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc270)
+    %tmp10 = tt.call @"triton.language.standard.sum__fp32S4_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#1) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc271)
+    %tmp10_20 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc272)
+    %c0_i32_21 = arith.constant 0 : i32 loc(#loc44)
+    %c128_i32_22 = arith.constant 128 : i32 loc(#loc44)
+    %4 = arith.bitcast %c0_i32_21 : i32 to i32 loc(#loc44)
+    %5 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc44)
+    %6 = arith.bitcast %c128_i32_22 : i32 to i32 loc(#loc44)
+    %7 = ub.poison : i32 loc(#loc44)
+    scf.for %r0_offset = %4 to %5 step %6  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc273)
+      %r0_index_23 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc273)
+      %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc274)
+      %r0_mask_24 = arith.cmpi slt, %r0_index_23, %r0_mask : tensor<1x128xi32> loc(#loc274)
+      %r0_3 = arith.constant 2 : i32 loc(#loc275)
+      %r0_3_25 = arith.constant 2 : i32 loc(#loc275)
+      %r0_3_26 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc275)
+      %r0_3_27 = arith.remsi %r0_index_23, %r0_3_26 : tensor<1x128xi32> loc(#loc275)
+      %r0_4 = arith.constant 2 : i32 loc(#loc276)
+      %r0_4_28 = arith.constant 2 : i32 loc(#loc276)
+      %r0_4_29 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc276)
+      %r0_4_30 = arith.divsi %r0_index_23, %r0_4_29 : tensor<1x128xi32> loc(#loc276)
+      %tmp50 = arith.constant 128 : i32 loc(#loc277)
+      %tmp50_31 = arith.constant 128 : i32 loc(#loc277)
+      %tmp50_32 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc277)
+      %tmp50_33 = arith.muli %tmp50_32, %x0_12 : tensor<4x1xi32> loc(#loc277)
+      %tmp50_34 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc278)
+      %tmp50_35 = tt.broadcast %tmp50_33 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc278)
+      %tmp50_36 = arith.addi %tmp50_34, %tmp50_35 : tensor<4x128xi32> loc(#loc278)
+      %tmp50_37 = arith.constant 36864 : i32 loc(#loc279)
+      %tmp50_38 = arith.constant 36864 : i32 loc(#loc279)
+      %tmp50_39 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc279)
+      %tmp50_40 = arith.muli %tmp50_39, %x1_15 : tensor<4x1xi32> loc(#loc279)
+      %tmp50_41 = tt.broadcast %tmp50_40 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc280)
+      %tmp50_42 = arith.addi %tmp50_36, %tmp50_41 : tensor<4x128xi32> loc(#loc280)
+      %tmp50_43 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc281)
+      %tmp50_44 = tt.addptr %tmp50_43, %tmp50_42 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc281)
+      %tmp50_45 = arith.constant 0.000000e+00 : f32 loc(#loc282)
+      %tmp50_46 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc282)
+      %tmp50_47 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc282)
+      %tmp50_48 = arith.truncf %tmp50_47 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc282)
+      %tmp50_49 = tt.load %tmp50_44, %tmp50_46, %tmp50_48 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>> loc(#loc282)
+      %tmp50_50 = arith.extf %tmp50_49 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc283)
+      %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>> loc(#loc284)
+      %tmp58_51 = tt.addptr %tmp58, %r0_index_23 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc284)
+      %tmp58_52 = arith.constant 0.000000e+00 : f32 loc(#loc285)
+      %tmp58_53 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> loc(#loc285)
+      %tmp58_54 = arith.truncf %tmp58_53 : tensor<1x128xf32> to tensor<1x128xbf16> loc(#loc285)
+      %tmp58_55 = tt.load %tmp58_51, %r0_mask_24, %tmp58_54 evictionPolicy = evict_last : tensor<1x128x!tt.ptr<bf16>> loc(#loc285)
+      %tmp58_56 = arith.extf %tmp58_55 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc286)
+      %tmp63 = arith.constant 128 : i32 loc(#loc287)
+      %tmp63_57 = arith.constant 128 : i32 loc(#loc287)
+      %tmp63_58 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc287)
+      %tmp63_59 = arith.muli %tmp63_58, %x1_15 : tensor<4x1xi32> loc(#loc287)
+      %tmp63_60 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc288)
+      %tmp63_61 = tt.broadcast %tmp63_59 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc288)
+      %tmp63_62 = arith.addi %tmp63_60, %tmp63_61 : tensor<4x128xi32> loc(#loc288)
+      %tmp63_63 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<4x128x!tt.ptr<f32>> loc(#loc289)
+      %tmp63_64 = tt.addptr %tmp63_63, %tmp63_62 : tensor<4x128x!tt.ptr<f32>>, tensor<4x128xi32> loc(#loc289)
+      %tmp63_65 = arith.constant 0.000000e+00 : f32 loc(#loc290)
+      %tmp63_66 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc290)
+      %tmp63_67 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc290)
+      %tmp63_68 = tt.load %tmp63_64, %tmp63_66, %tmp63_67 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<f32>> loc(#loc290)
+      %tmp66 = arith.constant 128 : i32 loc(#loc291)
+      %tmp66_69 = arith.constant 128 : i32 loc(#loc291)
+      %tmp66_70 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc291)
+      %tmp66_71 = arith.muli %tmp66_70, %x1_15 : tensor<4x1xi32> loc(#loc291)
+      %tmp66_72 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc292)
+      %tmp66_73 = tt.broadcast %tmp66_71 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc292)
+      %tmp66_74 = arith.addi %tmp66_72, %tmp66_73 : tensor<4x128xi32> loc(#loc292)
+      %tmp66_75 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<4x128x!tt.ptr<f32>> loc(#loc293)
+      %tmp66_76 = tt.addptr %tmp66_75, %tmp66_74 : tensor<4x128x!tt.ptr<f32>>, tensor<4x128xi32> loc(#loc293)
+      %tmp66_77 = arith.constant 0.000000e+00 : f32 loc(#loc294)
+      %tmp66_78 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc294)
+      %tmp66_79 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc294)
+      %tmp66_80 = tt.load %tmp66_76, %tmp66_78, %tmp66_79 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<f32>> loc(#loc294)
+      %tmp96 = arith.constant 4096 : i32 loc(#loc295)
+      %tmp96_81 = arith.constant 4096 : i32 loc(#loc295)
+      %tmp96_82 = arith.constant dense<4096> : tensor<1x128xi32> loc(#loc295)
+      %tmp96_83 = arith.addi %tmp96_82, %r0_index_23 : tensor<1x128xi32> loc(#loc295)
+      %tmp96_84 = arith.constant 128 : i32 loc(#loc296)
+      %tmp96_85 = arith.constant 128 : i32 loc(#loc296)
+      %tmp96_86 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc296)
+      %tmp96_87 = arith.muli %tmp96_86, %x0_12 : tensor<4x1xi32> loc(#loc296)
+      %tmp96_88 = tt.broadcast %tmp96_83 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc297)
+      %tmp96_89 = tt.broadcast %tmp96_87 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc297)
+      %tmp96_90 = arith.addi %tmp96_88, %tmp96_89 : tensor<4x128xi32> loc(#loc297)
+      %tmp96_91 = arith.constant 36864 : i32 loc(#loc298)
+      %tmp96_92 = arith.constant 36864 : i32 loc(#loc298)
+      %tmp96_93 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc298)
+      %tmp96_94 = arith.muli %tmp96_93, %x1_15 : tensor<4x1xi32> loc(#loc298)
+      %tmp96_95 = tt.broadcast %tmp96_94 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc299)
+      %tmp96_96 = arith.addi %tmp96_90, %tmp96_95 : tensor<4x128xi32> loc(#loc299)
+      %tmp96_97 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc300)
+      %tmp96_98 = tt.addptr %tmp96_97, %tmp96_96 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc300)
+      %tmp96_99 = arith.constant 0.000000e+00 : f32 loc(#loc301)
+      %tmp96_100 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc301)
+      %tmp96_101 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc301)
+      %tmp96_102 = arith.truncf %tmp96_101 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc301)
+      %tmp96_103 = tt.load %tmp96_98, %tmp96_100, %tmp96_102 evictionPolicy = evict_first : tensor<4x128x!tt.ptr<bf16>> loc(#loc301)
+      %tmp96_104 = arith.extf %tmp96_103 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc302)
+      %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>> loc(#loc303)
+      %tmp102_105 = tt.addptr %tmp102, %r0_index_23 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc303)
+      %tmp102_106 = arith.constant 0.000000e+00 : f32 loc(#loc304)
+      %tmp102_107 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> loc(#loc304)
+      %tmp102_108 = arith.truncf %tmp102_107 : tensor<1x128xf32> to tensor<1x128xbf16> loc(#loc304)
+      %tmp102_109 = tt.load %tmp102_105, %r0_mask_24, %tmp102_108 evictionPolicy = evict_last : tensor<1x128x!tt.ptr<bf16>> loc(#loc304)
+      %tmp102_110 = arith.extf %tmp102_109 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc305)
+      %tmp13 = arith.constant 0 : i64 loc(#loc306)
+      %tmp13_111 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc306)
+      %tmp14 = arith.extsi %r0_3_27 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc307)
+      %tmp14_112 = arith.constant dense<0> : tensor<1x128xi64> loc(#loc307)
+      %tmp14_113 = arith.cmpi sge, %tmp14, %tmp14_112 : tensor<1x128xi64> loc(#loc307)
+      %tmp15 = arith.constant 1 : i64 loc(#loc308)
+      %tmp15_114 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc308)
+      %tmp16 = arith.extsi %r0_3_27 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc309)
+      %tmp16_115 = arith.constant dense<1> : tensor<1x128xi64> loc(#loc309)
+      %tmp16_116 = arith.cmpi slt, %tmp16, %tmp16_115 : tensor<1x128xi64> loc(#loc309)
+      %tmp17 = arith.constant 2 : i32 loc(#loc310)
+      %tmp17_117 = arith.constant 2 : i32 loc(#loc310)
+      %tmp17_118 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc310)
+      %tmp17_119 = arith.muli %tmp17_118, %r0_4_30 : tensor<1x128xi32> loc(#loc310)
+      %tmp17_120 = arith.constant 1 : i32 loc(#loc311)
+      %tmp17_121 = arith.constant 1 : i32 loc(#loc311)
+      %tmp17_122 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc311)
+      %tmp17_123 = arith.addi %tmp17_122, %tmp17_119 : tensor<1x128xi32> loc(#loc311)
+      %tmp17_124 = arith.constant 128 : i32 loc(#loc312)
+      %tmp17_125 = arith.constant 128 : i32 loc(#loc312)
+      %tmp17_126 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc312)
+      %tmp17_127 = arith.muli %tmp17_126, %x0_12 : tensor<4x1xi32> loc(#loc312)
+      %tmp17_128 = tt.broadcast %tmp17_123 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc313)
+      %tmp17_129 = tt.broadcast %tmp17_127 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc313)
+      %tmp17_130 = arith.addi %tmp17_128, %tmp17_129 : tensor<4x128xi32> loc(#loc313)
+      %tmp17_131 = arith.constant 36864 : i32 loc(#loc314)
+      %tmp17_132 = arith.constant 36864 : i32 loc(#loc314)
+      %tmp17_133 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc314)
+      %tmp17_134 = arith.muli %tmp17_133, %x1_15 : tensor<4x1xi32> loc(#loc314)
+      %tmp17_135 = tt.broadcast %tmp17_134 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc315)
+      %tmp17_136 = arith.addi %tmp17_130, %tmp17_135 : tensor<4x128xi32> loc(#loc315)
+      %tmp17_137 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc316)
+      %tmp17_138 = tt.addptr %tmp17_137, %tmp17_136 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc316)
+      %tmp17_139 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x128xi1> loc(#loc317)
+      %tmp17_140 = arith.constant 0.000000e+00 : f32 loc(#loc318)
+      %tmp17_141 = tt.broadcast %tmp17_139 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc318)
+      %tmp17_142 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc318)
+      %tmp17_143 = arith.truncf %tmp17_142 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc318)
+      %tmp17_144 = tt.load %tmp17_138, %tmp17_141, %tmp17_143 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>> loc(#loc318)
+      %tmp17_145 = arith.extf %tmp17_144 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc319)
+      %tmp19 = arith.constant 1.280000e+02 : f32 loc(#loc320)
+      %tmp20 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc321)
+      %tmp20_146 = arith.divf %tmp10_20, %tmp20 : tensor<4x1xf32> loc(#loc321)
+      %tmp21 = arith.constant 9.99999997E-7 : f32 loc(#loc322)
+      %tmp22 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc323)
+      %tmp22_147 = arith.addf %tmp20_146, %tmp22 : tensor<4x1xf32> loc(#loc323)
+      %tmp23 = tt.extern_elementwise %tmp22_147 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc324)
+      %tmp24 = tt.broadcast %tmp23 : tensor<4x1xf32> -> tensor<4x128xf32> loc(#loc325)
+      %tmp24_148 = arith.mulf %tmp17_145, %tmp24 : tensor<4x128xf32> loc(#loc325)
+      %tmp25 = arith.constant 2 : i32 loc(#loc326)
+      %tmp25_149 = arith.constant 2 : i32 loc(#loc326)
+      %tmp25_150 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc326)
+      %tmp25_151 = arith.muli %tmp25_150, %r0_4_30 : tensor<1x128xi32> loc(#loc326)
+      %tmp25_152 = arith.constant 1 : i32 loc(#loc327)
+      %tmp25_153 = arith.constant 1 : i32 loc(#loc327)
+      %tmp25_154 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc327)
+      %tmp25_155 = arith.addi %tmp25_154, %tmp25_151 : tensor<1x128xi32> loc(#loc327)
+      %tmp25_156 = tt.broadcast %tmp25_155 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc328)
+      %tmp25_157 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc329)
+      %tmp25_158 = tt.addptr %tmp25_157, %tmp25_156 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc329)
+      %tmp25_159 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x128xi1> loc(#loc330)
+      %tmp25_160 = arith.constant 0.000000e+00 : f32 loc(#loc331)
+      %tmp25_161 = tt.broadcast %tmp25_159 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc331)
+      %tmp25_162 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc331)
+      %tmp25_163 = arith.truncf %tmp25_162 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc331)
+      %tmp25_164 = tt.load %tmp25_158, %tmp25_161, %tmp25_163 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>> loc(#loc331)
+      %tmp25_165 = arith.extf %tmp25_164 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc332)
+      %tmp27 = arith.mulf %tmp24_148, %tmp25_165 : tensor<4x128xf32> loc(#loc333)
+      %tmp29 = arith.constant 0.000000e+00 : f32 loc(#loc334)
+      %tmp29_166 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc334)
+      %tmp29_167 = arith.subf %tmp29_166, %tmp27 : tensor<4x128xf32> loc(#loc334)
+      %tmp30 = arith.constant 0.000000e+00 : f32 loc(#loc335)
+      %tmp30_168 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc335)
+      %tmp31 = tt.broadcast %tmp16_116 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc336)
+      %tmp31_169 = arith.select %tmp31, %tmp29_167, %tmp30_168 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc336)
+      %tmp32 = arith.extsi %r0_3_27 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc337)
+      %tmp32_170 = arith.constant dense<1> : tensor<1x128xi64> loc(#loc337)
+      %tmp32_171 = arith.cmpi sge, %tmp32, %tmp32_170 : tensor<1x128xi64> loc(#loc337)
+      %tmp33 = arith.constant 2 : i64 loc(#loc338)
+      %tmp33_172 = arith.constant dense<2> : tensor<1x1xi64> loc(#loc338)
+      %tmp34 = arith.extsi %r0_3_27 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc339)
+      %tmp34_173 = arith.constant dense<2> : tensor<1x128xi64> loc(#loc339)
+      %tmp34_174 = arith.cmpi slt, %tmp34, %tmp34_173 : tensor<1x128xi64> loc(#loc339)
+      %tmp35 = arith.constant 2 : i32 loc(#loc340)
+      %tmp35_175 = arith.constant 2 : i32 loc(#loc340)
+      %tmp35_176 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc340)
+      %tmp35_177 = arith.muli %tmp35_176, %r0_4_30 : tensor<1x128xi32> loc(#loc340)
+      %tmp35_178 = arith.constant 128 : i32 loc(#loc341)
+      %tmp35_179 = arith.constant 128 : i32 loc(#loc341)
+      %tmp35_180 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc341)
+      %tmp35_181 = arith.muli %tmp35_180, %x0_12 : tensor<4x1xi32> loc(#loc341)
+      %tmp35_182 = tt.broadcast %tmp35_177 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc342)
+      %tmp35_183 = tt.broadcast %tmp35_181 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc342)
+      %tmp35_184 = arith.addi %tmp35_182, %tmp35_183 : tensor<4x128xi32> loc(#loc342)
+      %tmp35_185 = arith.constant 36864 : i32 loc(#loc343)
+      %tmp35_186 = arith.constant 36864 : i32 loc(#loc343)
+      %tmp35_187 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc343)
+      %tmp35_188 = arith.muli %tmp35_187, %x1_15 : tensor<4x1xi32> loc(#loc343)
+      %tmp35_189 = tt.broadcast %tmp35_188 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc344)
+      %tmp35_190 = arith.addi %tmp35_184, %tmp35_189 : tensor<4x128xi32> loc(#loc344)
+      %tmp35_191 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc345)
+      %tmp35_192 = tt.addptr %tmp35_191, %tmp35_190 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc345)
+      %tmp35_193 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x128xi1> loc(#loc346)
+      %tmp35_194 = arith.constant 0.000000e+00 : f32 loc(#loc347)
+      %tmp35_195 = tt.broadcast %tmp35_193 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc347)
+      %tmp35_196 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc347)
+      %tmp35_197 = arith.truncf %tmp35_196 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc347)
+      %tmp35_198 = tt.load %tmp35_192, %tmp35_195, %tmp35_197 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>> loc(#loc347)
+      %tmp35_199 = arith.extf %tmp35_198 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc348)
+      %tmp37 = arith.constant 1.280000e+02 : f32 loc(#loc349)
+      %tmp38 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc350)
+      %tmp38_200 = arith.divf %tmp10_20, %tmp38 : tensor<4x1xf32> loc(#loc350)
+      %tmp39 = arith.constant 9.99999997E-7 : f32 loc(#loc351)
+      %tmp40 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc352)
+      %tmp40_201 = arith.addf %tmp38_200, %tmp40 : tensor<4x1xf32> loc(#loc352)
+      %tmp41 = tt.extern_elementwise %tmp40_201 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc353)
+      %tmp42 = tt.broadcast %tmp41 : tensor<4x1xf32> -> tensor<4x128xf32> loc(#loc354)
+      %tmp42_202 = arith.mulf %tmp35_199, %tmp42 : tensor<4x128xf32> loc(#loc354)
+      %tmp43 = arith.constant 2 : i32 loc(#loc355)
+      %tmp43_203 = arith.constant 2 : i32 loc(#loc355)
+      %tmp43_204 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc355)
+      %tmp43_205 = arith.muli %tmp43_204, %r0_4_30 : tensor<1x128xi32> loc(#loc355)
+      %tmp43_206 = tt.broadcast %tmp43_205 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc356)
+      %tmp43_207 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc357)
+      %tmp43_208 = tt.addptr %tmp43_207, %tmp43_206 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc357)
+      %tmp43_209 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x128xi1> loc(#loc358)
+      %tmp43_210 = arith.constant 0.000000e+00 : f32 loc(#loc359)
+      %tmp43_211 = tt.broadcast %tmp43_209 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc359)
+      %tmp43_212 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc359)
+      %tmp43_213 = arith.truncf %tmp43_212 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc359)
+      %tmp43_214 = tt.load %tmp43_208, %tmp43_211, %tmp43_213 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>> loc(#loc359)
+      %tmp43_215 = arith.extf %tmp43_214 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc360)
+      %tmp45 = arith.mulf %tmp42_202, %tmp43_215 : tensor<4x128xf32> loc(#loc361)
+      %tmp47 = arith.constant 0.000000e+00 : f32 loc(#loc362)
+      %tmp47_216 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc362)
+      %tmp48 = tt.broadcast %tmp32_171 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc363)
+      %tmp48_217 = arith.select %tmp48, %tmp45, %tmp47_216 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc363)
+      %tmp49 = tt.broadcast %tmp16_116 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc364)
+      %tmp49_218 = arith.select %tmp49, %tmp31_169, %tmp48_217 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc364)
+      %tmp52 = arith.constant 1.280000e+02 : f32 loc(#loc365)
+      %tmp53 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc366)
+      %tmp53_219 = arith.divf %tmp10_20, %tmp53 : tensor<4x1xf32> loc(#loc366)
+      %tmp54 = arith.constant 9.99999997E-7 : f32 loc(#loc367)
+      %tmp55 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc368)
+      %tmp55_220 = arith.addf %tmp53_219, %tmp55 : tensor<4x1xf32> loc(#loc368)
+      %tmp56 = tt.extern_elementwise %tmp55_220 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc369)
+      %tmp57 = tt.broadcast %tmp56 : tensor<4x1xf32> -> tensor<4x128xf32> loc(#loc370)
+      %tmp57_221 = arith.mulf %tmp50_50, %tmp57 : tensor<4x128xf32> loc(#loc370)
+      %tmp60 = tt.broadcast %tmp58_56 : tensor<1x128xf32> -> tensor<4x128xf32> loc(#loc371)
+      %tmp60_222 = arith.mulf %tmp57_221, %tmp60 : tensor<4x128xf32> loc(#loc371)
+      %tmp64 = arith.mulf %tmp60_222, %tmp63_68 : tensor<4x128xf32> loc(#loc372)
+      %tmp67 = arith.mulf %tmp49_218, %tmp66_80 : tensor<4x128xf32> loc(#loc373)
+      %tmp68 = arith.addf %tmp64, %tmp67 : tensor<4x128xf32> loc(#loc374)
+      %tmp70 = arith.constant 2 : i32 loc(#loc375)
+      %tmp70_223 = arith.constant 2 : i32 loc(#loc375)
+      %tmp70_224 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc375)
+      %tmp70_225 = arith.muli %tmp70_224, %r0_4_30 : tensor<1x128xi32> loc(#loc375)
+      %tmp70_226 = arith.constant 4097 : i32 loc(#loc376)
+      %tmp70_227 = arith.constant 4097 : i32 loc(#loc376)
+      %tmp70_228 = arith.constant dense<4097> : tensor<1x128xi32> loc(#loc376)
+      %tmp70_229 = arith.addi %tmp70_228, %tmp70_225 : tensor<1x128xi32> loc(#loc376)
+      %tmp70_230 = arith.constant 128 : i32 loc(#loc377)
+      %tmp70_231 = arith.constant 128 : i32 loc(#loc377)
+      %tmp70_232 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc377)
+      %tmp70_233 = arith.muli %tmp70_232, %x0_12 : tensor<4x1xi32> loc(#loc377)
+      %tmp70_234 = tt.broadcast %tmp70_229 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc378)
+      %tmp70_235 = tt.broadcast %tmp70_233 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc378)
+      %tmp70_236 = arith.addi %tmp70_234, %tmp70_235 : tensor<4x128xi32> loc(#loc378)
+      %tmp70_237 = arith.constant 36864 : i32 loc(#loc379)
+      %tmp70_238 = arith.constant 36864 : i32 loc(#loc379)
+      %tmp70_239 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc379)
+      %tmp70_240 = arith.muli %tmp70_239, %x1_15 : tensor<4x1xi32> loc(#loc379)
+      %tmp70_241 = tt.broadcast %tmp70_240 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc380)
+      %tmp70_242 = arith.addi %tmp70_236, %tmp70_241 : tensor<4x128xi32> loc(#loc380)
+      %tmp70_243 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc381)
+      %tmp70_244 = tt.addptr %tmp70_243, %tmp70_242 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc381)
+      %tmp70_245 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x128xi1> loc(#loc382)
+      %tmp70_246 = arith.constant 0.000000e+00 : f32 loc(#loc383)
+      %tmp70_247 = tt.broadcast %tmp70_245 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc383)
+      %tmp70_248 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc383)
+      %tmp70_249 = arith.truncf %tmp70_248 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc383)
+      %tmp70_250 = tt.load %tmp70_244, %tmp70_247, %tmp70_249 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>> loc(#loc383)
+      %tmp70_251 = arith.extf %tmp70_250 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc384)
+      %tmp72 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc385)
+      %tmp72_252 = arith.divf %tmp4_19, %tmp72 : tensor<4x1xf32> loc(#loc385)
+      %tmp73 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc386)
+      %tmp73_253 = arith.addf %tmp72_252, %tmp73 : tensor<4x1xf32> loc(#loc386)
+      %tmp74 = tt.extern_elementwise %tmp73_253 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc387)
+      %tmp75 = tt.broadcast %tmp74 : tensor<4x1xf32> -> tensor<4x128xf32> loc(#loc388)
+      %tmp75_254 = arith.mulf %tmp70_251, %tmp75 : tensor<4x128xf32> loc(#loc388)
+      %tmp76 = arith.constant 2 : i32 loc(#loc389)
+      %tmp76_255 = arith.constant 2 : i32 loc(#loc389)
+      %tmp76_256 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc389)
+      %tmp76_257 = arith.muli %tmp76_256, %r0_4_30 : tensor<1x128xi32> loc(#loc389)
+      %tmp76_258 = arith.constant 1 : i32 loc(#loc390)
+      %tmp76_259 = arith.constant 1 : i32 loc(#loc390)
+      %tmp76_260 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc390)
+      %tmp76_261 = arith.addi %tmp76_260, %tmp76_257 : tensor<1x128xi32> loc(#loc390)
+      %tmp76_262 = tt.broadcast %tmp76_261 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc391)
+      %tmp76_263 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc392)
+      %tmp76_264 = tt.addptr %tmp76_263, %tmp76_262 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc392)
+      %tmp76_265 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x128xi1> loc(#loc393)
+      %tmp76_266 = arith.constant 0.000000e+00 : f32 loc(#loc394)
+      %tmp76_267 = tt.broadcast %tmp76_265 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc394)
+      %tmp76_268 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc394)
+      %tmp76_269 = arith.truncf %tmp76_268 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc394)
+      %tmp76_270 = tt.load %tmp76_264, %tmp76_267, %tmp76_269 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>> loc(#loc394)
+      %tmp76_271 = arith.extf %tmp76_270 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc395)
+      %tmp78 = arith.mulf %tmp75_254, %tmp76_271 : tensor<4x128xf32> loc(#loc396)
+      %tmp80 = arith.constant 0.000000e+00 : f32 loc(#loc397)
+      %tmp80_272 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc397)
+      %tmp80_273 = arith.subf %tmp80_272, %tmp78 : tensor<4x128xf32> loc(#loc397)
+      %tmp81 = arith.constant 0.000000e+00 : f32 loc(#loc398)
+      %tmp81_274 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc398)
+      %tmp82 = tt.broadcast %tmp16_116 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc399)
+      %tmp82_275 = arith.select %tmp82, %tmp80_273, %tmp81_274 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc399)
+      %tmp83 = arith.constant 2 : i32 loc(#loc400)
+      %tmp83_276 = arith.constant 2 : i32 loc(#loc400)
+      %tmp83_277 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc400)
+      %tmp83_278 = arith.muli %tmp83_277, %r0_4_30 : tensor<1x128xi32> loc(#loc400)
+      %tmp83_279 = arith.constant 4096 : i32 loc(#loc401)
+      %tmp83_280 = arith.constant 4096 : i32 loc(#loc401)
+      %tmp83_281 = arith.constant dense<4096> : tensor<1x128xi32> loc(#loc401)
+      %tmp83_282 = arith.addi %tmp83_281, %tmp83_278 : tensor<1x128xi32> loc(#loc401)
+      %tmp83_283 = arith.constant 128 : i32 loc(#loc402)
+      %tmp83_284 = arith.constant 128 : i32 loc(#loc402)
+      %tmp83_285 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc402)
+      %tmp83_286 = arith.muli %tmp83_285, %x0_12 : tensor<4x1xi32> loc(#loc402)
+      %tmp83_287 = tt.broadcast %tmp83_282 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc403)
+      %tmp83_288 = tt.broadcast %tmp83_286 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc403)
+      %tmp83_289 = arith.addi %tmp83_287, %tmp83_288 : tensor<4x128xi32> loc(#loc403)
+      %tmp83_290 = arith.constant 36864 : i32 loc(#loc404)
+      %tmp83_291 = arith.constant 36864 : i32 loc(#loc404)
+      %tmp83_292 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc404)
+      %tmp83_293 = arith.muli %tmp83_292, %x1_15 : tensor<4x1xi32> loc(#loc404)
+      %tmp83_294 = tt.broadcast %tmp83_293 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc405)
+      %tmp83_295 = arith.addi %tmp83_289, %tmp83_294 : tensor<4x128xi32> loc(#loc405)
+      %tmp83_296 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc406)
+      %tmp83_297 = tt.addptr %tmp83_296, %tmp83_295 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc406)
+      %tmp83_298 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x128xi1> loc(#loc407)
+      %tmp83_299 = arith.constant 0.000000e+00 : f32 loc(#loc408)
+      %tmp83_300 = tt.broadcast %tmp83_298 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc408)
+      %tmp83_301 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc408)
+      %tmp83_302 = arith.truncf %tmp83_301 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc408)
+      %tmp83_303 = tt.load %tmp83_297, %tmp83_300, %tmp83_302 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>> loc(#loc408)
+      %tmp83_304 = arith.extf %tmp83_303 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc409)
+      %tmp85 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc410)
+      %tmp85_305 = arith.divf %tmp4_19, %tmp85 : tensor<4x1xf32> loc(#loc410)
+      %tmp86 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc411)
+      %tmp86_306 = arith.addf %tmp85_305, %tmp86 : tensor<4x1xf32> loc(#loc411)
+      %tmp87 = tt.extern_elementwise %tmp86_306 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc412)
+      %tmp88 = tt.broadcast %tmp87 : tensor<4x1xf32> -> tensor<4x128xf32> loc(#loc413)
+      %tmp88_307 = arith.mulf %tmp83_304, %tmp88 : tensor<4x128xf32> loc(#loc413)
+      %tmp89 = arith.constant 2 : i32 loc(#loc414)
+      %tmp89_308 = arith.constant 2 : i32 loc(#loc414)
+      %tmp89_309 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc414)
+      %tmp89_310 = arith.muli %tmp89_309, %r0_4_30 : tensor<1x128xi32> loc(#loc414)
+      %tmp89_311 = tt.broadcast %tmp89_310 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc415)
+      %tmp89_312 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc416)
+      %tmp89_313 = tt.addptr %tmp89_312, %tmp89_311 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc416)
+      %tmp89_314 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x128xi1> loc(#loc417)
+      %tmp89_315 = arith.constant 0.000000e+00 : f32 loc(#loc418)
+      %tmp89_316 = tt.broadcast %tmp89_314 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc418)
+      %tmp89_317 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc418)
+      %tmp89_318 = arith.truncf %tmp89_317 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc418)
+      %tmp89_319 = tt.load %tmp89_313, %tmp89_316, %tmp89_318 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>> loc(#loc418)
+      %tmp89_320 = arith.extf %tmp89_319 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc419)
+      %tmp91 = arith.mulf %tmp88_307, %tmp89_320 : tensor<4x128xf32> loc(#loc420)
+      %tmp93 = arith.constant 0.000000e+00 : f32 loc(#loc421)
+      %tmp93_321 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc421)
+      %tmp94 = tt.broadcast %tmp32_171 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc422)
+      %tmp94_322 = arith.select %tmp94, %tmp91, %tmp93_321 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc422)
+      %tmp95 = tt.broadcast %tmp16_116 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc423)
+      %tmp95_323 = arith.select %tmp95, %tmp82_275, %tmp94_322 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc423)
+      %tmp98 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc424)
+      %tmp98_324 = arith.divf %tmp4_19, %tmp98 : tensor<4x1xf32> loc(#loc424)
+      %tmp99 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc425)
+      %tmp99_325 = arith.addf %tmp98_324, %tmp99 : tensor<4x1xf32> loc(#loc425)
+      %tmp100 = tt.extern_elementwise %tmp99_325 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc426)
+      %tmp101 = tt.broadcast %tmp100 : tensor<4x1xf32> -> tensor<4x128xf32> loc(#loc427)
+      %tmp101_326 = arith.mulf %tmp96_104, %tmp101 : tensor<4x128xf32> loc(#loc427)
+      %tmp104 = tt.broadcast %tmp102_110 : tensor<1x128xf32> -> tensor<4x128xf32> loc(#loc428)
+      %tmp104_327 = arith.mulf %tmp101_326, %tmp104 : tensor<4x128xf32> loc(#loc428)
+      %tmp107 = arith.mulf %tmp104_327, %tmp63_68 : tensor<4x128xf32> loc(#loc429)
+      %tmp109 = arith.mulf %tmp95_323, %tmp66_80 : tensor<4x128xf32> loc(#loc430)
+      %tmp110 = arith.addf %tmp107, %tmp109 : tensor<4x128xf32> loc(#loc431)
+      %c128_i32_328 = arith.constant 128 : i32 loc(#loc204)
+      %c128_i32_329 = arith.constant 128 : i32 loc(#loc204)
+      %cst = arith.constant dense<128> : tensor<4x1xi32> loc(#loc204)
+      %8 = arith.muli %cst, %xindex_7 : tensor<4x1xi32> loc(#loc204)
+      %9 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc205)
+      %10 = tt.broadcast %8 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc205)
+      %11 = arith.addi %9, %10 : tensor<4x128xi32> loc(#loc205)
+      %12 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc206)
+      %13 = tt.addptr %12, %11 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc206)
+      %14 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc207)
+      %15 = arith.truncf %tmp68 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc207)
+      tt.store %13, %15, %14 : tensor<4x128x!tt.ptr<bf16>> loc(#loc207)
+      %c128_i32_330 = arith.constant 128 : i32 loc(#loc208)
+      %c128_i32_331 = arith.constant 128 : i32 loc(#loc208)
+      %cst_332 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc208)
+      %16 = arith.muli %cst_332, %xindex_7 : tensor<4x1xi32> loc(#loc208)
+      %17 = tt.broadcast %r0_index_23 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc209)
+      %18 = tt.broadcast %16 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc209)
+      %19 = arith.addi %17, %18 : tensor<4x128xi32> loc(#loc209)
+      %20 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc210)
+      %21 = tt.addptr %20, %19 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc210)
+      %22 = tt.broadcast %r0_mask_24 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc211)
+      %23 = arith.truncf %tmp110 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc211)
+      tt.store %21, %23, %22 : tensor<4x128x!tt.ptr<bf16>> loc(#loc211)
+    } loc(#loc44)
+    tt.return loc(#loc212)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.sum__fp32S4_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<4x128xf32> loc("input"(#loc213))) -> tensor<4xf32> attributes {noinline = false} {
+    %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc214)
+      tt.reduce.return %2 : f32 loc(#loc214)
+    }) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc214)
+    tt.return %0 : tensor<4xf32> loc(#loc216)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<4xf32> loc(#loc217)
+    tt.return %1 : tensor<4xf32> loc(#loc217)
+  } loc(#loc213)
+  tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc218)), %b: f32 loc("b"(#loc218))) -> f32 attributes {noinline = false} {
+    %0 = arith.addf %a, %b : f32 loc(#loc219)
+    tt.return %0 : f32 loc(#loc220)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : f32 loc(#loc221)
+    tt.return %1 : f32 loc(#loc221)
+  } loc(#loc218)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":25:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":30:43)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":32:44)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:45)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:56)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:46)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:42)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:53)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:64)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":68:35)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":69:25)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":70:35)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:52)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:63)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":74:16)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":76:16)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:57)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:55)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:63)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:95)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":85:42)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":88:35)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":89:24)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:37)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:48)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:59)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":92:16)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":93:25)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":94:16)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":95:24)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":96:32)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:53)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:59)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:91)
+#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":102:42)
+#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":106:16)
+#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":107:25)
+#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":108:16)
+#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":109:24)
+#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":110:32)
+#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:44)
+#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc149 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:55)
+#loc150 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc151 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:66)
+#loc152 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc153 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc154 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:81)
+#loc155 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc156 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc157 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc158 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc159 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc160 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc161 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:57)
+#loc162 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:55)
+#loc163 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:63)
+#loc164 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc165 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:95)
+#loc166 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc167 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc168 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc169 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc170 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":132:42)
+#loc171 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc172 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:44)
+#loc173 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc174 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:55)
+#loc175 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc176 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:66)
+#loc177 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc178 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc179 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:81)
+#loc180 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc181 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc182 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":136:24)
+#loc183 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":137:24)
+#loc184 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":138:32)
+#loc185 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc186 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:53)
+#loc187 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:59)
+#loc188 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc189 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:91)
+#loc190 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc191 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc192 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc193 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":144:42)
+#loc194 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc195 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc196 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":148:24)
+#loc197 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":149:24)
+#loc198 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":150:33)
+#loc199 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc200 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc201 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc202 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc203 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc204 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc205 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc206 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc207 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc208 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:43)
+#loc209 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:39)
+#loc210 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc211 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc212 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc214 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc216 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11)
+#loc217 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4)
+#loc219 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc220 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11)
+#loc221 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4)
+#loc231 = loc("xnumel"(#loc1))
+#loc232 = loc("r0_numel"(#loc2))
+#loc233 = loc("xoffset"(#loc3))
+#loc234 = loc("xoffset"(#loc4))
+#loc235 = loc("xindex"(#loc5))
+#loc236 = loc("xindex"(#loc6))
+#loc237 = loc("xindex"(#loc7))
+#loc238 = loc("xmask"(#loc8))
+#loc239 = loc("r0_base"(#loc9))
+#loc240 = loc("r0_base"(#loc10))
+#loc241 = loc("x0"(#loc11))
+#loc242 = loc("x1"(#loc12))
+#loc243 = loc("_tmp4"(#loc13))
+#loc244 = loc("_tmp10"(#loc14))
+#loc245 = loc("_tmp4"(#loc15))
+#loc246 = loc("r0_index"(#loc16))
+#loc247 = loc("r0_mask"(#loc17))
+#loc248 = loc("tmp0"(#loc18))
+#loc249 = loc("tmp0"(#loc19))
+#loc250 = loc("tmp0"(#loc20))
+#loc251 = loc("tmp0"(#loc21))
+#loc252 = loc("tmp0"(#loc22))
+#loc253 = loc("tmp0"(#loc23))
+#loc254 = loc("tmp0"(#loc24))
+#loc255 = loc("tmp0"(#loc25))
+#loc256 = loc("tmp6"(#loc26))
+#loc257 = loc("tmp6"(#loc27))
+#loc258 = loc("tmp6"(#loc28))
+#loc259 = loc("tmp6"(#loc29))
+#loc260 = loc("tmp6"(#loc30))
+#loc261 = loc("tmp6"(#loc31))
+#loc262 = loc("tmp6"(#loc32))
+#loc263 = loc("tmp2"(#loc33))
+#loc264 = loc("tmp5"(#loc34))
+#loc265 = loc("_tmp4"(#loc35))
+#loc266 = loc("tmp8"(#loc36))
+#loc267 = loc("tmp11"(#loc37))
+#loc268 = loc("_tmp10"(#loc38))
+#loc269 = loc("tmp4"(#loc40))
+#loc270 = loc("tmp4"(#loc41))
+#loc271 = loc("tmp10"(#loc42))
+#loc272 = loc("tmp10"(#loc43))
+#loc273 = loc("r0_index"(#loc45))
+#loc274 = loc("r0_mask"(#loc46))
+#loc275 = loc("r0_3"(#loc47))
+#loc276 = loc("r0_4"(#loc48))
+#loc277 = loc("tmp50"(#loc49))
+#loc278 = loc("tmp50"(#loc50))
+#loc279 = loc("tmp50"(#loc51))
+#loc280 = loc("tmp50"(#loc52))
+#loc281 = loc("tmp50"(#loc53))
+#loc282 = loc("tmp50"(#loc54))
+#loc283 = loc("tmp50"(#loc55))
+#loc284 = loc("tmp58"(#loc56))
+#loc285 = loc("tmp58"(#loc57))
+#loc286 = loc("tmp58"(#loc58))
+#loc287 = loc("tmp63"(#loc59))
+#loc288 = loc("tmp63"(#loc60))
+#loc289 = loc("tmp63"(#loc61))
+#loc290 = loc("tmp63"(#loc62))
+#loc291 = loc("tmp66"(#loc63))
+#loc292 = loc("tmp66"(#loc64))
+#loc293 = loc("tmp66"(#loc65))
+#loc294 = loc("tmp66"(#loc66))
+#loc295 = loc("tmp96"(#loc67))
+#loc296 = loc("tmp96"(#loc68))
+#loc297 = loc("tmp96"(#loc69))
+#loc298 = loc("tmp96"(#loc70))
+#loc299 = loc("tmp96"(#loc71))
+#loc300 = loc("tmp96"(#loc72))
+#loc301 = loc("tmp96"(#loc73))
+#loc302 = loc("tmp96"(#loc74))
+#loc303 = loc("tmp102"(#loc75))
+#loc304 = loc("tmp102"(#loc76))
+#loc305 = loc("tmp102"(#loc77))
+#loc306 = loc("tmp13"(#loc78))
+#loc307 = loc("tmp14"(#loc79))
+#loc308 = loc("tmp15"(#loc80))
+#loc309 = loc("tmp16"(#loc81))
+#loc310 = loc("tmp17"(#loc82))
+#loc311 = loc("tmp17"(#loc83))
+#loc312 = loc("tmp17"(#loc84))
+#loc313 = loc("tmp17"(#loc85))
+#loc314 = loc("tmp17"(#loc86))
+#loc315 = loc("tmp17"(#loc87))
+#loc316 = loc("tmp17"(#loc88))
+#loc317 = loc("tmp17"(#loc89))
+#loc318 = loc("tmp17"(#loc90))
+#loc319 = loc("tmp17"(#loc91))
+#loc320 = loc("tmp19"(#loc92))
+#loc321 = loc("tmp20"(#loc93))
+#loc322 = loc("tmp21"(#loc94))
+#loc323 = loc("tmp22"(#loc95))
+#loc324 = loc("tmp23"(#loc96))
+#loc325 = loc("tmp24"(#loc97))
+#loc326 = loc("tmp25"(#loc98))
+#loc327 = loc("tmp25"(#loc99))
+#loc328 = loc("tmp25"(#loc100))
+#loc329 = loc("tmp25"(#loc101))
+#loc330 = loc("tmp25"(#loc102))
+#loc331 = loc("tmp25"(#loc103))
+#loc332 = loc("tmp25"(#loc104))
+#loc333 = loc("tmp27"(#loc105))
+#loc334 = loc("tmp29"(#loc106))
+#loc335 = loc("tmp30"(#loc107))
+#loc336 = loc("tmp31"(#loc108))
+#loc337 = loc("tmp32"(#loc109))
+#loc338 = loc("tmp33"(#loc110))
+#loc339 = loc("tmp34"(#loc111))
+#loc340 = loc("tmp35"(#loc112))
+#loc341 = loc("tmp35"(#loc113))
+#loc342 = loc("tmp35"(#loc114))
+#loc343 = loc("tmp35"(#loc115))
+#loc344 = loc("tmp35"(#loc116))
+#loc345 = loc("tmp35"(#loc117))
+#loc346 = loc("tmp35"(#loc118))
+#loc347 = loc("tmp35"(#loc119))
+#loc348 = loc("tmp35"(#loc120))
+#loc349 = loc("tmp37"(#loc121))
+#loc350 = loc("tmp38"(#loc122))
+#loc351 = loc("tmp39"(#loc123))
+#loc352 = loc("tmp40"(#loc124))
+#loc353 = loc("tmp41"(#loc125))
+#loc354 = loc("tmp42"(#loc126))
+#loc355 = loc("tmp43"(#loc127))
+#loc356 = loc("tmp43"(#loc128))
+#loc357 = loc("tmp43"(#loc129))
+#loc358 = loc("tmp43"(#loc130))
+#loc359 = loc("tmp43"(#loc131))
+#loc360 = loc("tmp43"(#loc132))
+#loc361 = loc("tmp45"(#loc133))
+#loc362 = loc("tmp47"(#loc134))
+#loc363 = loc("tmp48"(#loc135))
+#loc364 = loc("tmp49"(#loc136))
+#loc365 = loc("tmp52"(#loc137))
+#loc366 = loc("tmp53"(#loc138))
+#loc367 = loc("tmp54"(#loc139))
+#loc368 = loc("tmp55"(#loc140))
+#loc369 = loc("tmp56"(#loc141))
+#loc370 = loc("tmp57"(#loc142))
+#loc371 = loc("tmp60"(#loc143))
+#loc372 = loc("tmp64"(#loc144))
+#loc373 = loc("tmp67"(#loc145))
+#loc374 = loc("tmp68"(#loc146))
+#loc375 = loc("tmp70"(#loc147))
+#loc376 = loc("tmp70"(#loc148))
+#loc377 = loc("tmp70"(#loc149))
+#loc378 = loc("tmp70"(#loc150))
+#loc379 = loc("tmp70"(#loc151))
+#loc380 = loc("tmp70"(#loc152))
+#loc381 = loc("tmp70"(#loc153))
+#loc382 = loc("tmp70"(#loc154))
+#loc383 = loc("tmp70"(#loc155))
+#loc384 = loc("tmp70"(#loc156))
+#loc385 = loc("tmp72"(#loc157))
+#loc386 = loc("tmp73"(#loc158))
+#loc387 = loc("tmp74"(#loc159))
+#loc388 = loc("tmp75"(#loc160))
+#loc389 = loc("tmp76"(#loc161))
+#loc390 = loc("tmp76"(#loc162))
+#loc391 = loc("tmp76"(#loc163))
+#loc392 = loc("tmp76"(#loc164))
+#loc393 = loc("tmp76"(#loc165))
+#loc394 = loc("tmp76"(#loc166))
+#loc395 = loc("tmp76"(#loc167))
+#loc396 = loc("tmp78"(#loc168))
+#loc397 = loc("tmp80"(#loc169))
+#loc398 = loc("tmp81"(#loc170))
+#loc399 = loc("tmp82"(#loc171))
+#loc400 = loc("tmp83"(#loc172))
+#loc401 = loc("tmp83"(#loc173))
+#loc402 = loc("tmp83"(#loc174))
+#loc403 = loc("tmp83"(#loc175))
+#loc404 = loc("tmp83"(#loc176))
+#loc405 = loc("tmp83"(#loc177))
+#loc406 = loc("tmp83"(#loc178))
+#loc407 = loc("tmp83"(#loc179))
+#loc408 = loc("tmp83"(#loc180))
+#loc409 = loc("tmp83"(#loc181))
+#loc410 = loc("tmp85"(#loc182))
+#loc411 = loc("tmp86"(#loc183))
+#loc412 = loc("tmp87"(#loc184))
+#loc413 = loc("tmp88"(#loc185))
+#loc414 = loc("tmp89"(#loc186))
+#loc415 = loc("tmp89"(#loc187))
+#loc416 = loc("tmp89"(#loc188))
+#loc417 = loc("tmp89"(#loc189))
+#loc418 = loc("tmp89"(#loc190))
+#loc419 = loc("tmp89"(#loc191))
+#loc420 = loc("tmp91"(#loc192))
+#loc421 = loc("tmp93"(#loc193))
+#loc422 = loc("tmp94"(#loc194))
+#loc423 = loc("tmp95"(#loc195))
+#loc424 = loc("tmp98"(#loc196))
+#loc425 = loc("tmp99"(#loc197))
+#loc426 = loc("tmp100"(#loc198))
+#loc427 = loc("tmp101"(#loc199))
+#loc428 = loc("tmp104"(#loc200))
+#loc429 = loc("tmp107"(#loc201))
+#loc430 = loc("tmp109"(#loc202))
+#loc431 = loc("tmp110"(#loc203))
+#loc435 = loc("_tmp10"(#loc245))
diff --git a/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..2cefa8cb793bb6248b8e1d562ebf076b666bfce3
--- /dev/null
+++ b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir
@@ -0,0 +1,495 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 4], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
+#blocked2 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc1 = loc(unknown)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc130 = loc("in_out_ptr0"(#loc))
+#loc131 = loc("in_out_ptr1"(#loc))
+#loc132 = loc("in_ptr0"(#loc))
+#loc133 = loc("in_ptr1"(#loc))
+#loc134 = loc("in_ptr2"(#loc))
+#loc135 = loc("in_ptr3"(#loc))
+#loc136 = loc("in_ptr4"(#loc))
+#loc137 = loc("xnumel"(#loc))
+#loc138 = loc("r0_numel"(#loc))
+#loc166 = loc("tmp4"(#loc30))
+#loc168 = loc("tmp10"(#loc33))
+#loc259 = loc(callsite(#loc1 at #loc166))
+#loc261 = loc(callsite(#loc1 at #loc168))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4097> : tensor<1x128xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<1> : tensor<1x128xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<1> : tensor<1x128xi64, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<2> : tensor<1x128xi32, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<36864> : tensor<4x1xi32, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<36864> : tensor<4x1xi32, #blocked1> loc(#loc1)
+    %cst_5 = arith.constant dense<128> : tensor<4x1xi32, #blocked> loc(#loc1)
+    %cst_6 = arith.constant dense<128> : tensor<4x1xi32, #blocked1> loc(#loc1)
+    %cst_7 = arith.constant dense<4096> : tensor<1x128xi32, #blocked> loc(#loc1)
+    %cst_8 = arith.constant dense<4096> : tensor<1x128xi32, #blocked1> loc(#loc1)
+    %cst_9 = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1)
+    %cst_10 = arith.constant dense<128> : tensor<1x128xi32, #blocked1> loc(#loc1)
+    %cst_11 = arith.constant dense<32> : tensor<4x1xi32, #blocked> loc(#loc1)
+    %cst_12 = arith.constant dense<32> : tensor<4x1xi32, #blocked1> loc(#loc1)
+    %c4_i32 = arith.constant 4 : i32 loc(#loc1)
+    %cst_13 = arith.constant dense<0.000000e+00> : tensor<4x128xbf16, #blocked1> loc(#loc1)
+    %cst_14 = arith.constant dense<0.000000e+00> : tensor<4x128xbf16, #blocked> loc(#loc1)
+    %cst_15 = arith.constant dense<128> : tensor<1x128xi32, #blocked2> loc(#loc1)
+    %cst_16 = arith.constant dense<0.000000e+00> : tensor<1x128xbf16, #blocked2> loc(#loc1)
+    %cst_17 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32, #blocked1> loc(#loc1)
+    %cst_18 = arith.constant dense<1.280000e+02> : tensor<4x1xf32, #blocked1> loc(#loc1)
+    %cst_19 = arith.constant dense<0.000000e+00> : tensor<4x128xf32, #blocked> loc(#loc1)
+    %cst_20 = arith.constant dense<0.000000e+00> : tensor<4x128xf32, #blocked1> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc139)
+    %xoffset_21 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc140)
+    %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc141)
+    %xindex_22 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc141)
+    %xindex_23 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc141)
+    %xindex_24 = tt.expand_dims %xindex_22 {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4x1xi32, #blocked> loc(#loc141)
+    %xindex_25 = tt.splat %xoffset_21 : i32 -> tensor<4x1xi32, #blocked1> loc(#loc142)
+    %xindex_26 = tt.splat %xoffset_21 : i32 -> tensor<4x1xi32, #blocked> loc(#loc142)
+    %xindex_27 = arith.addi %xindex_25, %xindex_23 : tensor<4x1xi32, #blocked1> loc(#loc142)
+    %xindex_28 = arith.addi %xindex_26, %xindex_24 : tensor<4x1xi32, #blocked> loc(#loc142)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc143)
+    %r0_base_29 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc143)
+    %r0_base_30 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> loc(#loc143)
+    %r0_base_31 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x128xi32, #blocked1> loc(#loc143)
+    %r0_base_32 = tt.expand_dims %r0_base_29 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc143)
+    %r0_base_33 = tt.expand_dims %r0_base_30 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x128xi32, #blocked2> loc(#loc143)
+    %x0 = arith.remsi %xindex_27, %cst_12 : tensor<4x1xi32, #blocked1> loc(#loc144)
+    %x0_34 = arith.remsi %xindex_28, %cst_11 : tensor<4x1xi32, #blocked> loc(#loc144)
+    %x1 = arith.divsi %xindex_27, %cst_12 : tensor<4x1xi32, #blocked1> loc(#loc145)
+    %x1_35 = arith.divsi %xindex_28, %cst_11 : tensor<4x1xi32, #blocked> loc(#loc145)
+    %r0_mask = arith.cmpi slt, %r0_base_31, %cst_10 : tensor<1x128xi32, #blocked1> loc(#loc146)
+    %r0_mask_36 = arith.cmpi slt, %r0_base_32, %cst_9 : tensor<1x128xi32, #blocked> loc(#loc146)
+    %r0_mask_37 = arith.cmpi slt, %r0_base_33, %cst_15 : tensor<1x128xi32, #blocked2> loc(#loc146)
+    %tmp0 = arith.addi %r0_base_31, %cst_8 : tensor<1x128xi32, #blocked1> loc(#loc147)
+    %tmp0_38 = arith.muli %x0, %cst_6 : tensor<4x1xi32, #blocked1> loc(#loc148)
+    %tmp0_39 = arith.muli %x0_34, %cst_5 : tensor<4x1xi32, #blocked> loc(#loc148)
+    %tmp0_40 = tt.broadcast %tmp0 : tensor<1x128xi32, #blocked1> -> tensor<4x128xi32, #blocked1> loc(#loc149)
+    %tmp0_41 = tt.broadcast %tmp0_38 : tensor<4x1xi32, #blocked1> -> tensor<4x128xi32, #blocked1> loc(#loc149)
+    %tmp0_42 = tt.broadcast %tmp0_39 : tensor<4x1xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc149)
+    %tmp0_43 = arith.addi %tmp0_40, %tmp0_41 : tensor<4x128xi32, #blocked1> loc(#loc149)
+    %tmp0_44 = arith.muli %x1, %cst_4 : tensor<4x1xi32, #blocked1> loc(#loc150)
+    %tmp0_45 = arith.muli %x1_35, %cst_3 : tensor<4x1xi32, #blocked> loc(#loc150)
+    %tmp0_46 = tt.broadcast %tmp0_44 : tensor<4x1xi32, #blocked1> -> tensor<4x128xi32, #blocked1> loc(#loc151)
+    %tmp0_47 = tt.broadcast %tmp0_45 : tensor<4x1xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc151)
+    %tmp0_48 = arith.addi %tmp0_43, %tmp0_46 : tensor<4x128xi32, #blocked1> loc(#loc151)
+    %tmp0_49 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>, #blocked1> loc(#loc152)
+    %tmp0_50 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>, #blocked> loc(#loc152)
+    %tmp0_51 = tt.addptr %tmp0_49, %tmp0_48 : tensor<4x128x!tt.ptr<bf16>, #blocked1>, tensor<4x128xi32, #blocked1> loc(#loc152)
+    %tmp0_52 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked1> -> tensor<4x128xi1, #blocked1> loc(#loc153)
+    %tmp0_53 = tt.load %tmp0_51, %tmp0_52, %cst_13 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>, #blocked1> loc(#loc153)
+    %tmp0_54 = arith.extf %tmp0_53 : tensor<4x128xbf16, #blocked1> to tensor<4x128xf32, #blocked1> loc(#loc154)
+    %tmp6 = tt.broadcast %r0_base_31 : tensor<1x128xi32, #blocked1> -> tensor<4x128xi32, #blocked1> loc(#loc155)
+    %tmp6_55 = arith.addi %tmp6, %tmp0_41 : tensor<4x128xi32, #blocked1> loc(#loc155)
+    %tmp6_56 = arith.addi %tmp6_55, %tmp0_46 : tensor<4x128xi32, #blocked1> loc(#loc156)
+    %tmp6_57 = tt.addptr %tmp0_49, %tmp6_56 : tensor<4x128x!tt.ptr<bf16>, #blocked1>, tensor<4x128xi32, #blocked1> loc(#loc157)
+    %tmp6_58 = tt.load %tmp6_57, %tmp0_52, %cst_13 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>, #blocked1> loc(#loc158)
+    %tmp6_59 = arith.extf %tmp6_58 : tensor<4x128xbf16, #blocked1> to tensor<4x128xf32, #blocked1> loc(#loc159)
+    %tmp2 = arith.mulf %tmp0_54, %tmp0_54 : tensor<4x128xf32, #blocked1> loc(#loc160)
+    %tmp5 = arith.addf %tmp2, %cst_20 : tensor<4x128xf32, #blocked1> loc(#loc161)
+    %_tmp4 = arith.select %tmp0_52, %tmp5, %cst_20 : tensor<4x128xi1, #blocked1>, tensor<4x128xf32, #blocked1> loc(#loc162)
+    %tmp8 = arith.mulf %tmp6_59, %tmp6_59 : tensor<4x128xf32, #blocked1> loc(#loc163)
+    %tmp11 = arith.addf %tmp8, %cst_20 : tensor<4x128xf32, #blocked1> loc(#loc164)
+    %_tmp10 = arith.select %tmp0_52, %tmp11, %cst_20 : tensor<4x128xi1, #blocked1>, tensor<4x128xf32, #blocked1> loc(#loc165)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_134: f32 loc(callsite(#loc1 at #loc166)), %tmp4_135: f32 loc(callsite(#loc1 at #loc166))):
+      %tmp4_136 = arith.addf %tmp4_134, %tmp4_135 : f32 loc(#loc264)
+      tt.reduce.return %tmp4_136 : f32 loc(#loc258)
+    }) : (tensor<4x128xf32, #blocked1>) -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc258)
+    %tmp4_60 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xf32, #blocked1> loc(#loc167)
+    %tmp10 = "tt.reduce"(%_tmp10) <{axis = 1 : i32}> ({
+    ^bb0(%tmp10_134: f32 loc(callsite(#loc1 at #loc168)), %tmp10_135: f32 loc(callsite(#loc1 at #loc168))):
+      %tmp10_136 = arith.addf %tmp10_134, %tmp10_135 : f32 loc(#loc265)
+      tt.reduce.return %tmp10_136 : f32 loc(#loc260)
+    }) : (tensor<4x128xf32, #blocked1>) -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc260)
+    %tmp10_61 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xf32, #blocked1> loc(#loc169)
+    %r0_3 = arith.remsi %r0_base_32, %cst_2 : tensor<1x128xi32, #blocked> loc(#loc170)
+    %r0_4 = arith.divsi %r0_base_32, %cst_2 : tensor<1x128xi32, #blocked> loc(#loc171)
+    %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>, #blocked> loc(#loc172)
+    %tmp58_62 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>, #blocked2> loc(#loc172)
+    %tmp58_63 = tt.addptr %tmp58_62, %r0_base_33 : tensor<1x128x!tt.ptr<bf16>, #blocked2>, tensor<1x128xi32, #blocked2> loc(#loc172)
+    %tmp58_64 = tt.load %tmp58_63, %r0_mask_37, %cst_16 evictionPolicy = evict_last : tensor<1x128x!tt.ptr<bf16>, #blocked2> loc(#loc173)
+    %tmp58_65 = arith.extf %tmp58_64 : tensor<1x128xbf16, #blocked2> to tensor<1x128xf32, #blocked2> loc(#loc174)
+    %tmp63 = arith.muli %x1, %cst_6 : tensor<4x1xi32, #blocked1> loc(#loc175)
+    %tmp63_66 = tt.broadcast %tmp63 : tensor<4x1xi32, #blocked1> -> tensor<4x128xi32, #blocked1> loc(#loc176)
+    %tmp63_67 = arith.addi %tmp6, %tmp63_66 : tensor<4x128xi32, #blocked1> loc(#loc176)
+    %tmp63_68 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<4x128x!tt.ptr<f32>, #blocked1> loc(#loc177)
+    %tmp63_69 = tt.addptr %tmp63_68, %tmp63_67 : tensor<4x128x!tt.ptr<f32>, #blocked1>, tensor<4x128xi32, #blocked1> loc(#loc177)
+    %tmp63_70 = tt.load %tmp63_69, %tmp0_52, %cst_20 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<f32>, #blocked1> loc(#loc178)
+    %tmp63_71 = ttg.convert_layout %tmp63_70 : tensor<4x128xf32, #blocked1> -> tensor<4x128xf32, #blocked> loc(#loc178)
+    %tmp66 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<4x128x!tt.ptr<f32>, #blocked1> loc(#loc179)
+    %tmp66_72 = tt.addptr %tmp66, %tmp63_67 : tensor<4x128x!tt.ptr<f32>, #blocked1>, tensor<4x128xi32, #blocked1> loc(#loc179)
+    %tmp66_73 = tt.load %tmp66_72, %tmp0_52, %cst_20 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<f32>, #blocked1> loc(#loc180)
+    %tmp66_74 = ttg.convert_layout %tmp66_73 : tensor<4x128xf32, #blocked1> -> tensor<4x128xf32, #blocked> loc(#loc180)
+    %tmp96 = tt.load %tmp0_51, %tmp0_52, %cst_13 evictionPolicy = evict_first : tensor<4x128x!tt.ptr<bf16>, #blocked1> loc(#loc181)
+    %tmp96_75 = arith.extf %tmp96 : tensor<4x128xbf16, #blocked1> to tensor<4x128xf32, #blocked1> loc(#loc182)
+    %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>, #blocked> loc(#loc183)
+    %tmp102_76 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>, #blocked2> loc(#loc183)
+    %tmp102_77 = tt.addptr %tmp102_76, %r0_base_33 : tensor<1x128x!tt.ptr<bf16>, #blocked2>, tensor<1x128xi32, #blocked2> loc(#loc183)
+    %tmp102_78 = tt.load %tmp102_77, %r0_mask_37, %cst_16 evictionPolicy = evict_last : tensor<1x128x!tt.ptr<bf16>, #blocked2> loc(#loc184)
+    %tmp102_79 = arith.extf %tmp102_78 : tensor<1x128xbf16, #blocked2> to tensor<1x128xf32, #blocked2> loc(#loc185)
+    %tmp16 = arith.extsi %r0_3 : tensor<1x128xi32, #blocked> to tensor<1x128xi64, #blocked> loc(#loc186)
+    %tmp16_80 = arith.cmpi slt, %tmp16, %cst_1 : tensor<1x128xi64, #blocked> loc(#loc186)
+    %tmp17 = arith.muli %r0_4, %cst_2 : tensor<1x128xi32, #blocked> loc(#loc187)
+    %tmp17_81 = arith.addi %tmp17, %cst_0 : tensor<1x128xi32, #blocked> loc(#loc188)
+    %tmp17_82 = tt.broadcast %tmp17_81 : tensor<1x128xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc189)
+    %tmp17_83 = arith.addi %tmp17_82, %tmp0_42 : tensor<4x128xi32, #blocked> loc(#loc189)
+    %tmp17_84 = arith.addi %tmp17_83, %tmp0_47 : tensor<4x128xi32, #blocked> loc(#loc190)
+    %tmp17_85 = tt.addptr %tmp0_50, %tmp17_84 : tensor<4x128x!tt.ptr<bf16>, #blocked>, tensor<4x128xi32, #blocked> loc(#loc191)
+    %tmp17_86 = arith.andi %r0_mask_36, %tmp16_80 : tensor<1x128xi1, #blocked> loc(#loc192)
+    %tmp17_87 = tt.broadcast %tmp17_86 : tensor<1x128xi1, #blocked> -> tensor<4x128xi1, #blocked> loc(#loc193)
+    %tmp17_88 = tt.load %tmp17_85, %tmp17_87, %cst_14 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>, #blocked> loc(#loc193)
+    %tmp17_89 = arith.extf %tmp17_88 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc194)
+    %tmp20 = arith.divf %tmp10_61, %cst_18 : tensor<4x1xf32, #blocked1> loc(#loc195)
+    %tmp22 = arith.addf %tmp20, %cst_17 : tensor<4x1xf32, #blocked1> loc(#loc196)
+    %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32, #blocked1>) -> tensor<4x1xf32, #blocked1> loc(#loc197)
+    %tmp24 = ttg.convert_layout %tmp23 : tensor<4x1xf32, #blocked1> -> tensor<4x1xf32, #blocked> loc(#loc198)
+    %tmp24_90 = tt.broadcast %tmp24 : tensor<4x1xf32, #blocked> -> tensor<4x128xf32, #blocked> loc(#loc198)
+    %tmp24_91 = tt.broadcast %tmp23 : tensor<4x1xf32, #blocked1> -> tensor<4x128xf32, #blocked1> loc(#loc198)
+    %tmp24_92 = arith.mulf %tmp17_89, %tmp24_90 : tensor<4x128xf32, #blocked> loc(#loc198)
+    %tmp25 = tt.addptr %tmp58, %tmp17_81 : tensor<1x128x!tt.ptr<bf16>, #blocked>, tensor<1x128xi32, #blocked> loc(#loc199)
+    %tmp25_93 = tt.broadcast %tmp25 : tensor<1x128x!tt.ptr<bf16>, #blocked> -> tensor<4x128x!tt.ptr<bf16>, #blocked> loc(#loc199)
+    %tmp25_94 = tt.load %tmp25_93, %tmp17_87, %cst_14 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>, #blocked> loc(#loc200)
+    %tmp25_95 = arith.extf %tmp25_94 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc201)
+    %tmp27 = arith.mulf %tmp24_92, %tmp25_95 : tensor<4x128xf32, #blocked> loc(#loc202)
+    %tmp29 = arith.subf %cst_19, %tmp27 : tensor<4x128xf32, #blocked> loc(#loc203)
+    %tmp31 = tt.broadcast %tmp16_80 : tensor<1x128xi1, #blocked> -> tensor<4x128xi1, #blocked> loc(#loc204)
+    %tmp32 = arith.cmpi sge, %tmp16, %cst_1 : tensor<1x128xi64, #blocked> loc(#loc205)
+    %tmp35 = tt.broadcast %tmp17 : tensor<1x128xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc206)
+    %tmp35_96 = arith.addi %tmp35, %tmp0_42 : tensor<4x128xi32, #blocked> loc(#loc206)
+    %tmp35_97 = arith.addi %tmp35_96, %tmp0_47 : tensor<4x128xi32, #blocked> loc(#loc207)
+    %tmp35_98 = tt.addptr %tmp0_50, %tmp35_97 : tensor<4x128x!tt.ptr<bf16>, #blocked>, tensor<4x128xi32, #blocked> loc(#loc208)
+    %tmp35_99 = arith.andi %r0_mask_36, %tmp32 : tensor<1x128xi1, #blocked> loc(#loc209)
+    %tmp35_100 = tt.broadcast %tmp35_99 : tensor<1x128xi1, #blocked> -> tensor<4x128xi1, #blocked> loc(#loc210)
+    %tmp35_101 = tt.load %tmp35_98, %tmp35_100, %cst_14 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>, #blocked> loc(#loc210)
+    %tmp35_102 = arith.extf %tmp35_101 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc211)
+    %tmp42 = arith.mulf %tmp35_102, %tmp24_90 : tensor<4x128xf32, #blocked> loc(#loc212)
+    %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x128x!tt.ptr<bf16>, #blocked>, tensor<1x128xi32, #blocked> loc(#loc213)
+    %tmp43_103 = tt.broadcast %tmp43 : tensor<1x128x!tt.ptr<bf16>, #blocked> -> tensor<4x128x!tt.ptr<bf16>, #blocked> loc(#loc213)
+    %tmp43_104 = tt.load %tmp43_103, %tmp35_100, %cst_14 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>, #blocked> loc(#loc214)
+    %tmp43_105 = arith.extf %tmp43_104 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc215)
+    %tmp45 = arith.mulf %tmp42, %tmp43_105 : tensor<4x128xf32, #blocked> loc(#loc216)
+    %tmp48 = tt.broadcast %tmp32 : tensor<1x128xi1, #blocked> -> tensor<4x128xi1, #blocked> loc(#loc217)
+    %tmp48_106 = arith.select %tmp48, %tmp45, %cst_19 : tensor<4x128xi1, #blocked>, tensor<4x128xf32, #blocked> loc(#loc217)
+    %tmp49 = arith.select %tmp31, %tmp29, %tmp48_106 : tensor<4x128xi1, #blocked>, tensor<4x128xf32, #blocked> loc(#loc262)
+    %tmp57 = arith.mulf %tmp6_59, %tmp24_91 : tensor<4x128xf32, #blocked1> loc(#loc219)
+    %tmp60 = ttg.convert_layout %tmp58_65 : tensor<1x128xf32, #blocked2> -> tensor<1x128xf32, #blocked1> loc(#loc220)
+    %tmp60_107 = tt.broadcast %tmp60 : tensor<1x128xf32, #blocked1> -> tensor<4x128xf32, #blocked1> loc(#loc220)
+    %tmp60_108 = arith.mulf %tmp57, %tmp60_107 : tensor<4x128xf32, #blocked1> loc(#loc220)
+    %tmp64 = arith.mulf %tmp60_108, %tmp63_70 : tensor<4x128xf32, #blocked1> loc(#loc221)
+    %tmp64_109 = ttg.convert_layout %tmp64 : tensor<4x128xf32, #blocked1> -> tensor<4x128xf32, #blocked> loc(#loc221)
+    %tmp67 = arith.mulf %tmp49, %tmp66_74 : tensor<4x128xf32, #blocked> loc(#loc222)
+    %tmp68 = arith.addf %tmp64_109, %tmp67 : tensor<4x128xf32, #blocked> loc(#loc223)
+    %tmp70 = arith.addi %tmp17, %cst : tensor<1x128xi32, #blocked> loc(#loc224)
+    %tmp70_110 = tt.broadcast %tmp70 : tensor<1x128xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc225)
+    %tmp70_111 = arith.addi %tmp70_110, %tmp0_42 : tensor<4x128xi32, #blocked> loc(#loc225)
+    %tmp70_112 = arith.addi %tmp70_111, %tmp0_47 : tensor<4x128xi32, #blocked> loc(#loc226)
+    %tmp70_113 = tt.addptr %tmp0_50, %tmp70_112 : tensor<4x128x!tt.ptr<bf16>, #blocked>, tensor<4x128xi32, #blocked> loc(#loc227)
+    %tmp70_114 = tt.load %tmp70_113, %tmp17_87, %cst_14 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>, #blocked> loc(#loc228)
+    %tmp70_115 = arith.extf %tmp70_114 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc229)
+    %tmp72 = arith.divf %tmp4_60, %cst_18 : tensor<4x1xf32, #blocked1> loc(#loc230)
+    %tmp73 = arith.addf %tmp72, %cst_17 : tensor<4x1xf32, #blocked1> loc(#loc231)
+    %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32, #blocked1>) -> tensor<4x1xf32, #blocked1> loc(#loc232)
+    %tmp75 = ttg.convert_layout %tmp74 : tensor<4x1xf32, #blocked1> -> tensor<4x1xf32, #blocked> loc(#loc233)
+    %tmp75_116 = tt.broadcast %tmp75 : tensor<4x1xf32, #blocked> -> tensor<4x128xf32, #blocked> loc(#loc233)
+    %tmp75_117 = tt.broadcast %tmp74 : tensor<4x1xf32, #blocked1> -> tensor<4x128xf32, #blocked1> loc(#loc233)
+    %tmp75_118 = arith.mulf %tmp70_115, %tmp75_116 : tensor<4x128xf32, #blocked> loc(#loc233)
+    %tmp76 = tt.addptr %tmp102, %tmp17_81 : tensor<1x128x!tt.ptr<bf16>, #blocked>, tensor<1x128xi32, #blocked> loc(#loc234)
+    %tmp76_119 = tt.broadcast %tmp76 : tensor<1x128x!tt.ptr<bf16>, #blocked> -> tensor<4x128x!tt.ptr<bf16>, #blocked> loc(#loc234)
+    %tmp76_120 = tt.load %tmp76_119, %tmp17_87, %cst_14 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>, #blocked> loc(#loc235)
+    %tmp76_121 = arith.extf %tmp76_120 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc236)
+    %tmp78 = arith.mulf %tmp75_118, %tmp76_121 : tensor<4x128xf32, #blocked> loc(#loc237)
+    %tmp80 = arith.subf %cst_19, %tmp78 : tensor<4x128xf32, #blocked> loc(#loc238)
+    %tmp83 = arith.addi %tmp17, %cst_7 : tensor<1x128xi32, #blocked> loc(#loc239)
+    %tmp83_122 = tt.broadcast %tmp83 : tensor<1x128xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc240)
+    %tmp83_123 = arith.addi %tmp83_122, %tmp0_42 : tensor<4x128xi32, #blocked> loc(#loc240)
+    %tmp83_124 = arith.addi %tmp83_123, %tmp0_47 : tensor<4x128xi32, #blocked> loc(#loc241)
+    %tmp83_125 = tt.addptr %tmp0_50, %tmp83_124 : tensor<4x128x!tt.ptr<bf16>, #blocked>, tensor<4x128xi32, #blocked> loc(#loc242)
+    %tmp83_126 = tt.load %tmp83_125, %tmp35_100, %cst_14 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>, #blocked> loc(#loc243)
+    %tmp83_127 = arith.extf %tmp83_126 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc244)
+    %tmp88 = arith.mulf %tmp83_127, %tmp75_116 : tensor<4x128xf32, #blocked> loc(#loc245)
+    %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x128x!tt.ptr<bf16>, #blocked>, tensor<1x128xi32, #blocked> loc(#loc246)
+    %tmp89_128 = tt.broadcast %tmp89 : tensor<1x128x!tt.ptr<bf16>, #blocked> -> tensor<4x128x!tt.ptr<bf16>, #blocked> loc(#loc246)
+    %tmp89_129 = tt.load %tmp89_128, %tmp35_100, %cst_14 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>, #blocked> loc(#loc247)
+    %tmp89_130 = arith.extf %tmp89_129 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc248)
+    %tmp91 = arith.mulf %tmp88, %tmp89_130 : tensor<4x128xf32, #blocked> loc(#loc249)
+    %tmp94 = arith.select %tmp48, %tmp91, %cst_19 : tensor<4x128xi1, #blocked>, tensor<4x128xf32, #blocked> loc(#loc250)
+    %tmp95 = arith.select %tmp31, %tmp80, %tmp94 : tensor<4x128xi1, #blocked>, tensor<4x128xf32, #blocked> loc(#loc263)
+    %tmp101 = arith.mulf %tmp96_75, %tmp75_117 : tensor<4x128xf32, #blocked1> loc(#loc253)
+    %tmp101_131 = ttg.convert_layout %tmp101 : tensor<4x128xf32, #blocked1> -> tensor<4x128xf32, #blocked> loc(#loc253)
+    %tmp107 = ttg.convert_layout %tmp102_79 : tensor<1x128xf32, #blocked2> -> tensor<1x128xf32, #blocked> loc(#loc254)
+    %tmp104 = tt.broadcast %tmp107 : tensor<1x128xf32, #blocked> -> tensor<4x128xf32, #blocked> loc(#loc255)
+    %tmp104_132 = arith.mulf %tmp101_131, %tmp104 : tensor<4x128xf32, #blocked> loc(#loc255)
+    %tmp107_133 = arith.mulf %tmp104_132, %tmp63_71 : tensor<4x128xf32, #blocked> loc(#loc254)
+    %tmp109 = arith.mulf %tmp95, %tmp66_74 : tensor<4x128xf32, #blocked> loc(#loc256)
+    %tmp110 = arith.addf %tmp107_133, %tmp109 : tensor<4x128xf32, #blocked> loc(#loc257)
+    %0 = arith.muli %xindex_27, %cst_6 : tensor<4x1xi32, #blocked1> loc(#loc123)
+    %1 = tt.broadcast %0 : tensor<4x1xi32, #blocked1> -> tensor<4x128xi32, #blocked1> loc(#loc124)
+    %2 = arith.addi %tmp6, %1 : tensor<4x128xi32, #blocked1> loc(#loc124)
+    %3 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>, #blocked1> loc(#loc125)
+    %4 = tt.addptr %3, %2 : tensor<4x128x!tt.ptr<bf16>, #blocked1>, tensor<4x128xi32, #blocked1> loc(#loc125)
+    %5 = arith.truncf %tmp68 : tensor<4x128xf32, #blocked> to tensor<4x128xbf16, #blocked> loc(#loc126)
+    %6 = ttg.convert_layout %5 : tensor<4x128xbf16, #blocked> -> tensor<4x128xbf16, #blocked1> loc(#loc126)
+    tt.store %4, %6, %tmp0_52 : tensor<4x128x!tt.ptr<bf16>, #blocked1> loc(#loc126)
+    %7 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>, #blocked1> loc(#loc127)
+    %8 = tt.addptr %7, %2 : tensor<4x128x!tt.ptr<bf16>, #blocked1>, tensor<4x128xi32, #blocked1> loc(#loc127)
+    %9 = arith.truncf %tmp110 : tensor<4x128xf32, #blocked> to tensor<4x128xbf16, #blocked> loc(#loc128)
+    %10 = ttg.convert_layout %9 : tensor<4x128xbf16, #blocked> -> tensor<4x128xbf16, #blocked1> loc(#loc128)
+    tt.store %8, %10, %tmp0_52 : tensor<4x128x!tt.ptr<bf16>, #blocked1> loc(#loc128)
+    tt.return loc(#loc129)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc29 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc31 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc139 = loc("xoffset"(#loc2))
+#loc140 = loc("xoffset"(#loc3))
+#loc141 = loc("xindex"(#loc4))
+#loc142 = loc("xindex"(#loc5))
+#loc143 = loc("r0_base"(#loc6))
+#loc144 = loc("x0"(#loc7))
+#loc145 = loc("x1"(#loc8))
+#loc146 = loc("r0_mask"(#loc9))
+#loc147 = loc("tmp0"(#loc10))
+#loc148 = loc("tmp0"(#loc11))
+#loc149 = loc("tmp0"(#loc12))
+#loc150 = loc("tmp0"(#loc13))
+#loc151 = loc("tmp0"(#loc14))
+#loc152 = loc("tmp0"(#loc15))
+#loc153 = loc("tmp0"(#loc16))
+#loc154 = loc("tmp0"(#loc17))
+#loc155 = loc("tmp6"(#loc18))
+#loc156 = loc("tmp6"(#loc19))
+#loc157 = loc("tmp6"(#loc20))
+#loc158 = loc("tmp6"(#loc21))
+#loc159 = loc("tmp6"(#loc22))
+#loc160 = loc("tmp2"(#loc23))
+#loc161 = loc("tmp5"(#loc24))
+#loc162 = loc("_tmp4"(#loc25))
+#loc163 = loc("tmp8"(#loc26))
+#loc164 = loc("tmp11"(#loc27))
+#loc165 = loc("_tmp10"(#loc28))
+#loc167 = loc("tmp4"(#loc32))
+#loc169 = loc("tmp10"(#loc34))
+#loc170 = loc("r0_3"(#loc35))
+#loc171 = loc("r0_4"(#loc36))
+#loc172 = loc("tmp58"(#loc37))
+#loc173 = loc("tmp58"(#loc38))
+#loc174 = loc("tmp58"(#loc39))
+#loc175 = loc("tmp63"(#loc40))
+#loc176 = loc("tmp63"(#loc41))
+#loc177 = loc("tmp63"(#loc42))
+#loc178 = loc("tmp63"(#loc43))
+#loc179 = loc("tmp66"(#loc44))
+#loc180 = loc("tmp66"(#loc45))
+#loc181 = loc("tmp96"(#loc46))
+#loc182 = loc("tmp96"(#loc47))
+#loc183 = loc("tmp102"(#loc48))
+#loc184 = loc("tmp102"(#loc49))
+#loc185 = loc("tmp102"(#loc50))
+#loc186 = loc("tmp16"(#loc51))
+#loc187 = loc("tmp17"(#loc52))
+#loc188 = loc("tmp17"(#loc53))
+#loc189 = loc("tmp17"(#loc54))
+#loc190 = loc("tmp17"(#loc55))
+#loc191 = loc("tmp17"(#loc56))
+#loc192 = loc("tmp17"(#loc57))
+#loc193 = loc("tmp17"(#loc58))
+#loc194 = loc("tmp17"(#loc59))
+#loc195 = loc("tmp20"(#loc60))
+#loc196 = loc("tmp22"(#loc61))
+#loc197 = loc("tmp23"(#loc62))
+#loc198 = loc("tmp24"(#loc63))
+#loc199 = loc("tmp25"(#loc64))
+#loc200 = loc("tmp25"(#loc65))
+#loc201 = loc("tmp25"(#loc66))
+#loc202 = loc("tmp27"(#loc67))
+#loc203 = loc("tmp29"(#loc68))
+#loc204 = loc("tmp31"(#loc69))
+#loc205 = loc("tmp32"(#loc70))
+#loc206 = loc("tmp35"(#loc71))
+#loc207 = loc("tmp35"(#loc72))
+#loc208 = loc("tmp35"(#loc73))
+#loc209 = loc("tmp35"(#loc74))
+#loc210 = loc("tmp35"(#loc75))
+#loc211 = loc("tmp35"(#loc76))
+#loc212 = loc("tmp42"(#loc77))
+#loc213 = loc("tmp43"(#loc78))
+#loc214 = loc("tmp43"(#loc79))
+#loc215 = loc("tmp43"(#loc80))
+#loc216 = loc("tmp45"(#loc81))
+#loc217 = loc("tmp48"(#loc82))
+#loc218 = loc("tmp49"(#loc83))
+#loc219 = loc("tmp57"(#loc84))
+#loc220 = loc("tmp60"(#loc85))
+#loc221 = loc("tmp64"(#loc86))
+#loc222 = loc("tmp67"(#loc87))
+#loc223 = loc("tmp68"(#loc88))
+#loc224 = loc("tmp70"(#loc89))
+#loc225 = loc("tmp70"(#loc90))
+#loc226 = loc("tmp70"(#loc91))
+#loc227 = loc("tmp70"(#loc92))
+#loc228 = loc("tmp70"(#loc93))
+#loc229 = loc("tmp70"(#loc94))
+#loc230 = loc("tmp72"(#loc95))
+#loc231 = loc("tmp73"(#loc96))
+#loc232 = loc("tmp74"(#loc97))
+#loc233 = loc("tmp75"(#loc98))
+#loc234 = loc("tmp76"(#loc99))
+#loc235 = loc("tmp76"(#loc100))
+#loc236 = loc("tmp76"(#loc101))
+#loc237 = loc("tmp78"(#loc102))
+#loc238 = loc("tmp80"(#loc103))
+#loc239 = loc("tmp83"(#loc104))
+#loc240 = loc("tmp83"(#loc105))
+#loc241 = loc("tmp83"(#loc106))
+#loc242 = loc("tmp83"(#loc107))
+#loc243 = loc("tmp83"(#loc108))
+#loc244 = loc("tmp83"(#loc109))
+#loc245 = loc("tmp88"(#loc110))
+#loc246 = loc("tmp89"(#loc111))
+#loc247 = loc("tmp89"(#loc112))
+#loc248 = loc("tmp89"(#loc113))
+#loc249 = loc("tmp91"(#loc114))
+#loc250 = loc("tmp94"(#loc115))
+#loc251 = loc("tmp95"(#loc116))
+#loc252 = loc("tmp82"(#loc117))
+#loc253 = loc("tmp101"(#loc118))
+#loc254 = loc("tmp107"(#loc119))
+#loc255 = loc("tmp104"(#loc120))
+#loc256 = loc("tmp109"(#loc121))
+#loc257 = loc("tmp110"(#loc122))
+#loc258 = loc(callsite(#loc29 at #loc166))
+#loc260 = loc(callsite(#loc29 at #loc168))
+#loc262 = loc(fused[#loc218, #loc204])
+#loc263 = loc(fused[#loc251, #loc252])
+#loc264 = loc(callsite(#loc31 at #loc258))
+#loc265 = loc(callsite(#loc31 at #loc260))
diff --git a/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..f12e63b8c007fc3a601413263d614941e61bf54b
--- /dev/null
+++ b/triton/RIRT6HUGDE5FKFXB6MO7WI4SAH36A3KCULSXF6ERVDOUPVVMN3QA/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir
@@ -0,0 +1,457 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc1 = loc(unknown)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc132 = loc("in_out_ptr0"(#loc))
+#loc133 = loc("in_out_ptr1"(#loc))
+#loc134 = loc("in_ptr0"(#loc))
+#loc135 = loc("in_ptr1"(#loc))
+#loc136 = loc("in_ptr2"(#loc))
+#loc137 = loc("in_ptr3"(#loc))
+#loc138 = loc("in_ptr4"(#loc))
+#loc139 = loc("xnumel"(#loc))
+#loc140 = loc("r0_numel"(#loc))
+#loc170 = loc("tmp4"(#loc32))
+#loc172 = loc("tmp10"(#loc35))
+#loc263 = loc(callsite(#loc1 at #loc170))
+#loc265 = loc(callsite(#loc1 at #loc172))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<1x128xbf16> loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<4x128xbf16> loc(#loc1)
+    %cst_1 = arith.constant dense<4097> : tensor<1x128xi32> loc(#loc1)
+    %cst_2 = arith.constant dense<9.99999997E-7> : tensor<4x1xf32> loc(#loc1)
+    %cst_3 = arith.constant dense<1.280000e+02> : tensor<4x1xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc1)
+    %cst_5 = arith.constant dense<1> : tensor<1x128xi64> loc(#loc1)
+    %cst_6 = arith.constant dense<2> : tensor<1x128xi32> loc(#loc1)
+    %cst_7 = arith.constant dense<36864> : tensor<4x1xi32> loc(#loc1)
+    %cst_8 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc1)
+    %cst_9 = arith.constant dense<4096> : tensor<1x128xi32> loc(#loc1)
+    %cst_10 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc1)
+    %cst_11 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc1)
+    %cst_12 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc1)
+    %c4_i32 = arith.constant 4 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc141)
+    %xoffset_13 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc142)
+    %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc143)
+    %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc144)
+    %xindex_15 = tt.splat %xoffset_13 : i32 -> tensor<4x1xi32> loc(#loc145)
+    %xindex_16 = arith.addi %xindex_15, %xindex_14 : tensor<4x1xi32> loc(#loc145)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc146)
+    %r0_base_17 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc147)
+    %x0 = arith.remsi %xindex_16, %cst_12 : tensor<4x1xi32> loc(#loc148)
+    %x1 = arith.divsi %xindex_16, %cst_12 : tensor<4x1xi32> loc(#loc149)
+    %r0_mask = arith.cmpi slt, %r0_base_17, %cst_10 : tensor<1x128xi32> loc(#loc150)
+    %tmp0 = arith.addi %r0_base_17, %cst_9 : tensor<1x128xi32> loc(#loc151)
+    %tmp0_18 = arith.muli %x0, %cst_8 : tensor<4x1xi32> loc(#loc152)
+    %tmp0_19 = tt.broadcast %tmp0 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc153)
+    %tmp0_20 = tt.broadcast %tmp0_18 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc153)
+    %tmp0_21 = arith.addi %tmp0_19, %tmp0_20 : tensor<4x128xi32> loc(#loc153)
+    %tmp0_22 = arith.muli %x1, %cst_7 : tensor<4x1xi32> loc(#loc154)
+    %tmp0_23 = tt.broadcast %tmp0_22 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc155)
+    %tmp0_24 = arith.addi %tmp0_21, %tmp0_23 : tensor<4x128xi32> loc(#loc155)
+    %tmp0_25 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc156)
+    %tmp0_26 = tt.addptr %tmp0_25, %tmp0_24 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc156)
+    %tmp0_27 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc157)
+    %tmp0_28 = tt.load %tmp0_26, %tmp0_27, %cst_0 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>> loc(#loc157)
+    %tmp0_29 = arith.extf %tmp0_28 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc158)
+    %tmp6 = tt.broadcast %r0_base_17 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc159)
+    %tmp6_30 = arith.addi %tmp6, %tmp0_20 : tensor<4x128xi32> loc(#loc159)
+    %tmp6_31 = arith.addi %tmp6_30, %tmp0_23 : tensor<4x128xi32> loc(#loc160)
+    %tmp6_32 = tt.addptr %tmp0_25, %tmp6_31 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc161)
+    %tmp6_33 = tt.load %tmp6_32, %tmp0_27, %cst_0 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>> loc(#loc162)
+    %tmp6_34 = arith.extf %tmp6_33 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc163)
+    %tmp2 = arith.mulf %tmp0_29, %tmp0_29 : tensor<4x128xf32> loc(#loc164)
+    %tmp5 = arith.addf %tmp2, %cst_11 : tensor<4x128xf32> loc(#loc165)
+    %_tmp4 = arith.select %tmp0_27, %tmp5, %cst_11 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc166)
+    %tmp8 = arith.mulf %tmp6_34, %tmp6_34 : tensor<4x128xf32> loc(#loc167)
+    %tmp11 = arith.addf %tmp8, %cst_11 : tensor<4x128xf32> loc(#loc168)
+    %_tmp10 = arith.select %tmp0_27, %tmp11, %cst_11 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc169)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_98: f32 loc(callsite(#loc1 at #loc170)), %tmp4_99: f32 loc(callsite(#loc1 at #loc170))):
+      %tmp4_100 = arith.addf %tmp4_98, %tmp4_99 : f32 loc(#loc266)
+      tt.reduce.return %tmp4_100 : f32 loc(#loc262)
+    }) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc262)
+    %tmp4_35 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc171)
+    %tmp10 = "tt.reduce"(%_tmp10) <{axis = 1 : i32}> ({
+    ^bb0(%tmp10_98: f32 loc(callsite(#loc1 at #loc172)), %tmp10_99: f32 loc(callsite(#loc1 at #loc172))):
+      %tmp10_100 = arith.addf %tmp10_98, %tmp10_99 : f32 loc(#loc267)
+      tt.reduce.return %tmp10_100 : f32 loc(#loc264)
+    }) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc264)
+    %tmp10_36 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc173)
+    %r0_3 = arith.remsi %r0_base_17, %cst_6 : tensor<1x128xi32> loc(#loc174)
+    %r0_4 = arith.divsi %r0_base_17, %cst_6 : tensor<1x128xi32> loc(#loc175)
+    %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>> loc(#loc176)
+    %tmp58_37 = tt.addptr %tmp58, %r0_base_17 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc176)
+    %tmp58_38 = tt.load %tmp58_37, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x128x!tt.ptr<bf16>> loc(#loc177)
+    %tmp58_39 = arith.extf %tmp58_38 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc178)
+    %tmp63 = arith.muli %x1, %cst_8 : tensor<4x1xi32> loc(#loc179)
+    %tmp63_40 = tt.broadcast %tmp63 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc180)
+    %tmp63_41 = arith.addi %tmp6, %tmp63_40 : tensor<4x128xi32> loc(#loc180)
+    %tmp63_42 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<4x128x!tt.ptr<f32>> loc(#loc181)
+    %tmp63_43 = tt.addptr %tmp63_42, %tmp63_41 : tensor<4x128x!tt.ptr<f32>>, tensor<4x128xi32> loc(#loc181)
+    %tmp63_44 = tt.load %tmp63_43, %tmp0_27, %cst_11 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<f32>> loc(#loc182)
+    %tmp66 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<4x128x!tt.ptr<f32>> loc(#loc183)
+    %tmp66_45 = tt.addptr %tmp66, %tmp63_41 : tensor<4x128x!tt.ptr<f32>>, tensor<4x128xi32> loc(#loc183)
+    %tmp66_46 = tt.load %tmp66_45, %tmp0_27, %cst_11 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<f32>> loc(#loc184)
+    %tmp96 = tt.load %tmp0_26, %tmp0_27, %cst_0 evictionPolicy = evict_first : tensor<4x128x!tt.ptr<bf16>> loc(#loc185)
+    %tmp96_47 = arith.extf %tmp96 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc186)
+    %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>> loc(#loc187)
+    %tmp102_48 = tt.addptr %tmp102, %r0_base_17 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc187)
+    %tmp102_49 = tt.load %tmp102_48, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x128x!tt.ptr<bf16>> loc(#loc188)
+    %tmp102_50 = arith.extf %tmp102_49 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc189)
+    %tmp16 = arith.extsi %r0_3 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc190)
+    %tmp16_51 = arith.cmpi slt, %tmp16, %cst_5 : tensor<1x128xi64> loc(#loc190)
+    %tmp17 = arith.muli %r0_4, %cst_6 : tensor<1x128xi32> loc(#loc191)
+    %tmp17_52 = arith.addi %tmp17, %cst_4 : tensor<1x128xi32> loc(#loc192)
+    %tmp17_53 = tt.broadcast %tmp17_52 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc193)
+    %tmp17_54 = arith.addi %tmp17_53, %tmp0_20 : tensor<4x128xi32> loc(#loc193)
+    %tmp17_55 = arith.addi %tmp17_54, %tmp0_23 : tensor<4x128xi32> loc(#loc194)
+    %tmp17_56 = tt.addptr %tmp0_25, %tmp17_55 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc195)
+    %tmp17_57 = arith.andi %r0_mask, %tmp16_51 : tensor<1x128xi1> loc(#loc196)
+    %tmp17_58 = tt.broadcast %tmp17_57 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc197)
+    %tmp17_59 = tt.load %tmp17_56, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>> loc(#loc197)
+    %tmp17_60 = arith.extf %tmp17_59 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc198)
+    %tmp20 = arith.divf %tmp10_36, %cst_3 : tensor<4x1xf32> loc(#loc199)
+    %tmp22 = arith.addf %tmp20, %cst_2 : tensor<4x1xf32> loc(#loc200)
+    %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc201)
+    %tmp24 = tt.broadcast %tmp23 : tensor<4x1xf32> -> tensor<4x128xf32> loc(#loc202)
+    %tmp24_61 = arith.mulf %tmp17_60, %tmp24 : tensor<4x128xf32> loc(#loc202)
+    %tmp25 = tt.addptr %tmp58, %tmp17_52 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc203)
+    %tmp25_62 = tt.broadcast %tmp25 : tensor<1x128x!tt.ptr<bf16>> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc203)
+    %tmp25_63 = tt.load %tmp25_62, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>> loc(#loc204)
+    %tmp25_64 = arith.extf %tmp25_63 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc205)
+    %tmp27 = arith.mulf %tmp24_61, %tmp25_64 : tensor<4x128xf32> loc(#loc206)
+    %tmp29 = arith.subf %cst_11, %tmp27 : tensor<4x128xf32> loc(#loc207)
+    %tmp31 = tt.broadcast %tmp16_51 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc208)
+    %tmp31_65 = arith.select %tmp31, %tmp29, %cst_11 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc208)
+    %tmp32 = arith.cmpi sge, %tmp16, %cst_5 : tensor<1x128xi64> loc(#loc209)
+    %tmp35 = tt.broadcast %tmp17 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc210)
+    %tmp35_66 = arith.addi %tmp35, %tmp0_20 : tensor<4x128xi32> loc(#loc210)
+    %tmp35_67 = arith.addi %tmp35_66, %tmp0_23 : tensor<4x128xi32> loc(#loc211)
+    %tmp35_68 = tt.addptr %tmp0_25, %tmp35_67 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc212)
+    %tmp35_69 = arith.andi %r0_mask, %tmp32 : tensor<1x128xi1> loc(#loc213)
+    %tmp35_70 = tt.broadcast %tmp35_69 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc214)
+    %tmp35_71 = tt.load %tmp35_68, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>> loc(#loc214)
+    %tmp35_72 = arith.extf %tmp35_71 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc215)
+    %tmp42 = arith.mulf %tmp35_72, %tmp24 : tensor<4x128xf32> loc(#loc216)
+    %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc217)
+    %tmp43_73 = tt.broadcast %tmp43 : tensor<1x128x!tt.ptr<bf16>> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc217)
+    %tmp43_74 = tt.load %tmp43_73, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>> loc(#loc218)
+    %tmp43_75 = arith.extf %tmp43_74 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc219)
+    %tmp45 = arith.mulf %tmp42, %tmp43_75 : tensor<4x128xf32> loc(#loc220)
+    %tmp48 = tt.broadcast %tmp32 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc221)
+    %tmp48_76 = arith.select %tmp48, %tmp45, %cst_11 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc221)
+    %tmp49 = arith.select %tmp31, %tmp31_65, %tmp48_76 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc222)
+    %tmp57 = arith.mulf %tmp6_34, %tmp24 : tensor<4x128xf32> loc(#loc223)
+    %tmp60 = tt.broadcast %tmp58_39 : tensor<1x128xf32> -> tensor<4x128xf32> loc(#loc224)
+    %tmp60_77 = arith.mulf %tmp57, %tmp60 : tensor<4x128xf32> loc(#loc224)
+    %tmp64 = arith.mulf %tmp60_77, %tmp63_44 : tensor<4x128xf32> loc(#loc225)
+    %tmp67 = arith.mulf %tmp49, %tmp66_46 : tensor<4x128xf32> loc(#loc226)
+    %tmp68 = arith.addf %tmp64, %tmp67 : tensor<4x128xf32> loc(#loc227)
+    %tmp70 = arith.addi %tmp17, %cst_1 : tensor<1x128xi32> loc(#loc228)
+    %tmp70_78 = tt.broadcast %tmp70 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc229)
+    %tmp70_79 = arith.addi %tmp70_78, %tmp0_20 : tensor<4x128xi32> loc(#loc229)
+    %tmp70_80 = arith.addi %tmp70_79, %tmp0_23 : tensor<4x128xi32> loc(#loc230)
+    %tmp70_81 = tt.addptr %tmp0_25, %tmp70_80 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc231)
+    %tmp70_82 = tt.load %tmp70_81, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>> loc(#loc232)
+    %tmp70_83 = arith.extf %tmp70_82 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc233)
+    %tmp72 = arith.divf %tmp4_35, %cst_3 : tensor<4x1xf32> loc(#loc234)
+    %tmp73 = arith.addf %tmp72, %cst_2 : tensor<4x1xf32> loc(#loc235)
+    %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<4x1xf32>) -> tensor<4x1xf32> loc(#loc236)
+    %tmp75 = tt.broadcast %tmp74 : tensor<4x1xf32> -> tensor<4x128xf32> loc(#loc237)
+    %tmp75_84 = arith.mulf %tmp70_83, %tmp75 : tensor<4x128xf32> loc(#loc237)
+    %tmp76 = tt.addptr %tmp102, %tmp17_52 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc238)
+    %tmp76_85 = tt.broadcast %tmp76 : tensor<1x128x!tt.ptr<bf16>> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc238)
+    %tmp76_86 = tt.load %tmp76_85, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>> loc(#loc239)
+    %tmp76_87 = arith.extf %tmp76_86 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc240)
+    %tmp78 = arith.mulf %tmp75_84, %tmp76_87 : tensor<4x128xf32> loc(#loc241)
+    %tmp80 = arith.subf %cst_11, %tmp78 : tensor<4x128xf32> loc(#loc242)
+    %tmp82 = arith.select %tmp31, %tmp80, %cst_11 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc243)
+    %tmp83 = arith.addi %tmp17, %cst_9 : tensor<1x128xi32> loc(#loc244)
+    %tmp83_88 = tt.broadcast %tmp83 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc245)
+    %tmp83_89 = arith.addi %tmp83_88, %tmp0_20 : tensor<4x128xi32> loc(#loc245)
+    %tmp83_90 = arith.addi %tmp83_89, %tmp0_23 : tensor<4x128xi32> loc(#loc246)
+    %tmp83_91 = tt.addptr %tmp0_25, %tmp83_90 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc247)
+    %tmp83_92 = tt.load %tmp83_91, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>> loc(#loc248)
+    %tmp83_93 = arith.extf %tmp83_92 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc249)
+    %tmp88 = arith.mulf %tmp83_93, %tmp75 : tensor<4x128xf32> loc(#loc250)
+    %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc251)
+    %tmp89_94 = tt.broadcast %tmp89 : tensor<1x128x!tt.ptr<bf16>> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc251)
+    %tmp89_95 = tt.load %tmp89_94, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<4x128x!tt.ptr<bf16>> loc(#loc252)
+    %tmp89_96 = arith.extf %tmp89_95 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc253)
+    %tmp91 = arith.mulf %tmp88, %tmp89_96 : tensor<4x128xf32> loc(#loc254)
+    %tmp94 = arith.select %tmp48, %tmp91, %cst_11 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc255)
+    %tmp95 = arith.select %tmp31, %tmp82, %tmp94 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc256)
+    %tmp101 = arith.mulf %tmp96_47, %tmp75 : tensor<4x128xf32> loc(#loc257)
+    %tmp104 = tt.broadcast %tmp102_50 : tensor<1x128xf32> -> tensor<4x128xf32> loc(#loc258)
+    %tmp104_97 = arith.mulf %tmp101, %tmp104 : tensor<4x128xf32> loc(#loc258)
+    %tmp107 = arith.mulf %tmp104_97, %tmp63_44 : tensor<4x128xf32> loc(#loc259)
+    %tmp109 = arith.mulf %tmp95, %tmp66_46 : tensor<4x128xf32> loc(#loc260)
+    %tmp110 = arith.addf %tmp107, %tmp109 : tensor<4x128xf32> loc(#loc261)
+    %0 = arith.muli %xindex_16, %cst_8 : tensor<4x1xi32> loc(#loc125)
+    %1 = tt.broadcast %0 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc126)
+    %2 = arith.addi %tmp6, %1 : tensor<4x128xi32> loc(#loc126)
+    %3 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc127)
+    %4 = tt.addptr %3, %2 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc127)
+    %5 = arith.truncf %tmp68 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc128)
+    tt.store %4, %5, %tmp0_27 : tensor<4x128x!tt.ptr<bf16>> loc(#loc128)
+    %6 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc129)
+    %7 = tt.addptr %6, %2 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc129)
+    %8 = arith.truncf %tmp110 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc130)
+    tt.store %7, %8, %tmp0_27 : tensor<4x128x!tt.ptr<bf16>> loc(#loc130)
+    tt.return loc(#loc131)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc31 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc141 = loc("xoffset"(#loc2))
+#loc142 = loc("xoffset"(#loc3))
+#loc143 = loc("xindex"(#loc4))
+#loc144 = loc("xindex"(#loc5))
+#loc145 = loc("xindex"(#loc6))
+#loc146 = loc("r0_base"(#loc7))
+#loc147 = loc("r0_base"(#loc8))
+#loc148 = loc("x0"(#loc9))
+#loc149 = loc("x1"(#loc10))
+#loc150 = loc("r0_mask"(#loc11))
+#loc151 = loc("tmp0"(#loc12))
+#loc152 = loc("tmp0"(#loc13))
+#loc153 = loc("tmp0"(#loc14))
+#loc154 = loc("tmp0"(#loc15))
+#loc155 = loc("tmp0"(#loc16))
+#loc156 = loc("tmp0"(#loc17))
+#loc157 = loc("tmp0"(#loc18))
+#loc158 = loc("tmp0"(#loc19))
+#loc159 = loc("tmp6"(#loc20))
+#loc160 = loc("tmp6"(#loc21))
+#loc161 = loc("tmp6"(#loc22))
+#loc162 = loc("tmp6"(#loc23))
+#loc163 = loc("tmp6"(#loc24))
+#loc164 = loc("tmp2"(#loc25))
+#loc165 = loc("tmp5"(#loc26))
+#loc166 = loc("_tmp4"(#loc27))
+#loc167 = loc("tmp8"(#loc28))
+#loc168 = loc("tmp11"(#loc29))
+#loc169 = loc("_tmp10"(#loc30))
+#loc171 = loc("tmp4"(#loc34))
+#loc173 = loc("tmp10"(#loc36))
+#loc174 = loc("r0_3"(#loc37))
+#loc175 = loc("r0_4"(#loc38))
+#loc176 = loc("tmp58"(#loc39))
+#loc177 = loc("tmp58"(#loc40))
+#loc178 = loc("tmp58"(#loc41))
+#loc179 = loc("tmp63"(#loc42))
+#loc180 = loc("tmp63"(#loc43))
+#loc181 = loc("tmp63"(#loc44))
+#loc182 = loc("tmp63"(#loc45))
+#loc183 = loc("tmp66"(#loc46))
+#loc184 = loc("tmp66"(#loc47))
+#loc185 = loc("tmp96"(#loc48))
+#loc186 = loc("tmp96"(#loc49))
+#loc187 = loc("tmp102"(#loc50))
+#loc188 = loc("tmp102"(#loc51))
+#loc189 = loc("tmp102"(#loc52))
+#loc190 = loc("tmp16"(#loc53))
+#loc191 = loc("tmp17"(#loc54))
+#loc192 = loc("tmp17"(#loc55))
+#loc193 = loc("tmp17"(#loc56))
+#loc194 = loc("tmp17"(#loc57))
+#loc195 = loc("tmp17"(#loc58))
+#loc196 = loc("tmp17"(#loc59))
+#loc197 = loc("tmp17"(#loc60))
+#loc198 = loc("tmp17"(#loc61))
+#loc199 = loc("tmp20"(#loc62))
+#loc200 = loc("tmp22"(#loc63))
+#loc201 = loc("tmp23"(#loc64))
+#loc202 = loc("tmp24"(#loc65))
+#loc203 = loc("tmp25"(#loc66))
+#loc204 = loc("tmp25"(#loc67))
+#loc205 = loc("tmp25"(#loc68))
+#loc206 = loc("tmp27"(#loc69))
+#loc207 = loc("tmp29"(#loc70))
+#loc208 = loc("tmp31"(#loc71))
+#loc209 = loc("tmp32"(#loc72))
+#loc210 = loc("tmp35"(#loc73))
+#loc211 = loc("tmp35"(#loc74))
+#loc212 = loc("tmp35"(#loc75))
+#loc213 = loc("tmp35"(#loc76))
+#loc214 = loc("tmp35"(#loc77))
+#loc215 = loc("tmp35"(#loc78))
+#loc216 = loc("tmp42"(#loc79))
+#loc217 = loc("tmp43"(#loc80))
+#loc218 = loc("tmp43"(#loc81))
+#loc219 = loc("tmp43"(#loc82))
+#loc220 = loc("tmp45"(#loc83))
+#loc221 = loc("tmp48"(#loc84))
+#loc222 = loc("tmp49"(#loc85))
+#loc223 = loc("tmp57"(#loc86))
+#loc224 = loc("tmp60"(#loc87))
+#loc225 = loc("tmp64"(#loc88))
+#loc226 = loc("tmp67"(#loc89))
+#loc227 = loc("tmp68"(#loc90))
+#loc228 = loc("tmp70"(#loc91))
+#loc229 = loc("tmp70"(#loc92))
+#loc230 = loc("tmp70"(#loc93))
+#loc231 = loc("tmp70"(#loc94))
+#loc232 = loc("tmp70"(#loc95))
+#loc233 = loc("tmp70"(#loc96))
+#loc234 = loc("tmp72"(#loc97))
+#loc235 = loc("tmp73"(#loc98))
+#loc236 = loc("tmp74"(#loc99))
+#loc237 = loc("tmp75"(#loc100))
+#loc238 = loc("tmp76"(#loc101))
+#loc239 = loc("tmp76"(#loc102))
+#loc240 = loc("tmp76"(#loc103))
+#loc241 = loc("tmp78"(#loc104))
+#loc242 = loc("tmp80"(#loc105))
+#loc243 = loc("tmp82"(#loc106))
+#loc244 = loc("tmp83"(#loc107))
+#loc245 = loc("tmp83"(#loc108))
+#loc246 = loc("tmp83"(#loc109))
+#loc247 = loc("tmp83"(#loc110))
+#loc248 = loc("tmp83"(#loc111))
+#loc249 = loc("tmp83"(#loc112))
+#loc250 = loc("tmp88"(#loc113))
+#loc251 = loc("tmp89"(#loc114))
+#loc252 = loc("tmp89"(#loc115))
+#loc253 = loc("tmp89"(#loc116))
+#loc254 = loc("tmp91"(#loc117))
+#loc255 = loc("tmp94"(#loc118))
+#loc256 = loc("tmp95"(#loc119))
+#loc257 = loc("tmp101"(#loc120))
+#loc258 = loc("tmp104"(#loc121))
+#loc259 = loc("tmp107"(#loc122))
+#loc260 = loc("tmp109"(#loc123))
+#loc261 = loc("tmp110"(#loc124))
+#loc262 = loc(callsite(#loc31 at #loc170))
+#loc264 = loc(callsite(#loc31 at #loc172))
+#loc266 = loc(callsite(#loc33 at #loc262))
+#loc267 = loc(callsite(#loc33 at #loc264))
diff --git a/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..87a9287a16b727ccbefa80c12edc1145ced3dc8d
--- /dev/null
+++ b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused__fused_rms_norm_cat_view_2.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.source", "triton_poi_fused__fused_rms_norm_cat_view_2.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.ttir", "triton_poi_fused__fused_rms_norm_cat_view_2.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir", "triton_poi_fused__fused_rms_norm_cat_view_2.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.llir", "triton_poi_fused__fused_rms_norm_cat_view_2.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.ptx", "triton_poi_fused__fused_rms_norm_cat_view_2.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.cubin", "triton_poi_fused__fused_rms_norm_cat_view_2.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.json"}}
\ No newline at end of file
diff --git a/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.cubin b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..693d9afb8c5a403bd215eafdff67ac38688b25a1
Binary files /dev/null and b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.cubin differ
diff --git a/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.json b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..2f70ae834ed89b91859a6198234a96601eb158f5
--- /dev/null
+++ b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.json
@@ -0,0 +1 @@
+{"hash": "8b5ac7dad97c70bc8c6fdc1abfc7b2afc9899bf6ba289559402189c0d11cd710", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 16384, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__fused_rms_norm_cat_view_2"}
\ No newline at end of file
diff --git a/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.llir b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.llir
new file mode 100644
index 0000000000000000000000000000000000000000..a922420883645796bcc237e1fe85fc1542c99b56
--- /dev/null
+++ b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.llir
@@ -0,0 +1,1329 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused__fused_rms_norm_cat_view_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 {
+  %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !dbg !8
+  %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z(), !dbg !9
+  %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y(), !dbg !10
+  %15 = mul nuw i32 %13, %14, !dbg !11
+  %16 = add nuw i32 %15, %12, !dbg !12
+  %17 = shl i32 %16, 6, !dbg !13
+  %18 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !14
+  %19 = lshr i32 %18, 3, !dbg !14
+  %20 = and i32 %19, 31, !dbg !14
+  %21 = shl nuw nsw i32 %18, 2, !dbg !14
+  %22 = and i32 %21, 60, !dbg !14
+  %23 = or disjoint i32 %17, %20, !dbg !15
+  %24 = or disjoint i32 %23, 32, !dbg !15
+  %25 = or disjoint i32 %17, %22, !dbg !15
+  %26 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !16
+  %27 = shl i32 %26, 6, !dbg !17
+  %28 = and i32 %18, 7, !dbg !18
+  %29 = shl nuw nsw i32 %28, 3, !dbg !18
+  %30 = lshr i32 %18, 4, !dbg !18
+  %31 = and i32 %30, 15, !dbg !18
+  %32 = or disjoint i32 %29, %27, !dbg !19
+  %33 = or disjoint i32 %31, %27, !dbg !19
+  %34 = icmp slt i32 %32, 128, !dbg !20
+  %35 = icmp slt i32 %33, 128, !dbg !20
+  %36 = sdiv i32 %23, 32, !dbg !21
+  %37 = sdiv i32 %24, 32, !dbg !21
+  %38 = sdiv i32 %25, 32, !dbg !21
+  %39 = mul i32 %36, 32, !dbg !22
+  %.decomposed = sub i32 %23, %39, !dbg !22
+  %40 = mul i32 %38, 32, !dbg !22
+  %.decomposed109 = sub i32 %25, %40, !dbg !22
+  %41 = icmp slt i32 %23, 8192, !dbg !23
+  %42 = icmp slt i32 %25, 8192, !dbg !23
+  %43 = shl nsw i32 %.decomposed, 7, !dbg !24
+  %44 = add i32 %43, %32, !dbg !25
+  %45 = mul i32 %36, 12288, !dbg !26
+  %46 = mul i32 %37, 12288, !dbg !26
+  %47 = add i32 %44, %45, !dbg !27
+  %48 = add i32 %44, %46, !dbg !27
+  %49 = sext i32 %47 to i64, !dbg !28
+  %50 = getelementptr bfloat, ptr addrspace(1) %0, i64 %49, !dbg !28
+  %51 = sext i32 %48 to i64, !dbg !28
+  %52 = getelementptr bfloat, ptr addrspace(1) %0, i64 %51, !dbg !28
+  %53 = and i1 %34, %41, !dbg !29
+  %54 = and i1 %35, %42, !dbg !29
+  %55 = icmp slt i32 %23, 8160, !dbg !30
+  %56 = and i1 %34, %55, !dbg !30
+  %57 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !31
+  %58 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %50, i64 %57, i1 %53) #5, !dbg !31
+  %59 = extractvalue { i32, i32, i32, i32 } %58, 0, !dbg !31
+  %60 = extractvalue { i32, i32, i32, i32 } %58, 1, !dbg !31
+  %61 = extractvalue { i32, i32, i32, i32 } %58, 2, !dbg !31
+  %62 = extractvalue { i32, i32, i32, i32 } %58, 3, !dbg !31
+  %63 = insertelement <2 x i32> poison, i32 %59, i64 0, !dbg !31
+  %64 = insertelement <2 x i32> %63, i32 %60, i64 1, !dbg !31
+  %65 = lshr <2 x i32> %64, splat (i32 16), !dbg !31
+  %66 = trunc nuw <2 x i32> %65 to <2 x i16>, !dbg !31
+  %67 = insertelement <2 x i32> poison, i32 %61, i64 0, !dbg !31
+  %68 = insertelement <2 x i32> %67, i32 %62, i64 1, !dbg !31
+  %69 = lshr <2 x i32> %68, splat (i32 16), !dbg !31
+  %70 = trunc nuw <2 x i32> %69 to <2 x i16>, !dbg !31
+  %71 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !31
+  %72 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %52, i64 %71, i1 %56) #5, !dbg !31
+  %73 = extractvalue { i32, i32, i32, i32 } %72, 0, !dbg !31
+  %74 = extractvalue { i32, i32, i32, i32 } %72, 1, !dbg !31
+  %75 = extractvalue { i32, i32, i32, i32 } %72, 2, !dbg !31
+  %76 = extractvalue { i32, i32, i32, i32 } %72, 3, !dbg !31
+  %77 = insertelement <2 x i32> poison, i32 %73, i64 0, !dbg !31
+  %78 = insertelement <2 x i32> %77, i32 %74, i64 1, !dbg !31
+  %79 = lshr <2 x i32> %78, splat (i32 16), !dbg !31
+  %80 = trunc nuw <2 x i32> %79 to <2 x i16>, !dbg !31
+  %81 = insertelement <2 x i32> poison, i32 %75, i64 0, !dbg !31
+  %82 = insertelement <2 x i32> %81, i32 %76, i64 1, !dbg !31
+  %83 = lshr <2 x i32> %82, splat (i32 16), !dbg !31
+  %84 = trunc nuw <2 x i32> %83 to <2 x i16>, !dbg !31
+  %85 = and i32 %18, 24, !dbg !32
+  %86 = shl nuw nsw i32 %85, 5, !dbg !32
+  %87 = shl nuw nsw i32 %28, 4, !dbg !32
+  %88 = lshr exact i32 %85, 1, !dbg !32
+  %89 = and i32 %18, 96, !dbg !32
+  %90 = lshr exact i32 %89, 3, !dbg !32
+  %91 = and i32 %18, 128, !dbg !32
+  %92 = icmp eq i32 %91, 0, !dbg !32
+  %93 = select i1 %92, i32 0, i32 1040, !dbg !32
+  %94 = xor i32 %88, %90, !dbg !32
+  %95 = or disjoint i32 %94, %86, !dbg !32
+  %96 = or disjoint i32 %95, %87, !dbg !32
+  %97 = xor i32 %96, %93, !dbg !32
+  %98 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %97, !dbg !32
+  %99 = trunc i32 %59 to i16, !dbg !32
+  %100 = trunc i32 %60 to i16, !dbg !32
+  %101 = insertelement <2 x i16> poison, i16 %99, i64 0, !dbg !32
+  %102 = insertelement <2 x i16> %101, i16 %100, i64 1, !dbg !32
+  store <2 x i16> %102, ptr addrspace(3) %98, align 4, !dbg !32
+  %103 = getelementptr inbounds nuw i8, ptr addrspace(3) %98, i32 128, !dbg !32
+  %104 = trunc i32 %61 to i16, !dbg !32
+  %105 = trunc i32 %62 to i16, !dbg !32
+  %106 = insertelement <2 x i16> poison, i16 %104, i64 0, !dbg !32
+  %107 = insertelement <2 x i16> %106, i16 %105, i64 1, !dbg !32
+  store <2 x i16> %107, ptr addrspace(3) %103, align 4, !dbg !32
+  %108 = xor i32 %97, 4160, !dbg !32
+  %109 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %108, !dbg !32
+  store <2 x i16> %66, ptr addrspace(3) %109, align 4, !dbg !32
+  %110 = getelementptr inbounds nuw i8, ptr addrspace(3) %109, i32 128, !dbg !32
+  store <2 x i16> %70, ptr addrspace(3) %110, align 4, !dbg !32
+  %111 = xor i32 %97, 2080, !dbg !32
+  %112 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %111, !dbg !32
+  %113 = trunc i32 %73 to i16, !dbg !32
+  %114 = trunc i32 %74 to i16, !dbg !32
+  %115 = insertelement <2 x i16> poison, i16 %113, i64 0, !dbg !32
+  %116 = insertelement <2 x i16> %115, i16 %114, i64 1, !dbg !32
+  store <2 x i16> %116, ptr addrspace(3) %112, align 4, !dbg !32
+  %117 = getelementptr inbounds nuw i8, ptr addrspace(3) %112, i32 128, !dbg !32
+  %118 = trunc i32 %75 to i16, !dbg !32
+  %119 = trunc i32 %76 to i16, !dbg !32
+  %120 = insertelement <2 x i16> poison, i16 %118, i64 0, !dbg !32
+  %121 = insertelement <2 x i16> %120, i16 %119, i64 1, !dbg !32
+  store <2 x i16> %121, ptr addrspace(3) %117, align 4, !dbg !32
+  %122 = xor i32 %97, 6240, !dbg !32
+  %123 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %122, !dbg !32
+  store <2 x i16> %80, ptr addrspace(3) %123, align 4, !dbg !32
+  %124 = getelementptr inbounds nuw i8, ptr addrspace(3) %123, i32 128, !dbg !32
+  store <2 x i16> %84, ptr addrspace(3) %124, align 4, !dbg !32
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !32
+  %125 = and i32 %18, 28, !dbg !32
+  %126 = shl nuw nsw i32 %125, 8, !dbg !32
+  %127 = and i32 %21, 124, !dbg !32
+  %128 = and i32 %30, 2, !dbg !32
+  %129 = shl nuw nsw i32 %18, 1, !dbg !32
+  %130 = and i32 %129, 128, !dbg !32
+  %131 = lshr exact i32 %91, 3, !dbg !32
+  %132 = or disjoint i32 %128, %130, !dbg !32
+  %133 = or disjoint i32 %126, %127, !dbg !32
+  %134 = xor i32 %133, %131, !dbg !32
+  %135 = or disjoint i32 %132, %134, !dbg !32
+  %136 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %135, !dbg !32
+  %137 = load bfloat, ptr addrspace(3) %136, align 2, !dbg !32
+  %138 = xor i32 %135, 260, !dbg !32
+  %139 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %138, !dbg !32
+  %140 = load bfloat, ptr addrspace(3) %139, align 2, !dbg !32
+  %141 = xor i32 %135, 520, !dbg !32
+  %142 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %141, !dbg !32
+  %143 = load bfloat, ptr addrspace(3) %142, align 2, !dbg !32
+  %144 = xor i32 %135, 780, !dbg !32
+  %145 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %144, !dbg !32
+  %146 = load bfloat, ptr addrspace(3) %145, align 2, !dbg !32
+  %147 = xor i32 %135, 32, !dbg !32
+  %148 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %147, !dbg !32
+  %149 = load bfloat, ptr addrspace(3) %148, align 2, !dbg !32
+  %150 = xor i32 %135, 292, !dbg !32
+  %151 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %150, !dbg !32
+  %152 = load bfloat, ptr addrspace(3) %151, align 2, !dbg !32
+  %153 = xor i32 %135, 552, !dbg !32
+  %154 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %153, !dbg !32
+  %155 = load bfloat, ptr addrspace(3) %154, align 2, !dbg !32
+  %156 = xor i32 %135, 812, !dbg !32
+  %157 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %156, !dbg !32
+  %158 = load bfloat, ptr addrspace(3) %157, align 2, !dbg !32
+  %159 = xor i32 %135, 64, !dbg !32
+  %160 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %159, !dbg !32
+  %161 = load bfloat, ptr addrspace(3) %160, align 2, !dbg !32
+  %162 = xor i32 %135, 324, !dbg !32
+  %163 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %162, !dbg !32
+  %164 = load bfloat, ptr addrspace(3) %163, align 2, !dbg !32
+  %165 = xor i32 %135, 584, !dbg !32
+  %166 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %165, !dbg !32
+  %167 = load bfloat, ptr addrspace(3) %166, align 2, !dbg !32
+  %168 = xor i32 %135, 844, !dbg !32
+  %169 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %168, !dbg !32
+  %170 = load bfloat, ptr addrspace(3) %169, align 2, !dbg !32
+  %171 = xor i32 %135, 96, !dbg !32
+  %172 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %171, !dbg !32
+  %173 = load bfloat, ptr addrspace(3) %172, align 2, !dbg !32
+  %174 = xor i32 %135, 356, !dbg !32
+  %175 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %174, !dbg !32
+  %176 = load bfloat, ptr addrspace(3) %175, align 2, !dbg !32
+  %177 = xor i32 %135, 616, !dbg !32
+  %178 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %177, !dbg !32
+  %179 = load bfloat, ptr addrspace(3) %178, align 2, !dbg !32
+  %180 = xor i32 %135, 876, !dbg !32
+  %181 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %180, !dbg !32
+  %182 = load bfloat, ptr addrspace(3) %181, align 2, !dbg !32
+  %183 = fpext bfloat %137 to float, !dbg !32
+  %184 = fpext bfloat %140 to float, !dbg !32
+  %185 = fpext bfloat %143 to float, !dbg !32
+  %186 = fpext bfloat %146 to float, !dbg !32
+  %187 = fpext bfloat %149 to float, !dbg !32
+  %188 = fpext bfloat %152 to float, !dbg !32
+  %189 = fpext bfloat %155 to float, !dbg !32
+  %190 = fpext bfloat %158 to float, !dbg !32
+  %191 = fpext bfloat %161 to float, !dbg !32
+  %192 = fpext bfloat %164 to float, !dbg !32
+  %193 = fpext bfloat %167 to float, !dbg !32
+  %194 = fpext bfloat %170 to float, !dbg !32
+  %195 = fpext bfloat %173 to float, !dbg !32
+  %196 = fpext bfloat %176 to float, !dbg !32
+  %197 = fpext bfloat %179 to float, !dbg !32
+  %198 = fpext bfloat %182 to float, !dbg !32
+  %199 = sext i32 %25 to i64, !dbg !33
+  %200 = getelementptr float, ptr addrspace(1) %1, i64 %199, !dbg !33
+  %201 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !34
+  %202 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %200, i64 %201, i1 %54) #5, !dbg !34
+  %203 = extractvalue { i32, i32, i32, i32 } %202, 0, !dbg !34
+  %204 = extractvalue { i32, i32, i32, i32 } %202, 1, !dbg !34
+  %205 = extractvalue { i32, i32, i32, i32 } %202, 2, !dbg !34
+  %206 = extractvalue { i32, i32, i32, i32 } %202, 3, !dbg !34
+  %207 = bitcast i32 %203 to float, !dbg !34
+  %208 = bitcast i32 %204 to float, !dbg !34
+  %209 = bitcast i32 %205 to float, !dbg !34
+  %210 = bitcast i32 %206 to float, !dbg !34
+  %211 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !34
+  %212 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %200, i64 %211, i1 %54) #5, !dbg !34
+  %213 = extractvalue { i32, i32, i32, i32 } %212, 0, !dbg !34
+  %214 = extractvalue { i32, i32, i32, i32 } %212, 1, !dbg !34
+  %215 = extractvalue { i32, i32, i32, i32 } %212, 2, !dbg !34
+  %216 = extractvalue { i32, i32, i32, i32 } %212, 3, !dbg !34
+  %217 = bitcast i32 %213 to float, !dbg !34
+  %218 = bitcast i32 %214 to float, !dbg !34
+  %219 = bitcast i32 %215 to float, !dbg !34
+  %220 = bitcast i32 %216 to float, !dbg !34
+  %221 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !34
+  %222 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %200, i64 %221, i1 %54) #5, !dbg !34
+  %223 = extractvalue { i32, i32, i32, i32 } %222, 0, !dbg !34
+  %224 = extractvalue { i32, i32, i32, i32 } %222, 1, !dbg !34
+  %225 = extractvalue { i32, i32, i32, i32 } %222, 2, !dbg !34
+  %226 = extractvalue { i32, i32, i32, i32 } %222, 3, !dbg !34
+  %227 = bitcast i32 %223 to float, !dbg !34
+  %228 = bitcast i32 %224 to float, !dbg !34
+  %229 = bitcast i32 %225 to float, !dbg !34
+  %230 = bitcast i32 %226 to float, !dbg !34
+  %231 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !34
+  %232 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %200, i64 %231, i1 %54) #5, !dbg !34
+  %233 = extractvalue { i32, i32, i32, i32 } %232, 0, !dbg !34
+  %234 = extractvalue { i32, i32, i32, i32 } %232, 1, !dbg !34
+  %235 = extractvalue { i32, i32, i32, i32 } %232, 2, !dbg !34
+  %236 = extractvalue { i32, i32, i32, i32 } %232, 3, !dbg !34
+  %237 = bitcast i32 %233 to float, !dbg !34
+  %238 = bitcast i32 %234 to float, !dbg !34
+  %239 = bitcast i32 %235 to float, !dbg !34
+  %240 = bitcast i32 %236 to float, !dbg !34
+  %241 = tail call float @llvm.nvvm.div.full(float %207, float 1.280000e+02), !dbg !35
+  %242 = tail call float @llvm.nvvm.div.full(float %208, float 1.280000e+02), !dbg !35
+  %243 = tail call float @llvm.nvvm.div.full(float %209, float 1.280000e+02), !dbg !35
+  %244 = tail call float @llvm.nvvm.div.full(float %210, float 1.280000e+02), !dbg !35
+  %245 = tail call float @llvm.nvvm.div.full(float %217, float 1.280000e+02), !dbg !35
+  %246 = tail call float @llvm.nvvm.div.full(float %218, float 1.280000e+02), !dbg !35
+  %247 = tail call float @llvm.nvvm.div.full(float %219, float 1.280000e+02), !dbg !35
+  %248 = tail call float @llvm.nvvm.div.full(float %220, float 1.280000e+02), !dbg !35
+  %249 = tail call float @llvm.nvvm.div.full(float %227, float 1.280000e+02), !dbg !35
+  %250 = tail call float @llvm.nvvm.div.full(float %228, float 1.280000e+02), !dbg !35
+  %251 = tail call float @llvm.nvvm.div.full(float %229, float 1.280000e+02), !dbg !35
+  %252 = tail call float @llvm.nvvm.div.full(float %230, float 1.280000e+02), !dbg !35
+  %253 = tail call float @llvm.nvvm.div.full(float %237, float 1.280000e+02), !dbg !35
+  %254 = tail call float @llvm.nvvm.div.full(float %238, float 1.280000e+02), !dbg !35
+  %255 = tail call float @llvm.nvvm.div.full(float %239, float 1.280000e+02), !dbg !35
+  %256 = tail call float @llvm.nvvm.div.full(float %240, float 1.280000e+02), !dbg !35
+  %257 = fadd float %241, 0x3EB0C6F7A0000000, !dbg !36
+  %258 = fadd float %242, 0x3EB0C6F7A0000000, !dbg !36
+  %259 = fadd float %243, 0x3EB0C6F7A0000000, !dbg !36
+  %260 = fadd float %244, 0x3EB0C6F7A0000000, !dbg !36
+  %261 = fadd float %245, 0x3EB0C6F7A0000000, !dbg !36
+  %262 = fadd float %246, 0x3EB0C6F7A0000000, !dbg !36
+  %263 = fadd float %247, 0x3EB0C6F7A0000000, !dbg !36
+  %264 = fadd float %248, 0x3EB0C6F7A0000000, !dbg !36
+  %265 = fadd float %249, 0x3EB0C6F7A0000000, !dbg !36
+  %266 = fadd float %250, 0x3EB0C6F7A0000000, !dbg !36
+  %267 = fadd float %251, 0x3EB0C6F7A0000000, !dbg !36
+  %268 = fadd float %252, 0x3EB0C6F7A0000000, !dbg !36
+  %269 = fadd float %253, 0x3EB0C6F7A0000000, !dbg !36
+  %270 = fadd float %254, 0x3EB0C6F7A0000000, !dbg !36
+  %271 = fadd float %255, 0x3EB0C6F7A0000000, !dbg !36
+  %272 = fadd float %256, 0x3EB0C6F7A0000000, !dbg !36
+  %273 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37
+  %.not.i = icmp eq i32 %273, 0, !dbg !37
+  br i1 %.not.i, label %276, label %274, !dbg !37
+
+274:                                              ; preds = %11
+  %275 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %257), !dbg !37
+  br label %__nv_rsqrtf.exit, !dbg !37
+
+276:                                              ; preds = %11
+  %277 = tail call float @llvm.nvvm.rsqrt.approx.f(float %257), !dbg !37
+  br label %__nv_rsqrtf.exit, !dbg !37
+
+__nv_rsqrtf.exit:                                 ; preds = %274, %276
+  %.0.i = phi float [ %275, %274 ], [ %277, %276 ], !dbg !37
+  %278 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37
+  %.not.i16 = icmp eq i32 %278, 0, !dbg !37
+  br i1 %.not.i16, label %281, label %279, !dbg !37
+
+279:                                              ; preds = %__nv_rsqrtf.exit
+  %280 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %258), !dbg !37
+  br label %__nv_rsqrtf.exit18, !dbg !37
+
+281:                                              ; preds = %__nv_rsqrtf.exit
+  %282 = tail call float @llvm.nvvm.rsqrt.approx.f(float %258), !dbg !37
+  br label %__nv_rsqrtf.exit18, !dbg !37
+
+__nv_rsqrtf.exit18:                               ; preds = %279, %281
+  %.0.i17 = phi float [ %280, %279 ], [ %282, %281 ], !dbg !37
+  %283 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37
+  %.not.i19 = icmp eq i32 %283, 0, !dbg !37
+  br i1 %.not.i19, label %286, label %284, !dbg !37
+
+284:                                              ; preds = %__nv_rsqrtf.exit18
+  %285 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %259), !dbg !37
+  br label %__nv_rsqrtf.exit21, !dbg !37
+
+286:                                              ; preds = %__nv_rsqrtf.exit18
+  %287 = tail call float @llvm.nvvm.rsqrt.approx.f(float %259), !dbg !37
+  br label %__nv_rsqrtf.exit21, !dbg !37
+
+__nv_rsqrtf.exit21:                               ; preds = %284, %286
+  %.0.i20 = phi float [ %285, %284 ], [ %287, %286 ], !dbg !37
+  %288 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37
+  %.not.i22 = icmp eq i32 %288, 0, !dbg !37
+  br i1 %.not.i22, label %291, label %289, !dbg !37
+
+289:                                              ; preds = %__nv_rsqrtf.exit21
+  %290 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %260), !dbg !37
+  br label %__nv_rsqrtf.exit24, !dbg !37
+
+291:                                              ; preds = %__nv_rsqrtf.exit21
+  %292 = tail call float @llvm.nvvm.rsqrt.approx.f(float %260), !dbg !37
+  br label %__nv_rsqrtf.exit24, !dbg !37
+
+__nv_rsqrtf.exit24:                               ; preds = %289, %291
+  %.0.i23 = phi float [ %290, %289 ], [ %292, %291 ], !dbg !37
+  %293 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37
+  %.not.i25 = icmp eq i32 %293, 0, !dbg !37
+  br i1 %.not.i25, label %296, label %294, !dbg !37
+
+294:                                              ; preds = %__nv_rsqrtf.exit24
+  %295 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %261), !dbg !37
+  br label %__nv_rsqrtf.exit27, !dbg !37
+
+296:                                              ; preds = %__nv_rsqrtf.exit24
+  %297 = tail call float @llvm.nvvm.rsqrt.approx.f(float %261), !dbg !37
+  br label %__nv_rsqrtf.exit27, !dbg !37
+
+__nv_rsqrtf.exit27:                               ; preds = %294, %296
+  %.0.i26 = phi float [ %295, %294 ], [ %297, %296 ], !dbg !37
+  %298 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37
+  %.not.i28 = icmp eq i32 %298, 0, !dbg !37
+  br i1 %.not.i28, label %301, label %299, !dbg !37
+
+299:                                              ; preds = %__nv_rsqrtf.exit27
+  %300 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %262), !dbg !37
+  br label %__nv_rsqrtf.exit30, !dbg !37
+
+301:                                              ; preds = %__nv_rsqrtf.exit27
+  %302 = tail call float @llvm.nvvm.rsqrt.approx.f(float %262), !dbg !37
+  br label %__nv_rsqrtf.exit30, !dbg !37
+
+__nv_rsqrtf.exit30:                               ; preds = %299, %301
+  %.0.i29 = phi float [ %300, %299 ], [ %302, %301 ], !dbg !37
+  %303 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37
+  %.not.i31 = icmp eq i32 %303, 0, !dbg !37
+  br i1 %.not.i31, label %306, label %304, !dbg !37
+
+304:                                              ; preds = %__nv_rsqrtf.exit30
+  %305 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %263), !dbg !37
+  br label %__nv_rsqrtf.exit33, !dbg !37
+
+306:                                              ; preds = %__nv_rsqrtf.exit30
+  %307 = tail call float @llvm.nvvm.rsqrt.approx.f(float %263), !dbg !37
+  br label %__nv_rsqrtf.exit33, !dbg !37
+
+__nv_rsqrtf.exit33:                               ; preds = %304, %306
+  %.0.i32 = phi float [ %305, %304 ], [ %307, %306 ], !dbg !37
+  %308 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37
+  %.not.i34 = icmp eq i32 %308, 0, !dbg !37
+  br i1 %.not.i34, label %311, label %309, !dbg !37
+
+309:                                              ; preds = %__nv_rsqrtf.exit33
+  %310 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %264), !dbg !37
+  br label %__nv_rsqrtf.exit36, !dbg !37
+
+311:                                              ; preds = %__nv_rsqrtf.exit33
+  %312 = tail call float @llvm.nvvm.rsqrt.approx.f(float %264), !dbg !37
+  br label %__nv_rsqrtf.exit36, !dbg !37
+
+__nv_rsqrtf.exit36:                               ; preds = %309, %311
+  %.0.i35 = phi float [ %310, %309 ], [ %312, %311 ], !dbg !37
+  %313 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37
+  %.not.i37 = icmp eq i32 %313, 0, !dbg !37
+  br i1 %.not.i37, label %316, label %314, !dbg !37
+
+314:                                              ; preds = %__nv_rsqrtf.exit36
+  %315 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %265), !dbg !37
+  br label %__nv_rsqrtf.exit39, !dbg !37
+
+316:                                              ; preds = %__nv_rsqrtf.exit36
+  %317 = tail call float @llvm.nvvm.rsqrt.approx.f(float %265), !dbg !37
+  br label %__nv_rsqrtf.exit39, !dbg !37
+
+__nv_rsqrtf.exit39:                               ; preds = %314, %316
+  %.0.i38 = phi float [ %315, %314 ], [ %317, %316 ], !dbg !37
+  %318 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37
+  %.not.i40 = icmp eq i32 %318, 0, !dbg !37
+  br i1 %.not.i40, label %321, label %319, !dbg !37
+
+319:                                              ; preds = %__nv_rsqrtf.exit39
+  %320 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %266), !dbg !37
+  br label %__nv_rsqrtf.exit42, !dbg !37
+
+321:                                              ; preds = %__nv_rsqrtf.exit39
+  %322 = tail call float @llvm.nvvm.rsqrt.approx.f(float %266), !dbg !37
+  br label %__nv_rsqrtf.exit42, !dbg !37
+
+__nv_rsqrtf.exit42:                               ; preds = %319, %321
+  %.0.i41 = phi float [ %320, %319 ], [ %322, %321 ], !dbg !37
+  %323 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37
+  %.not.i43 = icmp eq i32 %323, 0, !dbg !37
+  br i1 %.not.i43, label %326, label %324, !dbg !37
+
+324:                                              ; preds = %__nv_rsqrtf.exit42
+  %325 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %267), !dbg !37
+  br label %__nv_rsqrtf.exit45, !dbg !37
+
+326:                                              ; preds = %__nv_rsqrtf.exit42
+  %327 = tail call float @llvm.nvvm.rsqrt.approx.f(float %267), !dbg !37
+  br label %__nv_rsqrtf.exit45, !dbg !37
+
+__nv_rsqrtf.exit45:                               ; preds = %324, %326
+  %.0.i44 = phi float [ %325, %324 ], [ %327, %326 ], !dbg !37
+  %328 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37
+  %.not.i46 = icmp eq i32 %328, 0, !dbg !37
+  br i1 %.not.i46, label %331, label %329, !dbg !37
+
+329:                                              ; preds = %__nv_rsqrtf.exit45
+  %330 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %268), !dbg !37
+  br label %__nv_rsqrtf.exit48, !dbg !37
+
+331:                                              ; preds = %__nv_rsqrtf.exit45
+  %332 = tail call float @llvm.nvvm.rsqrt.approx.f(float %268), !dbg !37
+  br label %__nv_rsqrtf.exit48, !dbg !37
+
+__nv_rsqrtf.exit48:                               ; preds = %329, %331
+  %.0.i47 = phi float [ %330, %329 ], [ %332, %331 ], !dbg !37
+  %333 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37
+  %.not.i49 = icmp eq i32 %333, 0, !dbg !37
+  br i1 %.not.i49, label %336, label %334, !dbg !37
+
+334:                                              ; preds = %__nv_rsqrtf.exit48
+  %335 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %269), !dbg !37
+  br label %__nv_rsqrtf.exit51, !dbg !37
+
+336:                                              ; preds = %__nv_rsqrtf.exit48
+  %337 = tail call float @llvm.nvvm.rsqrt.approx.f(float %269), !dbg !37
+  br label %__nv_rsqrtf.exit51, !dbg !37
+
+__nv_rsqrtf.exit51:                               ; preds = %334, %336
+  %.0.i50 = phi float [ %335, %334 ], [ %337, %336 ], !dbg !37
+  %338 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37
+  %.not.i52 = icmp eq i32 %338, 0, !dbg !37
+  br i1 %.not.i52, label %341, label %339, !dbg !37
+
+339:                                              ; preds = %__nv_rsqrtf.exit51
+  %340 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %270), !dbg !37
+  br label %__nv_rsqrtf.exit54, !dbg !37
+
+341:                                              ; preds = %__nv_rsqrtf.exit51
+  %342 = tail call float @llvm.nvvm.rsqrt.approx.f(float %270), !dbg !37
+  br label %__nv_rsqrtf.exit54, !dbg !37
+
+__nv_rsqrtf.exit54:                               ; preds = %339, %341
+  %.0.i53 = phi float [ %340, %339 ], [ %342, %341 ], !dbg !37
+  %343 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37
+  %.not.i55 = icmp eq i32 %343, 0, !dbg !37
+  br i1 %.not.i55, label %346, label %344, !dbg !37
+
+344:                                              ; preds = %__nv_rsqrtf.exit54
+  %345 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %271), !dbg !37
+  br label %__nv_rsqrtf.exit57, !dbg !37
+
+346:                                              ; preds = %__nv_rsqrtf.exit54
+  %347 = tail call float @llvm.nvvm.rsqrt.approx.f(float %271), !dbg !37
+  br label %__nv_rsqrtf.exit57, !dbg !37
+
+__nv_rsqrtf.exit57:                               ; preds = %344, %346
+  %.0.i56 = phi float [ %345, %344 ], [ %347, %346 ], !dbg !37
+  %348 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !37
+  %.not.i58 = icmp eq i32 %348, 0, !dbg !37
+  br i1 %.not.i58, label %351, label %349, !dbg !37
+
+349:                                              ; preds = %__nv_rsqrtf.exit57
+  %350 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %272), !dbg !37
+  br label %__nv_rsqrtf.exit60, !dbg !37
+
+351:                                              ; preds = %__nv_rsqrtf.exit57
+  %352 = tail call float @llvm.nvvm.rsqrt.approx.f(float %272), !dbg !37
+  br label %__nv_rsqrtf.exit60, !dbg !37
+
+__nv_rsqrtf.exit60:                               ; preds = %349, %351
+  %.0.i59 = phi float [ %350, %349 ], [ %352, %351 ], !dbg !37
+  %353 = fmul float %.0.i, %183, !dbg !38
+  %354 = fmul float %.0.i17, %184, !dbg !38
+  %355 = fmul float %.0.i20, %185, !dbg !38
+  %356 = fmul float %.0.i23, %186, !dbg !38
+  %357 = fmul float %.0.i26, %187, !dbg !38
+  %358 = fmul float %.0.i29, %188, !dbg !38
+  %359 = fmul float %.0.i32, %189, !dbg !38
+  %360 = fmul float %.0.i35, %190, !dbg !38
+  %361 = fmul float %.0.i38, %191, !dbg !38
+  %362 = fmul float %.0.i41, %192, !dbg !38
+  %363 = fmul float %.0.i44, %193, !dbg !38
+  %364 = fmul float %.0.i47, %194, !dbg !38
+  %365 = fmul float %.0.i50, %195, !dbg !38
+  %366 = fmul float %.0.i53, %196, !dbg !38
+  %367 = fmul float %.0.i56, %197, !dbg !38
+  %368 = fmul float %.0.i59, %198, !dbg !38
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !38
+  %369 = shl nuw nsw i32 %125, 9, !dbg !38
+  %370 = shl nuw nsw i32 %89, 2, !dbg !38
+  %371 = lshr i32 %18, 1, !dbg !38
+  %372 = and i32 %371, 76, !dbg !38
+  %373 = or disjoint i32 %369, %87, !dbg !38
+  %374 = or disjoint i32 %370, %372, !dbg !38
+  %375 = xor i32 %373, %374, !dbg !38
+  %376 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %375, !dbg !38
+  store float %353, ptr addrspace(3) %376, align 4, !dbg !38
+  %377 = xor i32 %375, 528, !dbg !38
+  %378 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %377, !dbg !38
+  store float %354, ptr addrspace(3) %378, align 4, !dbg !38
+  %379 = xor i32 %375, 1056, !dbg !38
+  %380 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %379, !dbg !38
+  store float %355, ptr addrspace(3) %380, align 4, !dbg !38
+  %381 = xor i32 %375, 1584, !dbg !38
+  %382 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %381, !dbg !38
+  store float %356, ptr addrspace(3) %382, align 4, !dbg !38
+  %383 = xor i32 %375, 4, !dbg !38
+  %384 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %383, !dbg !38
+  store float %357, ptr addrspace(3) %384, align 4, !dbg !38
+  %385 = xor i32 %375, 532, !dbg !38
+  %386 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %385, !dbg !38
+  store float %358, ptr addrspace(3) %386, align 4, !dbg !38
+  %387 = xor i32 %375, 1060, !dbg !38
+  %388 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %387, !dbg !38
+  store float %359, ptr addrspace(3) %388, align 4, !dbg !38
+  %389 = xor i32 %375, 1588, !dbg !38
+  %390 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %389, !dbg !38
+  store float %360, ptr addrspace(3) %390, align 4, !dbg !38
+  %391 = xor i32 %375, 8, !dbg !38
+  %392 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %391, !dbg !38
+  store float %361, ptr addrspace(3) %392, align 4, !dbg !38
+  %393 = xor i32 %375, 536, !dbg !38
+  %394 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %393, !dbg !38
+  store float %362, ptr addrspace(3) %394, align 4, !dbg !38
+  %395 = xor i32 %375, 1064, !dbg !38
+  %396 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %395, !dbg !38
+  store float %363, ptr addrspace(3) %396, align 4, !dbg !38
+  %397 = xor i32 %375, 1592, !dbg !38
+  %398 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %397, !dbg !38
+  store float %364, ptr addrspace(3) %398, align 4, !dbg !38
+  %399 = xor i32 %375, 12, !dbg !38
+  %400 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %399, !dbg !38
+  store float %365, ptr addrspace(3) %400, align 4, !dbg !38
+  %401 = xor i32 %375, 540, !dbg !38
+  %402 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %401, !dbg !38
+  store float %366, ptr addrspace(3) %402, align 4, !dbg !38
+  %403 = xor i32 %375, 1068, !dbg !38
+  %404 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %403, !dbg !38
+  store float %367, ptr addrspace(3) %404, align 4, !dbg !38
+  %405 = xor i32 %375, 1596, !dbg !38
+  %406 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %405, !dbg !38
+  store float %368, ptr addrspace(3) %406, align 4, !dbg !38
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !38
+  %407 = shl nuw nsw i32 %18, 6, !dbg !38
+  %408 = and i32 %407, 1600, !dbg !38
+  %409 = and i32 %129, 60, !dbg !38
+  %410 = lshr exact i32 %89, 1, !dbg !38
+  %411 = select i1 %92, i32 0, i32 2112, !dbg !38
+  %412 = or disjoint i32 %408, %409, !dbg !38
+  %413 = or disjoint i32 %411, %410, !dbg !38
+  %414 = xor i32 %413, %412, !dbg !38
+  %415 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %414, !dbg !38
+  %416 = load float, ptr addrspace(3) %415, align 4, !dbg !38
+  %417 = getelementptr inbounds nuw i8, ptr addrspace(3) %415, i32 128, !dbg !38
+  %418 = load float, ptr addrspace(3) %417, align 4, !dbg !38
+  %419 = getelementptr inbounds nuw i8, ptr addrspace(3) %415, i32 256, !dbg !38
+  %420 = load float, ptr addrspace(3) %419, align 4, !dbg !38
+  %421 = getelementptr inbounds nuw i8, ptr addrspace(3) %415, i32 384, !dbg !38
+  %422 = load float, ptr addrspace(3) %421, align 4, !dbg !38
+  %423 = xor i32 %414, 8200, !dbg !38
+  %424 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %423, !dbg !38
+  %425 = load float, ptr addrspace(3) %424, align 4, !dbg !38
+  %426 = getelementptr inbounds nuw i8, ptr addrspace(3) %424, i32 128, !dbg !38
+  %427 = load float, ptr addrspace(3) %426, align 4, !dbg !38
+  %428 = getelementptr inbounds nuw i8, ptr addrspace(3) %424, i32 256, !dbg !38
+  %429 = load float, ptr addrspace(3) %428, align 4, !dbg !38
+  %430 = getelementptr inbounds nuw i8, ptr addrspace(3) %424, i32 384, !dbg !38
+  %431 = load float, ptr addrspace(3) %430, align 4, !dbg !38
+  %432 = xor i32 %414, 4100, !dbg !38
+  %433 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %432, !dbg !38
+  %434 = load float, ptr addrspace(3) %433, align 4, !dbg !38
+  %435 = getelementptr inbounds nuw i8, ptr addrspace(3) %433, i32 128, !dbg !38
+  %436 = load float, ptr addrspace(3) %435, align 4, !dbg !38
+  %437 = getelementptr inbounds nuw i8, ptr addrspace(3) %433, i32 256, !dbg !38
+  %438 = load float, ptr addrspace(3) %437, align 4, !dbg !38
+  %439 = getelementptr inbounds nuw i8, ptr addrspace(3) %433, i32 384, !dbg !38
+  %440 = load float, ptr addrspace(3) %439, align 4, !dbg !38
+  %441 = xor i32 %414, 12300, !dbg !38
+  %442 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %441, !dbg !38
+  %443 = load float, ptr addrspace(3) %442, align 4, !dbg !38
+  %444 = getelementptr inbounds nuw i8, ptr addrspace(3) %442, i32 128, !dbg !38
+  %445 = load float, ptr addrspace(3) %444, align 4, !dbg !38
+  %446 = getelementptr inbounds nuw i8, ptr addrspace(3) %442, i32 256, !dbg !38
+  %447 = load float, ptr addrspace(3) %446, align 4, !dbg !38
+  %448 = getelementptr inbounds nuw i8, ptr addrspace(3) %442, i32 384, !dbg !38
+  %449 = load float, ptr addrspace(3) %448, align 4, !dbg !38
+  %450 = sext i32 %32 to i64, !dbg !39
+  %451 = getelementptr bfloat, ptr addrspace(1) %2, i64 %450, !dbg !39
+  %452 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !40
+  %453 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %451, i64 %452, i1 %53) #5, !dbg !40
+  %454 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !40
+  %455 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %451, i64 %454, i1 %56) #5, !dbg !40
+  %456 = add i32 %47, -3145728, !dbg !41
+  %457 = add i32 %48, -3145728, !dbg !41
+  %458 = sext i32 %456 to i64, !dbg !42
+  %459 = getelementptr bfloat, ptr addrspace(1) %3, i64 %458, !dbg !42
+  %460 = sext i32 %457 to i64, !dbg !42
+  %461 = getelementptr bfloat, ptr addrspace(1) %3, i64 %460, !dbg !42
+  %462 = add i32 %17, -8192, !dbg !43
+  %463 = icmp ult i32 %462, 65536, !dbg !43
+  %464 = and i1 %34, %463, !dbg !43
+  %465 = add i32 %17, -8160, !dbg !43
+  %466 = icmp ult i32 %465, 65568, !dbg !43
+  %467 = and i1 %34, %466, !dbg !43
+  %468 = and i1 %35, %463, !dbg !43
+  %469 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !44
+  %470 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %459, i64 %469, i1 %464) #5, !dbg !44
+  %471 = extractvalue { i32, i32, i32, i32 } %470, 0, !dbg !44
+  %472 = extractvalue { i32, i32, i32, i32 } %470, 1, !dbg !44
+  %473 = extractvalue { i32, i32, i32, i32 } %470, 2, !dbg !44
+  %474 = extractvalue { i32, i32, i32, i32 } %470, 3, !dbg !44
+  %475 = insertelement <2 x i32> poison, i32 %471, i64 0, !dbg !44
+  %476 = insertelement <2 x i32> %475, i32 %472, i64 1, !dbg !44
+  %477 = lshr <2 x i32> %476, splat (i32 16), !dbg !44
+  %478 = trunc nuw <2 x i32> %477 to <2 x i16>, !dbg !44
+  %479 = insertelement <2 x i32> poison, i32 %473, i64 0, !dbg !44
+  %480 = insertelement <2 x i32> %479, i32 %474, i64 1, !dbg !44
+  %481 = lshr <2 x i32> %480, splat (i32 16), !dbg !44
+  %482 = trunc nuw <2 x i32> %481 to <2 x i16>, !dbg !44
+  %483 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !44
+  %484 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %461, i64 %483, i1 %467) #5, !dbg !44
+  %485 = extractvalue { i32, i32, i32, i32 } %484, 0, !dbg !44
+  %486 = extractvalue { i32, i32, i32, i32 } %484, 1, !dbg !44
+  %487 = extractvalue { i32, i32, i32, i32 } %484, 2, !dbg !44
+  %488 = extractvalue { i32, i32, i32, i32 } %484, 3, !dbg !44
+  %489 = insertelement <2 x i32> poison, i32 %485, i64 0, !dbg !44
+  %490 = insertelement <2 x i32> %489, i32 %486, i64 1, !dbg !44
+  %491 = lshr <2 x i32> %490, splat (i32 16), !dbg !44
+  %492 = trunc nuw <2 x i32> %491 to <2 x i16>, !dbg !44
+  %493 = insertelement <2 x i32> poison, i32 %487, i64 0, !dbg !44
+  %494 = insertelement <2 x i32> %493, i32 %488, i64 1, !dbg !44
+  %495 = lshr <2 x i32> %494, splat (i32 16), !dbg !44
+  %496 = trunc nuw <2 x i32> %495 to <2 x i16>, !dbg !44
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45
+  %497 = trunc i32 %471 to i16, !dbg !45
+  %498 = trunc i32 %472 to i16, !dbg !45
+  %499 = insertelement <2 x i16> poison, i16 %497, i64 0, !dbg !45
+  %500 = insertelement <2 x i16> %499, i16 %498, i64 1, !dbg !45
+  store <2 x i16> %500, ptr addrspace(3) %98, align 4, !dbg !45
+  %501 = trunc i32 %473 to i16, !dbg !45
+  %502 = trunc i32 %474 to i16, !dbg !45
+  %503 = insertelement <2 x i16> poison, i16 %501, i64 0, !dbg !45
+  %504 = insertelement <2 x i16> %503, i16 %502, i64 1, !dbg !45
+  store <2 x i16> %504, ptr addrspace(3) %103, align 4, !dbg !45
+  store <2 x i16> %478, ptr addrspace(3) %109, align 4, !dbg !45
+  store <2 x i16> %482, ptr addrspace(3) %110, align 4, !dbg !45
+  %505 = trunc i32 %485 to i16, !dbg !45
+  %506 = trunc i32 %486 to i16, !dbg !45
+  %507 = insertelement <2 x i16> poison, i16 %505, i64 0, !dbg !45
+  %508 = insertelement <2 x i16> %507, i16 %506, i64 1, !dbg !45
+  store <2 x i16> %508, ptr addrspace(3) %112, align 4, !dbg !45
+  %509 = trunc i32 %487 to i16, !dbg !45
+  %510 = trunc i32 %488 to i16, !dbg !45
+  %511 = insertelement <2 x i16> poison, i16 %509, i64 0, !dbg !45
+  %512 = insertelement <2 x i16> %511, i16 %510, i64 1, !dbg !45
+  store <2 x i16> %512, ptr addrspace(3) %117, align 4, !dbg !45
+  store <2 x i16> %492, ptr addrspace(3) %123, align 4, !dbg !45
+  store <2 x i16> %496, ptr addrspace(3) %124, align 4, !dbg !45
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45
+  %513 = load bfloat, ptr addrspace(3) %136, align 2, !dbg !45
+  %514 = load bfloat, ptr addrspace(3) %139, align 2, !dbg !45
+  %515 = load bfloat, ptr addrspace(3) %142, align 2, !dbg !45
+  %516 = load bfloat, ptr addrspace(3) %145, align 2, !dbg !45
+  %517 = load bfloat, ptr addrspace(3) %148, align 2, !dbg !45
+  %518 = load bfloat, ptr addrspace(3) %151, align 2, !dbg !45
+  %519 = load bfloat, ptr addrspace(3) %154, align 2, !dbg !45
+  %520 = load bfloat, ptr addrspace(3) %157, align 2, !dbg !45
+  %521 = load bfloat, ptr addrspace(3) %160, align 2, !dbg !45
+  %522 = load bfloat, ptr addrspace(3) %163, align 2, !dbg !45
+  %523 = load bfloat, ptr addrspace(3) %166, align 2, !dbg !45
+  %524 = load bfloat, ptr addrspace(3) %169, align 2, !dbg !45
+  %525 = load bfloat, ptr addrspace(3) %172, align 2, !dbg !45
+  %526 = load bfloat, ptr addrspace(3) %175, align 2, !dbg !45
+  %527 = load bfloat, ptr addrspace(3) %178, align 2, !dbg !45
+  %528 = load bfloat, ptr addrspace(3) %181, align 2, !dbg !45
+  %529 = shl nsw i32 %38, 5, !dbg !46
+  %530 = add nsw i32 %.decomposed109, -8192, !dbg !46
+  %531 = add i32 %530, %529, !dbg !47
+  %532 = sext i32 %531 to i64, !dbg !48
+  %533 = getelementptr float, ptr addrspace(1) %4, i64 %532, !dbg !48
+  %534 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !49
+  %535 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %533, i64 %534, i1 %468) #5, !dbg !49
+  %536 = extractvalue { i32, i32, i32, i32 } %535, 0, !dbg !49
+  %537 = extractvalue { i32, i32, i32, i32 } %535, 1, !dbg !49
+  %538 = extractvalue { i32, i32, i32, i32 } %535, 2, !dbg !49
+  %539 = extractvalue { i32, i32, i32, i32 } %535, 3, !dbg !49
+  %540 = bitcast i32 %536 to float, !dbg !49
+  %541 = bitcast i32 %537 to float, !dbg !49
+  %542 = bitcast i32 %538 to float, !dbg !49
+  %543 = bitcast i32 %539 to float, !dbg !49
+  %544 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !49
+  %545 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %533, i64 %544, i1 %468) #5, !dbg !49
+  %546 = extractvalue { i32, i32, i32, i32 } %545, 0, !dbg !49
+  %547 = extractvalue { i32, i32, i32, i32 } %545, 1, !dbg !49
+  %548 = extractvalue { i32, i32, i32, i32 } %545, 2, !dbg !49
+  %549 = extractvalue { i32, i32, i32, i32 } %545, 3, !dbg !49
+  %550 = bitcast i32 %546 to float, !dbg !49
+  %551 = bitcast i32 %547 to float, !dbg !49
+  %552 = bitcast i32 %548 to float, !dbg !49
+  %553 = bitcast i32 %549 to float, !dbg !49
+  %554 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !49
+  %555 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %533, i64 %554, i1 %468) #5, !dbg !49
+  %556 = extractvalue { i32, i32, i32, i32 } %555, 0, !dbg !49
+  %557 = extractvalue { i32, i32, i32, i32 } %555, 1, !dbg !49
+  %558 = extractvalue { i32, i32, i32, i32 } %555, 2, !dbg !49
+  %559 = extractvalue { i32, i32, i32, i32 } %555, 3, !dbg !49
+  %560 = bitcast i32 %556 to float, !dbg !49
+  %561 = bitcast i32 %557 to float, !dbg !49
+  %562 = bitcast i32 %558 to float, !dbg !49
+  %563 = bitcast i32 %559 to float, !dbg !49
+  %564 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !49
+  %565 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %533, i64 %564, i1 %468) #5, !dbg !49
+  %566 = extractvalue { i32, i32, i32, i32 } %565, 0, !dbg !49
+  %567 = extractvalue { i32, i32, i32, i32 } %565, 1, !dbg !49
+  %568 = extractvalue { i32, i32, i32, i32 } %565, 2, !dbg !49
+  %569 = extractvalue { i32, i32, i32, i32 } %565, 3, !dbg !49
+  %570 = bitcast i32 %566 to float, !dbg !49
+  %571 = bitcast i32 %567 to float, !dbg !49
+  %572 = bitcast i32 %568 to float, !dbg !49
+  %573 = bitcast i32 %569 to float, !dbg !49
+  %574 = tail call float @llvm.nvvm.div.full(float %540, float 1.280000e+02), !dbg !50
+  %575 = tail call float @llvm.nvvm.div.full(float %541, float 1.280000e+02), !dbg !50
+  %576 = tail call float @llvm.nvvm.div.full(float %542, float 1.280000e+02), !dbg !50
+  %577 = tail call float @llvm.nvvm.div.full(float %543, float 1.280000e+02), !dbg !50
+  %578 = tail call float @llvm.nvvm.div.full(float %550, float 1.280000e+02), !dbg !50
+  %579 = tail call float @llvm.nvvm.div.full(float %551, float 1.280000e+02), !dbg !50
+  %580 = tail call float @llvm.nvvm.div.full(float %552, float 1.280000e+02), !dbg !50
+  %581 = tail call float @llvm.nvvm.div.full(float %553, float 1.280000e+02), !dbg !50
+  %582 = tail call float @llvm.nvvm.div.full(float %560, float 1.280000e+02), !dbg !50
+  %583 = tail call float @llvm.nvvm.div.full(float %561, float 1.280000e+02), !dbg !50
+  %584 = tail call float @llvm.nvvm.div.full(float %562, float 1.280000e+02), !dbg !50
+  %585 = tail call float @llvm.nvvm.div.full(float %563, float 1.280000e+02), !dbg !50
+  %586 = tail call float @llvm.nvvm.div.full(float %570, float 1.280000e+02), !dbg !50
+  %587 = tail call float @llvm.nvvm.div.full(float %571, float 1.280000e+02), !dbg !50
+  %588 = tail call float @llvm.nvvm.div.full(float %572, float 1.280000e+02), !dbg !50
+  %589 = tail call float @llvm.nvvm.div.full(float %573, float 1.280000e+02), !dbg !50
+  %590 = fadd float %574, 0x3EB0C6F7A0000000, !dbg !51
+  %591 = fadd float %575, 0x3EB0C6F7A0000000, !dbg !51
+  %592 = fadd float %576, 0x3EB0C6F7A0000000, !dbg !51
+  %593 = fadd float %577, 0x3EB0C6F7A0000000, !dbg !51
+  %594 = fadd float %578, 0x3EB0C6F7A0000000, !dbg !51
+  %595 = fadd float %579, 0x3EB0C6F7A0000000, !dbg !51
+  %596 = fadd float %580, 0x3EB0C6F7A0000000, !dbg !51
+  %597 = fadd float %581, 0x3EB0C6F7A0000000, !dbg !51
+  %598 = fadd float %582, 0x3EB0C6F7A0000000, !dbg !51
+  %599 = fadd float %583, 0x3EB0C6F7A0000000, !dbg !51
+  %600 = fadd float %584, 0x3EB0C6F7A0000000, !dbg !51
+  %601 = fadd float %585, 0x3EB0C6F7A0000000, !dbg !51
+  %602 = fadd float %586, 0x3EB0C6F7A0000000, !dbg !51
+  %603 = fadd float %587, 0x3EB0C6F7A0000000, !dbg !51
+  %604 = fadd float %588, 0x3EB0C6F7A0000000, !dbg !51
+  %605 = fadd float %589, 0x3EB0C6F7A0000000, !dbg !51
+  %606 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52
+  %.not.i61 = icmp eq i32 %606, 0, !dbg !52
+  br i1 %.not.i61, label %609, label %607, !dbg !52
+
+607:                                              ; preds = %__nv_rsqrtf.exit60
+  %608 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %590), !dbg !52
+  br label %__nv_rsqrtf.exit63, !dbg !52
+
+609:                                              ; preds = %__nv_rsqrtf.exit60
+  %610 = tail call float @llvm.nvvm.rsqrt.approx.f(float %590), !dbg !52
+  br label %__nv_rsqrtf.exit63, !dbg !52
+
+__nv_rsqrtf.exit63:                               ; preds = %607, %609
+  %.0.i62 = phi float [ %608, %607 ], [ %610, %609 ], !dbg !52
+  %611 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52
+  %.not.i64 = icmp eq i32 %611, 0, !dbg !52
+  br i1 %.not.i64, label %614, label %612, !dbg !52
+
+612:                                              ; preds = %__nv_rsqrtf.exit63
+  %613 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %591), !dbg !52
+  br label %__nv_rsqrtf.exit66, !dbg !52
+
+614:                                              ; preds = %__nv_rsqrtf.exit63
+  %615 = tail call float @llvm.nvvm.rsqrt.approx.f(float %591), !dbg !52
+  br label %__nv_rsqrtf.exit66, !dbg !52
+
+__nv_rsqrtf.exit66:                               ; preds = %612, %614
+  %.0.i65 = phi float [ %613, %612 ], [ %615, %614 ], !dbg !52
+  %616 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52
+  %.not.i67 = icmp eq i32 %616, 0, !dbg !52
+  br i1 %.not.i67, label %619, label %617, !dbg !52
+
+617:                                              ; preds = %__nv_rsqrtf.exit66
+  %618 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %592), !dbg !52
+  br label %__nv_rsqrtf.exit69, !dbg !52
+
+619:                                              ; preds = %__nv_rsqrtf.exit66
+  %620 = tail call float @llvm.nvvm.rsqrt.approx.f(float %592), !dbg !52
+  br label %__nv_rsqrtf.exit69, !dbg !52
+
+__nv_rsqrtf.exit69:                               ; preds = %617, %619
+  %.0.i68 = phi float [ %618, %617 ], [ %620, %619 ], !dbg !52
+  %621 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52
+  %.not.i70 = icmp eq i32 %621, 0, !dbg !52
+  br i1 %.not.i70, label %624, label %622, !dbg !52
+
+622:                                              ; preds = %__nv_rsqrtf.exit69
+  %623 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %593), !dbg !52
+  br label %__nv_rsqrtf.exit72, !dbg !52
+
+624:                                              ; preds = %__nv_rsqrtf.exit69
+  %625 = tail call float @llvm.nvvm.rsqrt.approx.f(float %593), !dbg !52
+  br label %__nv_rsqrtf.exit72, !dbg !52
+
+__nv_rsqrtf.exit72:                               ; preds = %622, %624
+  %.0.i71 = phi float [ %623, %622 ], [ %625, %624 ], !dbg !52
+  %626 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52
+  %.not.i73 = icmp eq i32 %626, 0, !dbg !52
+  br i1 %.not.i73, label %629, label %627, !dbg !52
+
+627:                                              ; preds = %__nv_rsqrtf.exit72
+  %628 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %594), !dbg !52
+  br label %__nv_rsqrtf.exit75, !dbg !52
+
+629:                                              ; preds = %__nv_rsqrtf.exit72
+  %630 = tail call float @llvm.nvvm.rsqrt.approx.f(float %594), !dbg !52
+  br label %__nv_rsqrtf.exit75, !dbg !52
+
+__nv_rsqrtf.exit75:                               ; preds = %627, %629
+  %.0.i74 = phi float [ %628, %627 ], [ %630, %629 ], !dbg !52
+  %631 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52
+  %.not.i76 = icmp eq i32 %631, 0, !dbg !52
+  br i1 %.not.i76, label %634, label %632, !dbg !52
+
+632:                                              ; preds = %__nv_rsqrtf.exit75
+  %633 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %595), !dbg !52
+  br label %__nv_rsqrtf.exit78, !dbg !52
+
+634:                                              ; preds = %__nv_rsqrtf.exit75
+  %635 = tail call float @llvm.nvvm.rsqrt.approx.f(float %595), !dbg !52
+  br label %__nv_rsqrtf.exit78, !dbg !52
+
+__nv_rsqrtf.exit78:                               ; preds = %632, %634
+  %.0.i77 = phi float [ %633, %632 ], [ %635, %634 ], !dbg !52
+  %636 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52
+  %.not.i79 = icmp eq i32 %636, 0, !dbg !52
+  br i1 %.not.i79, label %639, label %637, !dbg !52
+
+637:                                              ; preds = %__nv_rsqrtf.exit78
+  %638 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %596), !dbg !52
+  br label %__nv_rsqrtf.exit81, !dbg !52
+
+639:                                              ; preds = %__nv_rsqrtf.exit78
+  %640 = tail call float @llvm.nvvm.rsqrt.approx.f(float %596), !dbg !52
+  br label %__nv_rsqrtf.exit81, !dbg !52
+
+__nv_rsqrtf.exit81:                               ; preds = %637, %639
+  %.0.i80 = phi float [ %638, %637 ], [ %640, %639 ], !dbg !52
+  %641 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52
+  %.not.i82 = icmp eq i32 %641, 0, !dbg !52
+  br i1 %.not.i82, label %644, label %642, !dbg !52
+
+642:                                              ; preds = %__nv_rsqrtf.exit81
+  %643 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %597), !dbg !52
+  br label %__nv_rsqrtf.exit84, !dbg !52
+
+644:                                              ; preds = %__nv_rsqrtf.exit81
+  %645 = tail call float @llvm.nvvm.rsqrt.approx.f(float %597), !dbg !52
+  br label %__nv_rsqrtf.exit84, !dbg !52
+
+__nv_rsqrtf.exit84:                               ; preds = %642, %644
+  %.0.i83 = phi float [ %643, %642 ], [ %645, %644 ], !dbg !52
+  %646 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52
+  %.not.i85 = icmp eq i32 %646, 0, !dbg !52
+  br i1 %.not.i85, label %649, label %647, !dbg !52
+
+647:                                              ; preds = %__nv_rsqrtf.exit84
+  %648 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %598), !dbg !52
+  br label %__nv_rsqrtf.exit87, !dbg !52
+
+649:                                              ; preds = %__nv_rsqrtf.exit84
+  %650 = tail call float @llvm.nvvm.rsqrt.approx.f(float %598), !dbg !52
+  br label %__nv_rsqrtf.exit87, !dbg !52
+
+__nv_rsqrtf.exit87:                               ; preds = %647, %649
+  %.0.i86 = phi float [ %648, %647 ], [ %650, %649 ], !dbg !52
+  %651 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52
+  %.not.i88 = icmp eq i32 %651, 0, !dbg !52
+  br i1 %.not.i88, label %654, label %652, !dbg !52
+
+652:                                              ; preds = %__nv_rsqrtf.exit87
+  %653 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %599), !dbg !52
+  br label %__nv_rsqrtf.exit90, !dbg !52
+
+654:                                              ; preds = %__nv_rsqrtf.exit87
+  %655 = tail call float @llvm.nvvm.rsqrt.approx.f(float %599), !dbg !52
+  br label %__nv_rsqrtf.exit90, !dbg !52
+
+__nv_rsqrtf.exit90:                               ; preds = %652, %654
+  %.0.i89 = phi float [ %653, %652 ], [ %655, %654 ], !dbg !52
+  %656 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52
+  %.not.i91 = icmp eq i32 %656, 0, !dbg !52
+  br i1 %.not.i91, label %659, label %657, !dbg !52
+
+657:                                              ; preds = %__nv_rsqrtf.exit90
+  %658 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %600), !dbg !52
+  br label %__nv_rsqrtf.exit93, !dbg !52
+
+659:                                              ; preds = %__nv_rsqrtf.exit90
+  %660 = tail call float @llvm.nvvm.rsqrt.approx.f(float %600), !dbg !52
+  br label %__nv_rsqrtf.exit93, !dbg !52
+
+__nv_rsqrtf.exit93:                               ; preds = %657, %659
+  %.0.i92 = phi float [ %658, %657 ], [ %660, %659 ], !dbg !52
+  %661 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52
+  %.not.i94 = icmp eq i32 %661, 0, !dbg !52
+  br i1 %.not.i94, label %664, label %662, !dbg !52
+
+662:                                              ; preds = %__nv_rsqrtf.exit93
+  %663 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %601), !dbg !52
+  br label %__nv_rsqrtf.exit96, !dbg !52
+
+664:                                              ; preds = %__nv_rsqrtf.exit93
+  %665 = tail call float @llvm.nvvm.rsqrt.approx.f(float %601), !dbg !52
+  br label %__nv_rsqrtf.exit96, !dbg !52
+
+__nv_rsqrtf.exit96:                               ; preds = %662, %664
+  %.0.i95 = phi float [ %663, %662 ], [ %665, %664 ], !dbg !52
+  %666 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52
+  %.not.i97 = icmp eq i32 %666, 0, !dbg !52
+  br i1 %.not.i97, label %669, label %667, !dbg !52
+
+667:                                              ; preds = %__nv_rsqrtf.exit96
+  %668 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %602), !dbg !52
+  br label %__nv_rsqrtf.exit99, !dbg !52
+
+669:                                              ; preds = %__nv_rsqrtf.exit96
+  %670 = tail call float @llvm.nvvm.rsqrt.approx.f(float %602), !dbg !52
+  br label %__nv_rsqrtf.exit99, !dbg !52
+
+__nv_rsqrtf.exit99:                               ; preds = %667, %669
+  %.0.i98 = phi float [ %668, %667 ], [ %670, %669 ], !dbg !52
+  %671 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52
+  %.not.i100 = icmp eq i32 %671, 0, !dbg !52
+  br i1 %.not.i100, label %674, label %672, !dbg !52
+
+672:                                              ; preds = %__nv_rsqrtf.exit99
+  %673 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %603), !dbg !52
+  br label %__nv_rsqrtf.exit102, !dbg !52
+
+674:                                              ; preds = %__nv_rsqrtf.exit99
+  %675 = tail call float @llvm.nvvm.rsqrt.approx.f(float %603), !dbg !52
+  br label %__nv_rsqrtf.exit102, !dbg !52
+
+__nv_rsqrtf.exit102:                              ; preds = %672, %674
+  %.0.i101 = phi float [ %673, %672 ], [ %675, %674 ], !dbg !52
+  %676 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52
+  %.not.i103 = icmp eq i32 %676, 0, !dbg !52
+  br i1 %.not.i103, label %679, label %677, !dbg !52
+
+677:                                              ; preds = %__nv_rsqrtf.exit102
+  %678 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %604), !dbg !52
+  br label %__nv_rsqrtf.exit105, !dbg !52
+
+679:                                              ; preds = %__nv_rsqrtf.exit102
+  %680 = tail call float @llvm.nvvm.rsqrt.approx.f(float %604), !dbg !52
+  br label %__nv_rsqrtf.exit105, !dbg !52
+
+__nv_rsqrtf.exit105:                              ; preds = %677, %679
+  %.0.i104 = phi float [ %678, %677 ], [ %680, %679 ], !dbg !52
+  %681 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !52
+  %.not.i106 = icmp eq i32 %681, 0, !dbg !52
+  br i1 %.not.i106, label %684, label %682, !dbg !52
+
+682:                                              ; preds = %__nv_rsqrtf.exit105
+  %683 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %605), !dbg !52
+  br label %__nv_rsqrtf.exit108, !dbg !52
+
+684:                                              ; preds = %__nv_rsqrtf.exit105
+  %685 = tail call float @llvm.nvvm.rsqrt.approx.f(float %605), !dbg !52
+  br label %__nv_rsqrtf.exit108, !dbg !52
+
+__nv_rsqrtf.exit108:                              ; preds = %682, %684
+  %.0.i107 = phi float [ %683, %682 ], [ %685, %684 ], !dbg !52
+  %686 = icmp slt i32 %23, 73728, !dbg !53
+  %687 = icmp slt i32 %24, 8192, !dbg !23
+  %688 = fpext bfloat %528 to float, !dbg !45
+  %689 = fpext bfloat %527 to float, !dbg !45
+  %690 = fpext bfloat %526 to float, !dbg !45
+  %691 = fpext bfloat %525 to float, !dbg !45
+  %692 = fpext bfloat %524 to float, !dbg !45
+  %693 = fpext bfloat %523 to float, !dbg !45
+  %694 = fpext bfloat %522 to float, !dbg !45
+  %695 = fpext bfloat %521 to float, !dbg !45
+  %696 = fpext bfloat %520 to float, !dbg !45
+  %697 = fpext bfloat %519 to float, !dbg !45
+  %698 = fpext bfloat %518 to float, !dbg !45
+  %699 = fpext bfloat %517 to float, !dbg !45
+  %700 = fpext bfloat %516 to float, !dbg !45
+  %701 = fpext bfloat %515 to float, !dbg !45
+  %702 = fpext bfloat %514 to float, !dbg !45
+  %703 = fpext bfloat %513 to float, !dbg !45
+  %704 = extractvalue { i32, i32, i32, i32 } %455, 3, !dbg !40
+  %705 = bitcast i32 %704 to <2 x bfloat>, !dbg !40
+  %706 = extractvalue { i32, i32, i32, i32 } %455, 2, !dbg !40
+  %707 = bitcast i32 %706 to <2 x bfloat>, !dbg !40
+  %708 = extractvalue { i32, i32, i32, i32 } %455, 1, !dbg !40
+  %709 = bitcast i32 %708 to <2 x bfloat>, !dbg !40
+  %710 = extractvalue { i32, i32, i32, i32 } %455, 0, !dbg !40
+  %711 = bitcast i32 %710 to <2 x bfloat>, !dbg !40
+  %712 = extractvalue { i32, i32, i32, i32 } %453, 3, !dbg !40
+  %713 = bitcast i32 %712 to <2 x bfloat>, !dbg !40
+  %714 = extractvalue { i32, i32, i32, i32 } %453, 2, !dbg !40
+  %715 = bitcast i32 %714 to <2 x bfloat>, !dbg !40
+  %716 = extractvalue { i32, i32, i32, i32 } %453, 1, !dbg !40
+  %717 = bitcast i32 %716 to <2 x bfloat>, !dbg !40
+  %718 = extractvalue { i32, i32, i32, i32 } %453, 0, !dbg !40
+  %719 = bitcast i32 %718 to <2 x bfloat>, !dbg !40
+  %720 = fmul float %.0.i62, %703, !dbg !54
+  %721 = fmul float %.0.i65, %702, !dbg !54
+  %722 = fmul float %.0.i68, %701, !dbg !54
+  %723 = fmul float %.0.i71, %700, !dbg !54
+  %724 = fmul float %.0.i74, %699, !dbg !54
+  %725 = fmul float %.0.i77, %698, !dbg !54
+  %726 = fmul float %.0.i80, %697, !dbg !54
+  %727 = fmul float %.0.i83, %696, !dbg !54
+  %728 = fmul float %.0.i86, %695, !dbg !54
+  %729 = fmul float %.0.i89, %694, !dbg !54
+  %730 = fmul float %.0.i92, %693, !dbg !54
+  %731 = fmul float %.0.i95, %692, !dbg !54
+  %732 = fmul float %.0.i98, %691, !dbg !54
+  %733 = fmul float %.0.i101, %690, !dbg !54
+  %734 = fmul float %.0.i104, %689, !dbg !54
+  %735 = fmul float %.0.i107, %688, !dbg !54
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54
+  store float %720, ptr addrspace(3) %376, align 4, !dbg !54
+  store float %721, ptr addrspace(3) %378, align 4, !dbg !54
+  store float %722, ptr addrspace(3) %380, align 4, !dbg !54
+  store float %723, ptr addrspace(3) %382, align 4, !dbg !54
+  store float %724, ptr addrspace(3) %384, align 4, !dbg !54
+  store float %725, ptr addrspace(3) %386, align 4, !dbg !54
+  store float %726, ptr addrspace(3) %388, align 4, !dbg !54
+  store float %727, ptr addrspace(3) %390, align 4, !dbg !54
+  store float %728, ptr addrspace(3) %392, align 4, !dbg !54
+  store float %729, ptr addrspace(3) %394, align 4, !dbg !54
+  store float %730, ptr addrspace(3) %396, align 4, !dbg !54
+  store float %731, ptr addrspace(3) %398, align 4, !dbg !54
+  store float %732, ptr addrspace(3) %400, align 4, !dbg !54
+  store float %733, ptr addrspace(3) %402, align 4, !dbg !54
+  store float %734, ptr addrspace(3) %404, align 4, !dbg !54
+  store float %735, ptr addrspace(3) %406, align 4, !dbg !54
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54
+  %736 = load float, ptr addrspace(3) %415, align 4, !dbg !54
+  %737 = load float, ptr addrspace(3) %417, align 4, !dbg !54
+  %738 = load float, ptr addrspace(3) %419, align 4, !dbg !54
+  %739 = load float, ptr addrspace(3) %421, align 4, !dbg !54
+  %740 = load float, ptr addrspace(3) %424, align 4, !dbg !54
+  %741 = load float, ptr addrspace(3) %426, align 4, !dbg !54
+  %742 = load float, ptr addrspace(3) %428, align 4, !dbg !54
+  %743 = load float, ptr addrspace(3) %430, align 4, !dbg !54
+  %744 = load float, ptr addrspace(3) %433, align 4, !dbg !54
+  %745 = load float, ptr addrspace(3) %435, align 4, !dbg !54
+  %746 = load float, ptr addrspace(3) %437, align 4, !dbg !54
+  %747 = load float, ptr addrspace(3) %439, align 4, !dbg !54
+  %748 = load float, ptr addrspace(3) %442, align 4, !dbg !54
+  %749 = load float, ptr addrspace(3) %444, align 4, !dbg !54
+  %750 = load float, ptr addrspace(3) %446, align 4, !dbg !54
+  %751 = load float, ptr addrspace(3) %448, align 4, !dbg !54
+  %752 = getelementptr bfloat, ptr addrspace(1) %5, i64 %450, !dbg !55
+  %753 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !56
+  %754 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %752, i64 %753, i1 %464) #5, !dbg !56
+  %755 = extractvalue { i32, i32, i32, i32 } %754, 0, !dbg !56
+  %756 = bitcast i32 %755 to <2 x bfloat>, !dbg !56
+  %757 = extractvalue { i32, i32, i32, i32 } %754, 1, !dbg !56
+  %758 = bitcast i32 %757 to <2 x bfloat>, !dbg !56
+  %759 = extractvalue { i32, i32, i32, i32 } %754, 2, !dbg !56
+  %760 = bitcast i32 %759 to <2 x bfloat>, !dbg !56
+  %761 = extractvalue { i32, i32, i32, i32 } %754, 3, !dbg !56
+  %762 = bitcast i32 %761 to <2 x bfloat>, !dbg !56
+  %763 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !56
+  %764 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %752, i64 %763, i1 %467) #5, !dbg !56
+  %765 = extractvalue { i32, i32, i32, i32 } %764, 0, !dbg !56
+  %766 = bitcast i32 %765 to <2 x bfloat>, !dbg !56
+  %767 = extractvalue { i32, i32, i32, i32 } %764, 1, !dbg !56
+  %768 = bitcast i32 %767 to <2 x bfloat>, !dbg !56
+  %769 = extractvalue { i32, i32, i32, i32 } %764, 2, !dbg !56
+  %770 = bitcast i32 %769 to <2 x bfloat>, !dbg !56
+  %771 = extractvalue { i32, i32, i32, i32 } %764, 3, !dbg !56
+  %772 = bitcast i32 %771 to <2 x bfloat>, !dbg !56
+  %773 = shl i32 %23, 7, !dbg !57
+  %774 = shl i32 %24, 7, !dbg !57
+  %775 = add i32 %773, %32, !dbg !58
+  %776 = add i32 %774, %32, !dbg !58
+  %777 = sext i32 %775 to i64, !dbg !59
+  %778 = getelementptr bfloat, ptr addrspace(1) %6, i64 %777, !dbg !59
+  %779 = sext i32 %776 to i64, !dbg !59
+  %780 = getelementptr bfloat, ptr addrspace(1) %6, i64 %779, !dbg !59
+  %781 = and i1 %34, %686, !dbg !60
+  %782 = fpext <2 x bfloat> %719 to <2 x float>, !dbg !61
+  %783 = insertelement <2 x float> poison, float %416, i64 0, !dbg !62
+  %784 = insertelement <2 x float> %783, float %425, i64 1, !dbg !62
+  %785 = fmul <2 x float> %784, %782, !dbg !62
+  %786 = fpext <2 x bfloat> %756 to <2 x float>, !dbg !63
+  %787 = insertelement <2 x float> poison, float %736, i64 0, !dbg !64
+  %788 = insertelement <2 x float> %787, float %740, i64 1, !dbg !64
+  %789 = fmul <2 x float> %788, %786, !dbg !64
+  %790 = insertelement <2 x i1> poison, i1 %41, i64 0, !dbg !65
+  %791 = shufflevector <2 x i1> %790, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !65
+  %792 = select <2 x i1> %791, <2 x float> %785, <2 x float> %789, !dbg !65
+  %793 = fptrunc <2 x float> %792 to <2 x bfloat>, !dbg !66
+  %794 = fpext <2 x bfloat> %717 to <2 x float>, !dbg !61
+  %795 = insertelement <2 x float> poison, float %418, i64 0, !dbg !62
+  %796 = insertelement <2 x float> %795, float %427, i64 1, !dbg !62
+  %797 = fmul <2 x float> %796, %794, !dbg !62
+  %798 = fpext <2 x bfloat> %758 to <2 x float>, !dbg !63
+  %799 = insertelement <2 x float> poison, float %737, i64 0, !dbg !64
+  %800 = insertelement <2 x float> %799, float %741, i64 1, !dbg !64
+  %801 = fmul <2 x float> %800, %798, !dbg !64
+  %802 = select <2 x i1> %791, <2 x float> %797, <2 x float> %801, !dbg !65
+  %803 = fptrunc <2 x float> %802 to <2 x bfloat>, !dbg !66
+  %804 = fpext <2 x bfloat> %715 to <2 x float>, !dbg !61
+  %805 = insertelement <2 x float> poison, float %420, i64 0, !dbg !62
+  %806 = insertelement <2 x float> %805, float %429, i64 1, !dbg !62
+  %807 = fmul <2 x float> %806, %804, !dbg !62
+  %808 = fpext <2 x bfloat> %760 to <2 x float>, !dbg !63
+  %809 = insertelement <2 x float> poison, float %738, i64 0, !dbg !64
+  %810 = insertelement <2 x float> %809, float %742, i64 1, !dbg !64
+  %811 = fmul <2 x float> %810, %808, !dbg !64
+  %812 = select <2 x i1> %791, <2 x float> %807, <2 x float> %811, !dbg !65
+  %813 = fptrunc <2 x float> %812 to <2 x bfloat>, !dbg !66
+  %814 = fpext <2 x bfloat> %713 to <2 x float>, !dbg !61
+  %815 = insertelement <2 x float> poison, float %422, i64 0, !dbg !62
+  %816 = insertelement <2 x float> %815, float %431, i64 1, !dbg !62
+  %817 = fmul <2 x float> %816, %814, !dbg !62
+  %818 = fpext <2 x bfloat> %762 to <2 x float>, !dbg !63
+  %819 = insertelement <2 x float> poison, float %739, i64 0, !dbg !64
+  %820 = insertelement <2 x float> %819, float %743, i64 1, !dbg !64
+  %821 = fmul <2 x float> %820, %818, !dbg !64
+  %822 = select <2 x i1> %791, <2 x float> %817, <2 x float> %821, !dbg !65
+  %823 = fptrunc <2 x float> %822 to <2 x bfloat>, !dbg !66
+  %824 = fpext <2 x bfloat> %711 to <2 x float>, !dbg !61
+  %825 = insertelement <2 x float> poison, float %434, i64 0, !dbg !62
+  %826 = insertelement <2 x float> %825, float %443, i64 1, !dbg !62
+  %827 = fmul <2 x float> %826, %824, !dbg !62
+  %828 = fpext <2 x bfloat> %766 to <2 x float>, !dbg !63
+  %829 = insertelement <2 x float> poison, float %744, i64 0, !dbg !64
+  %830 = insertelement <2 x float> %829, float %748, i64 1, !dbg !64
+  %831 = fmul <2 x float> %830, %828, !dbg !64
+  %832 = insertelement <2 x i1> poison, i1 %687, i64 0, !dbg !65
+  %833 = shufflevector <2 x i1> %832, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !65
+  %834 = select <2 x i1> %833, <2 x float> %827, <2 x float> %831, !dbg !65
+  %835 = fptrunc <2 x float> %834 to <2 x bfloat>, !dbg !66
+  %836 = fpext <2 x bfloat> %709 to <2 x float>, !dbg !61
+  %837 = insertelement <2 x float> poison, float %436, i64 0, !dbg !62
+  %838 = insertelement <2 x float> %837, float %445, i64 1, !dbg !62
+  %839 = fmul <2 x float> %838, %836, !dbg !62
+  %840 = fpext <2 x bfloat> %768 to <2 x float>, !dbg !63
+  %841 = insertelement <2 x float> poison, float %745, i64 0, !dbg !64
+  %842 = insertelement <2 x float> %841, float %749, i64 1, !dbg !64
+  %843 = fmul <2 x float> %842, %840, !dbg !64
+  %844 = select <2 x i1> %833, <2 x float> %839, <2 x float> %843, !dbg !65
+  %845 = fptrunc <2 x float> %844 to <2 x bfloat>, !dbg !66
+  %846 = fpext <2 x bfloat> %707 to <2 x float>, !dbg !61
+  %847 = insertelement <2 x float> poison, float %438, i64 0, !dbg !62
+  %848 = insertelement <2 x float> %847, float %447, i64 1, !dbg !62
+  %849 = fmul <2 x float> %848, %846, !dbg !62
+  %850 = fpext <2 x bfloat> %770 to <2 x float>, !dbg !63
+  %851 = insertelement <2 x float> poison, float %746, i64 0, !dbg !64
+  %852 = insertelement <2 x float> %851, float %750, i64 1, !dbg !64
+  %853 = fmul <2 x float> %852, %850, !dbg !64
+  %854 = select <2 x i1> %833, <2 x float> %849, <2 x float> %853, !dbg !65
+  %855 = fptrunc <2 x float> %854 to <2 x bfloat>, !dbg !66
+  %856 = fpext <2 x bfloat> %705 to <2 x float>, !dbg !61
+  %857 = insertelement <2 x float> poison, float %440, i64 0, !dbg !62
+  %858 = insertelement <2 x float> %857, float %449, i64 1, !dbg !62
+  %859 = fmul <2 x float> %858, %856, !dbg !62
+  %860 = fpext <2 x bfloat> %772 to <2 x float>, !dbg !63
+  %861 = insertelement <2 x float> poison, float %747, i64 0, !dbg !64
+  %862 = insertelement <2 x float> %861, float %751, i64 1, !dbg !64
+  %863 = fmul <2 x float> %862, %860, !dbg !64
+  %864 = select <2 x i1> %833, <2 x float> %859, <2 x float> %863, !dbg !65
+  %865 = fptrunc <2 x float> %864 to <2 x bfloat>, !dbg !66
+  %866 = bitcast <2 x bfloat> %793 to i32, !dbg !66
+  %867 = bitcast <2 x bfloat> %803 to i32, !dbg !66
+  %868 = bitcast <2 x bfloat> %813 to i32, !dbg !66
+  %869 = bitcast <2 x bfloat> %823 to i32, !dbg !66
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %866, i32 %867, i32 %868, i32 %869, ptr addrspace(1) %778, i1 %781) #5, !dbg !66
+  %870 = bitcast <2 x bfloat> %835 to i32, !dbg !66
+  %871 = bitcast <2 x bfloat> %845 to i32, !dbg !66
+  %872 = bitcast <2 x bfloat> %855 to i32, !dbg !66
+  %873 = bitcast <2 x bfloat> %865 to i32, !dbg !66
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %870, i32 %871, i32 %872, i32 %873, ptr addrspace(1) %780, i1 %781) #5, !dbg !66
+  ret void, !dbg !67
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #3
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #3
+
+attributes #0 = { nounwind "nvvm.reqntid"="256" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind }
+attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!5 = distinct !DISubprogram(name: "triton_poi_fused__fused_rms_norm_cat_view_2", linkageName: "triton_poi_fused__fused_rms_norm_cat_view_2", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 21, column: 29, scope: !5)
+!9 = !DILocation(line: 21, column: 48, scope: !5)
+!10 = !DILocation(line: 21, column: 69, scope: !5)
+!11 = !DILocation(line: 21, column: 53, scope: !5)
+!12 = !DILocation(line: 21, column: 34, scope: !5)
+!13 = !DILocation(line: 21, column: 75, scope: !5)
+!14 = !DILocation(line: 22, column: 44, scope: !5)
+!15 = !DILocation(line: 22, column: 23, scope: !5)
+!16 = !DILocation(line: 24, column: 28, scope: !5)
+!17 = !DILocation(line: 24, column: 33, scope: !5)
+!18 = !DILocation(line: 25, column: 44, scope: !5)
+!19 = !DILocation(line: 25, column: 23, scope: !5)
+!20 = !DILocation(line: 26, column: 21, scope: !5)
+!21 = !DILocation(line: 27, column: 19, scope: !5)
+!22 = !DILocation(line: 29, column: 19, scope: !5)
+!23 = !DILocation(line: 35, column: 18, scope: !5)
+!24 = !DILocation(line: 36, column: 39, scope: !5)
+!25 = !DILocation(line: 36, column: 35, scope: !5)
+!26 = !DILocation(line: 36, column: 51, scope: !5)
+!27 = !DILocation(line: 36, column: 44, scope: !5)
+!28 = !DILocation(line: 36, column: 30, scope: !5)
+!29 = !DILocation(line: 36, column: 64, scope: !5)
+!30 = !DILocation(line: 36, column: 72, scope: !5)
+!31 = !DILocation(line: 36, column: 57, scope: !5)
+!32 = !DILocation(line: 36, column: 123, scope: !5)
+!33 = !DILocation(line: 38, column: 30, scope: !5)
+!34 = !DILocation(line: 38, column: 80, scope: !5)
+!35 = !DILocation(line: 40, column: 19, scope: !5)
+!36 = !DILocation(line: 42, column: 19, scope: !5)
+!37 = !DILocation(line: 43, column: 28, scope: !5)
+!38 = !DILocation(line: 44, column: 19, scope: !5)
+!39 = !DILocation(line: 45, column: 31, scope: !5)
+!40 = !DILocation(line: 45, column: 71, scope: !5)
+!41 = !DILocation(line: 54, column: 45, scope: !5)
+!42 = !DILocation(line: 54, column: 31, scope: !5)
+!43 = !DILocation(line: 54, column: 83, scope: !5)
+!44 = !DILocation(line: 54, column: 67, scope: !5)
+!45 = !DILocation(line: 54, column: 134, scope: !5)
+!46 = !DILocation(line: 56, column: 56, scope: !5)
+!47 = !DILocation(line: 56, column: 52, scope: !5)
+!48 = !DILocation(line: 56, column: 31, scope: !5)
+!49 = !DILocation(line: 56, column: 90, scope: !5)
+!50 = !DILocation(line: 58, column: 21, scope: !5)
+!51 = !DILocation(line: 60, column: 20, scope: !5)
+!52 = !DILocation(line: 61, column: 28, scope: !5)
+!53 = !DILocation(line: 23, column: 21, scope: !5)
+!54 = !DILocation(line: 62, column: 20, scope: !5)
+!55 = !DILocation(line: 63, column: 31, scope: !5)
+!56 = !DILocation(line: 63, column: 71, scope: !5)
+!57 = !DILocation(line: 70, column: 34, scope: !5)
+!58 = !DILocation(line: 70, column: 30, scope: !5)
+!59 = !DILocation(line: 70, column: 25, scope: !5)
+!60 = !DILocation(line: 70, column: 54, scope: !5)
+!61 = !DILocation(line: 45, column: 137, scope: !5)
+!62 = !DILocation(line: 47, column: 20, scope: !5)
+!63 = !DILocation(line: 63, column: 138, scope: !5)
+!64 = !DILocation(line: 65, column: 20, scope: !5)
+!65 = !DILocation(line: 0, scope: !5)
+!66 = !DILocation(line: 70, column: 46, scope: !5)
+!67 = !DILocation(line: 70, column: 4, scope: !5)
diff --git a/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.ptx b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..2f54e7f201a63fcb6f52e7a2e5d62b81556c6ed4
--- /dev/null
+++ b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.ptx
@@ -0,0 +1,1160 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused__fused_rms_norm_cat_view_2 // -- Begin function triton_poi_fused__fused_rms_norm_cat_view_2
+.extern .shared .align 16 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
+                                        // @triton_poi_fused__fused_rms_norm_cat_view_2
+.visible .entry triton_poi_fused__fused_rms_norm_cat_view_2(
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_1,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_4,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_5,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_6,
+	.param .u32 triton_poi_fused__fused_rms_norm_cat_view_2_param_7,
+	.param .u32 triton_poi_fused__fused_rms_norm_cat_view_2_param_8,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_9,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_10
+)
+.reqntid 256
+{
+	.reg .pred 	%p<17>;
+	.reg .b16 	%rs<65>;
+	.reg .b32 	%r<520>;
+	.reg .b64 	%rd<35>;
+	.loc	1 18 0                          // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:18:0
+
+// %bb.0:                               // %__nv_rsqrtf.exit
+	ld.param.b64 	%rd27, [triton_poi_fused__fused_rms_norm_cat_view_2_param_0];
+	ld.param.b64 	%rd28, [triton_poi_fused__fused_rms_norm_cat_view_2_param_1];
+$L__tmp0:
+	.loc	1 21 29                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:29
+	mov.u32 	%r74, %ctaid.y;
+	ld.param.b64 	%rd29, [triton_poi_fused__fused_rms_norm_cat_view_2_param_2];
+	.loc	1 21 48                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:48
+	mov.u32 	%r75, %ctaid.z;
+	ld.param.b64 	%rd30, [triton_poi_fused__fused_rms_norm_cat_view_2_param_3];
+	.loc	1 21 69                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:69
+	mov.u32 	%r76, %nctaid.y;
+	ld.param.b64 	%rd31, [triton_poi_fused__fused_rms_norm_cat_view_2_param_4];
+	.loc	1 21 34                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:34
+	mad.lo.s32 	%r77, %r75, %r76, %r74;
+	ld.param.b64 	%rd32, [triton_poi_fused__fused_rms_norm_cat_view_2_param_5];
+	.loc	1 21 75                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:75
+	shl.b32 	%r78, %r77, 6;
+	ld.param.b64 	%rd33, [triton_poi_fused__fused_rms_norm_cat_view_2_param_6];
+	.loc	1 22 44                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:44
+	mov.u32 	%r79, %tid.x;
+	bfe.u32 	%r80, %r79, 3, 5;
+	shl.b32 	%r81, %r79, 2;
+	and.b32 	%r82, %r81, 60;
+	.loc	1 22 23                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:23
+	or.b32 	%r83, %r78, %r80;
+	or.b32 	%r84, %r83, 32;
+	or.b32 	%r85, %r78, %r82;
+	.loc	1 24 28                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:24:28
+	mov.u32 	%r86, %ctaid.x;
+	.loc	1 24 33                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:24:33
+	shl.b32 	%r87, %r86, 6;
+	.loc	1 25 44                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:25:44
+	and.b32 	%r88, %r79, 7;
+	shl.b32 	%r89, %r88, 3;
+	shr.u32 	%r90, %r79, 4;
+	bfe.u32 	%r91, %r79, 4, 4;
+	.loc	1 25 23                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:25:23
+	or.b32 	%r92, %r89, %r87;
+	or.b32 	%r93, %r91, %r87;
+	.loc	1 26 21                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:26:21
+	setp.lt.s32 	%p8, %r92, 128;
+	setp.lt.s32 	%p9, %r93, 128;
+	.loc	1 27 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:27:19
+	bfe.s32 	%r94, %r77, 25, 1;
+	shr.u32 	%r95, %r94, 27;
+	add.s32 	%r96, %r83, %r95;
+	shr.u32 	%r97, %r96, 5;
+	add.s32 	%r98, %r84, %r95;
+	shr.u32 	%r99, %r98, 5;
+	.loc	1 29 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:29:19
+	and.b32 	%r100, %r96, 33554400;
+	sub.s32 	%r101, %r83, %r100;
+	.loc	1 35 18                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:35:18
+	setp.lt.s32 	%p10, %r83, 8192;
+	setp.lt.s32 	%p11, %r85, 8192;
+	.loc	1 36 39                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:39
+	shl.b32 	%r102, %r101, 7;
+	.loc	1 36 35                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:35
+	add.s32 	%r103, %r102, %r92;
+	.loc	1 36 44                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:44
+	mad.lo.s32 	%r104, %r97, 12288, %r103;
+	mad.lo.s32 	%r105, %r99, 12288, %r103;
+	.loc	1 36 30                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:30
+	mad.wide.s32 	%rd1, %r104, 2, %rd27;
+	mad.wide.s32 	%rd3, %r105, 2, %rd27;
+	.loc	1 36 64                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:64
+	and.pred 	%p1, %p8, %p10;
+	and.pred 	%p3, %p9, %p11;
+	.loc	1 36 72                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:72
+	setp.lt.s32 	%p12, %r83, 8160;
+	and.pred 	%p2, %p8, %p12;
+	.loc	1 36 57                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:57
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r5, 0;
+	// begin inline asm
+	mov.u32 %r1, %r5;
+	mov.u32 %r2, %r5;
+	mov.u32 %r3, %r5;
+	mov.u32 %r4, %r5;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	prmt.b32 	%r106, %r1, %r2, 0x7632U;
+	prmt.b32 	%r107, %r3, %r4, 0x7632U;
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r6, %r5;
+	mov.u32 %r7, %r5;
+	mov.u32 %r8, %r5;
+	mov.u32 %r9, %r5;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd3 + 0 ], %rd4;
+	// end inline asm
+	prmt.b32 	%r108, %r6, %r7, 0x7632U;
+	prmt.b32 	%r109, %r8, %r9, 0x7632U;
+	.loc	1 36 123                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:123
+	and.b32 	%r110, %r79, 24;
+	shl.b32 	%r111, %r110, 5;
+	shl.b32 	%r112, %r88, 4;
+	shr.u32 	%r113, %r110, 1;
+	and.b32 	%r114, %r79, 96;
+	shr.u32 	%r115, %r114, 3;
+	and.b32 	%r116, %r79, 128;
+	bfe.s32 	%r117, %r79, 7, 1;
+	and.b32 	%r118, %r117, 1040;
+	xor.b32 	%r119, %r113, %r115;
+	or.b32 	%r120, %r119, %r111;
+	or.b32 	%r121, %r120, %r112;
+	xor.b32 	%r122, %r121, %r118;
+	mov.b32 	%r123, global_smem;
+	add.s32 	%r124, %r123, %r122;
+	prmt.b32 	%r125, %r1, %r2, 0x5410U;
+	st.shared.b32 	[%r124], %r125;
+	prmt.b32 	%r126, %r3, %r4, 0x5410U;
+	st.shared.b32 	[%r124+128], %r126;
+	xor.b32 	%r127, %r122, 64;
+	add.s32 	%r128, %r123, %r127;
+	st.shared.b32 	[%r128+4096], %r106;
+	st.shared.b32 	[%r128+4224], %r107;
+	xor.b32 	%r129, %r122, 32;
+	add.s32 	%r130, %r123, %r129;
+	prmt.b32 	%r131, %r6, %r7, 0x5410U;
+	st.shared.b32 	[%r130+2048], %r131;
+	prmt.b32 	%r132, %r8, %r9, 0x5410U;
+	st.shared.b32 	[%r130+2176], %r132;
+	xor.b32 	%r133, %r122, 96;
+	add.s32 	%r134, %r123, %r133;
+	st.shared.b32 	[%r134+6144], %r108;
+	st.shared.b32 	[%r134+6272], %r109;
+	bar.sync 	0;
+	and.b32 	%r135, %r79, 28;
+	shl.b32 	%r136, %r135, 8;
+	and.b32 	%r137, %r81, 124;
+	and.b32 	%r138, %r90, 2;
+	shl.b32 	%r139, %r79, 1;
+	and.b32 	%r140, %r139, 128;
+	shr.u32 	%r141, %r116, 3;
+	or.b32 	%r142, %r138, %r140;
+	or.b32 	%r143, %r136, %r137;
+	xor.b32 	%r144, %r143, %r141;
+	or.b32 	%r145, %r142, %r144;
+	add.s32 	%r146, %r123, %r145;
+	ld.shared.b16 	%rs1, [%r146];
+	xor.b32 	%r147, %r145, 4;
+	add.s32 	%r148, %r123, %r147;
+	ld.shared.b16 	%rs2, [%r148+256];
+	xor.b32 	%r149, %r145, 8;
+	add.s32 	%r150, %r123, %r149;
+	ld.shared.b16 	%rs3, [%r150+512];
+	xor.b32 	%r151, %r145, 12;
+	add.s32 	%r152, %r123, %r151;
+	ld.shared.b16 	%rs4, [%r152+768];
+	xor.b32 	%r153, %r145, 32;
+	add.s32 	%r154, %r123, %r153;
+	ld.shared.b16 	%rs5, [%r154];
+	xor.b32 	%r155, %r145, 36;
+	add.s32 	%r156, %r123, %r155;
+	ld.shared.b16 	%rs6, [%r156+256];
+	xor.b32 	%r157, %r145, 40;
+	add.s32 	%r158, %r123, %r157;
+	ld.shared.b16 	%rs7, [%r158+512];
+	xor.b32 	%r159, %r145, 44;
+	add.s32 	%r160, %r123, %r159;
+	ld.shared.b16 	%rs8, [%r160+768];
+	xor.b32 	%r161, %r145, 64;
+	add.s32 	%r162, %r123, %r161;
+	ld.shared.b16 	%rs9, [%r162];
+	xor.b32 	%r163, %r145, 68;
+	add.s32 	%r164, %r123, %r163;
+	ld.shared.b16 	%rs10, [%r164+256];
+	xor.b32 	%r165, %r145, 72;
+	add.s32 	%r166, %r123, %r165;
+	ld.shared.b16 	%rs11, [%r166+512];
+	xor.b32 	%r167, %r145, 76;
+	add.s32 	%r168, %r123, %r167;
+	ld.shared.b16 	%rs12, [%r168+768];
+	xor.b32 	%r169, %r145, 96;
+	add.s32 	%r170, %r123, %r169;
+	ld.shared.b16 	%rs13, [%r170];
+	xor.b32 	%r171, %r145, 100;
+	add.s32 	%r172, %r123, %r171;
+	ld.shared.b16 	%rs14, [%r172+256];
+	xor.b32 	%r173, %r145, 104;
+	add.s32 	%r174, %r123, %r173;
+	ld.shared.b16 	%rs15, [%r174+512];
+	xor.b32 	%r175, %r145, 108;
+	add.s32 	%r176, %r123, %r175;
+	ld.shared.b16 	%rs16, [%r176+768];
+	cvt.f32.bf16 	%r177, %rs1;
+	cvt.f32.bf16 	%r178, %rs2;
+	cvt.f32.bf16 	%r179, %rs3;
+	cvt.f32.bf16 	%r180, %rs4;
+	cvt.f32.bf16 	%r181, %rs5;
+	cvt.f32.bf16 	%r182, %rs6;
+	cvt.f32.bf16 	%r183, %rs7;
+	cvt.f32.bf16 	%r184, %rs8;
+	cvt.f32.bf16 	%r185, %rs9;
+	cvt.f32.bf16 	%r186, %rs10;
+	cvt.f32.bf16 	%r187, %rs11;
+	cvt.f32.bf16 	%r188, %rs12;
+	cvt.f32.bf16 	%r189, %rs13;
+	cvt.f32.bf16 	%r190, %rs14;
+	cvt.f32.bf16 	%r191, %rs15;
+	cvt.f32.bf16 	%r192, %rs16;
+	.loc	1 38 30                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:38:30
+	mad.wide.s32 	%rd5, %r85, 4, %rd28;
+	.loc	1 38 80                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:38:80
+	// begin inline asm
+	mov.u64 %rd6, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r10, %r5;
+	mov.u32 %r11, %r5;
+	mov.u32 %r12, %r5;
+	mov.u32 %r13, %r5;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd5 + 0 ], %rd6;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd7, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd7, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r14, %r5;
+	mov.u32 %r15, %r5;
+	mov.u32 %r16, %r5;
+	mov.u32 %r17, %r5;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd5 + 0 ], %rd7;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd8, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd8, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r18, %r5;
+	mov.u32 %r19, %r5;
+	mov.u32 %r20, %r5;
+	mov.u32 %r21, %r5;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd5 + 0 ], %rd8;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd9, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd9, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r22, %r5;
+	mov.u32 %r23, %r5;
+	mov.u32 %r24, %r5;
+	mov.u32 %r25, %r5;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r22, %r23, %r24, %r25 }, [ %rd5 + 0 ], %rd9;
+	// end inline asm
+	mov.b32 	%r193, 0f43000000;
+	.loc	1 40 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:40:19
+	div.full.f32 	%r194, %r10, %r193;
+	div.full.f32 	%r195, %r11, %r193;
+	div.full.f32 	%r196, %r12, %r193;
+	div.full.f32 	%r197, %r13, %r193;
+	div.full.f32 	%r198, %r14, %r193;
+	div.full.f32 	%r199, %r15, %r193;
+	div.full.f32 	%r200, %r16, %r193;
+	div.full.f32 	%r201, %r17, %r193;
+	div.full.f32 	%r202, %r18, %r193;
+	div.full.f32 	%r203, %r19, %r193;
+	div.full.f32 	%r204, %r20, %r193;
+	div.full.f32 	%r205, %r21, %r193;
+	div.full.f32 	%r206, %r22, %r193;
+	div.full.f32 	%r207, %r23, %r193;
+	div.full.f32 	%r208, %r24, %r193;
+	div.full.f32 	%r209, %r25, %r193;
+	.loc	1 42 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:42:19
+	add.f32 	%r210, %r194, 0f358637BD;
+	add.f32 	%r211, %r195, 0f358637BD;
+	add.f32 	%r212, %r196, 0f358637BD;
+	add.f32 	%r213, %r197, 0f358637BD;
+	add.f32 	%r214, %r198, 0f358637BD;
+	add.f32 	%r215, %r199, 0f358637BD;
+	add.f32 	%r216, %r200, 0f358637BD;
+	add.f32 	%r217, %r201, 0f358637BD;
+	add.f32 	%r218, %r202, 0f358637BD;
+	add.f32 	%r219, %r203, 0f358637BD;
+	add.f32 	%r220, %r204, 0f358637BD;
+	add.f32 	%r221, %r205, 0f358637BD;
+	add.f32 	%r222, %r206, 0f358637BD;
+	add.f32 	%r223, %r207, 0f358637BD;
+	add.f32 	%r224, %r208, 0f358637BD;
+	add.f32 	%r225, %r209, 0f358637BD;
+	.loc	1 43 28                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:43:28
+	rsqrt.approx.ftz.f32 	%r226, %r210;
+	rsqrt.approx.ftz.f32 	%r227, %r211;
+	rsqrt.approx.ftz.f32 	%r228, %r212;
+	rsqrt.approx.ftz.f32 	%r229, %r213;
+	rsqrt.approx.ftz.f32 	%r230, %r214;
+	rsqrt.approx.ftz.f32 	%r231, %r215;
+	rsqrt.approx.ftz.f32 	%r232, %r216;
+	rsqrt.approx.ftz.f32 	%r233, %r217;
+	rsqrt.approx.ftz.f32 	%r234, %r218;
+	rsqrt.approx.ftz.f32 	%r235, %r219;
+	rsqrt.approx.ftz.f32 	%r236, %r220;
+	rsqrt.approx.ftz.f32 	%r237, %r221;
+	rsqrt.approx.ftz.f32 	%r238, %r222;
+	rsqrt.approx.ftz.f32 	%r239, %r223;
+	rsqrt.approx.ftz.f32 	%r240, %r224;
+	rsqrt.approx.ftz.f32 	%r241, %r225;
+	.loc	1 44 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:44:19
+	mul.f32 	%r242, %r226, %r177;
+	mul.f32 	%r243, %r227, %r178;
+	mul.f32 	%r244, %r228, %r179;
+	mul.f32 	%r245, %r229, %r180;
+	mul.f32 	%r246, %r230, %r181;
+	mul.f32 	%r247, %r231, %r182;
+	mul.f32 	%r248, %r232, %r183;
+	mul.f32 	%r249, %r233, %r184;
+	mul.f32 	%r250, %r234, %r185;
+	mul.f32 	%r251, %r235, %r186;
+	mul.f32 	%r252, %r236, %r187;
+	mul.f32 	%r253, %r237, %r188;
+	mul.f32 	%r254, %r238, %r189;
+	mul.f32 	%r255, %r239, %r190;
+	mul.f32 	%r256, %r240, %r191;
+	mul.f32 	%r257, %r241, %r192;
+	bar.sync 	0;
+	shl.b32 	%r258, %r135, 9;
+	shl.b32 	%r259, %r114, 2;
+	shr.u32 	%r260, %r79, 1;
+	and.b32 	%r261, %r260, 76;
+	or.b32 	%r262, %r258, %r112;
+	or.b32 	%r263, %r259, %r261;
+	xor.b32 	%r264, %r262, %r263;
+	add.s32 	%r265, %r123, %r264;
+	st.shared.b32 	[%r265], %r242;
+	xor.b32 	%r266, %r264, 16;
+	add.s32 	%r267, %r123, %r266;
+	st.shared.b32 	[%r267+512], %r243;
+	xor.b32 	%r268, %r264, 32;
+	add.s32 	%r269, %r123, %r268;
+	st.shared.b32 	[%r269+1024], %r244;
+	xor.b32 	%r270, %r264, 48;
+	add.s32 	%r271, %r123, %r270;
+	st.shared.b32 	[%r271+1536], %r245;
+	xor.b32 	%r272, %r264, 4;
+	add.s32 	%r273, %r123, %r272;
+	st.shared.b32 	[%r273], %r246;
+	xor.b32 	%r274, %r264, 20;
+	add.s32 	%r275, %r123, %r274;
+	st.shared.b32 	[%r275+512], %r247;
+	xor.b32 	%r276, %r264, 36;
+	add.s32 	%r277, %r123, %r276;
+	st.shared.b32 	[%r277+1024], %r248;
+	xor.b32 	%r278, %r264, 52;
+	add.s32 	%r279, %r123, %r278;
+	st.shared.b32 	[%r279+1536], %r249;
+	xor.b32 	%r280, %r264, 8;
+	add.s32 	%r281, %r123, %r280;
+	st.shared.b32 	[%r281], %r250;
+	xor.b32 	%r282, %r264, 24;
+	add.s32 	%r283, %r123, %r282;
+	st.shared.b32 	[%r283+512], %r251;
+	xor.b32 	%r284, %r264, 40;
+	add.s32 	%r285, %r123, %r284;
+	st.shared.b32 	[%r285+1024], %r252;
+	xor.b32 	%r286, %r264, 56;
+	add.s32 	%r287, %r123, %r286;
+	st.shared.b32 	[%r287+1536], %r253;
+	xor.b32 	%r288, %r264, 12;
+	add.s32 	%r289, %r123, %r288;
+	st.shared.b32 	[%r289], %r254;
+	xor.b32 	%r290, %r264, 28;
+	add.s32 	%r291, %r123, %r290;
+	st.shared.b32 	[%r291+512], %r255;
+	xor.b32 	%r292, %r264, 44;
+	add.s32 	%r293, %r123, %r292;
+	st.shared.b32 	[%r293+1024], %r256;
+	xor.b32 	%r294, %r264, 60;
+	add.s32 	%r295, %r123, %r294;
+	st.shared.b32 	[%r295+1536], %r257;
+	bar.sync 	0;
+	shl.b32 	%r296, %r79, 6;
+	and.b32 	%r297, %r296, 1600;
+	and.b32 	%r298, %r139, 60;
+	shr.u32 	%r299, %r114, 1;
+	and.b32 	%r300, %r117, 2112;
+	or.b32 	%r301, %r297, %r298;
+	or.b32 	%r302, %r300, %r299;
+	xor.b32 	%r303, %r302, %r301;
+	add.s32 	%r304, %r123, %r303;
+	ld.shared.b32 	%r305, [%r304];
+	ld.shared.b32 	%r306, [%r304+128];
+	ld.shared.b32 	%r307, [%r304+256];
+	ld.shared.b32 	%r308, [%r304+384];
+	xor.b32 	%r309, %r303, 8;
+	add.s32 	%r310, %r123, %r309;
+	ld.shared.b32 	%r311, [%r310+8192];
+	ld.shared.b32 	%r312, [%r310+8320];
+	ld.shared.b32 	%r313, [%r310+8448];
+	ld.shared.b32 	%r314, [%r310+8576];
+	xor.b32 	%r315, %r303, 4;
+	add.s32 	%r316, %r123, %r315;
+	ld.shared.b32 	%r317, [%r316+4096];
+	ld.shared.b32 	%r318, [%r316+4224];
+	ld.shared.b32 	%r319, [%r316+4352];
+	ld.shared.b32 	%r320, [%r316+4480];
+	xor.b32 	%r321, %r303, 12;
+	add.s32 	%r322, %r123, %r321;
+	ld.shared.b32 	%r323, [%r322+12288];
+	ld.shared.b32 	%r324, [%r322+12416];
+	ld.shared.b32 	%r325, [%r322+12544];
+	ld.shared.b32 	%r326, [%r322+12672];
+	.loc	1 45 31                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:31
+	mul.wide.s32 	%rd34, %r92, 2;
+	add.s64 	%rd10, %rd29, %rd34;
+	.loc	1 45 71                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:71
+	// begin inline asm
+	mov.u64 %rd11, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd11, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r26, %r5;
+	mov.u32 %r27, %r5;
+	mov.u32 %r28, %r5;
+	mov.u32 %r29, %r5;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd10 + 0 ], %rd11;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd12, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd12, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r30, %r5;
+	mov.u32 %r31, %r5;
+	mov.u32 %r32, %r5;
+	mov.u32 %r33, %r5;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r30, %r31, %r32, %r33 }, [ %rd10 + 0 ], %rd12;
+	// end inline asm
+	.loc	1 54 45                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:45
+	add.s32 	%r327, %r104, -3145728;
+	add.s32 	%r328, %r105, -3145728;
+	.loc	1 54 31                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:31
+	mad.wide.s32 	%rd13, %r327, 2, %rd30;
+	mad.wide.s32 	%rd15, %r328, 2, %rd30;
+	.loc	1 54 83                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:83
+	add.s32 	%r329, %r78, -8192;
+	setp.lt.u32 	%p13, %r329, 65536;
+	and.pred 	%p4, %p8, %p13;
+	add.s32 	%r330, %r78, -8160;
+	setp.lt.u32 	%p14, %r330, 65568;
+	and.pred 	%p5, %p8, %p14;
+	and.pred 	%p6, %p9, %p13;
+	.loc	1 54 67                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:67
+	// begin inline asm
+	mov.u64 %rd14, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd14, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r34, %r5;
+	mov.u32 %r35, %r5;
+	mov.u32 %r36, %r5;
+	mov.u32 %r37, %r5;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd13 + 0 ], %rd14;
+	// end inline asm
+	prmt.b32 	%r331, %r34, %r35, 0x7632U;
+	prmt.b32 	%r332, %r36, %r37, 0x7632U;
+	// begin inline asm
+	mov.u64 %rd16, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd16, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r38, %r5;
+	mov.u32 %r39, %r5;
+	mov.u32 %r40, %r5;
+	mov.u32 %r41, %r5;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r38, %r39, %r40, %r41 }, [ %rd15 + 0 ], %rd16;
+	// end inline asm
+	prmt.b32 	%r333, %r38, %r39, 0x7632U;
+	prmt.b32 	%r334, %r40, %r41, 0x7632U;
+	.loc	1 54 134                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134
+	bar.sync 	0;
+	prmt.b32 	%r335, %r34, %r35, 0x5410U;
+	st.shared.b32 	[%r124], %r335;
+	prmt.b32 	%r336, %r36, %r37, 0x5410U;
+	st.shared.b32 	[%r124+128], %r336;
+	st.shared.b32 	[%r128+4096], %r331;
+	st.shared.b32 	[%r128+4224], %r332;
+	prmt.b32 	%r337, %r38, %r39, 0x5410U;
+	st.shared.b32 	[%r130+2048], %r337;
+	prmt.b32 	%r338, %r40, %r41, 0x5410U;
+	st.shared.b32 	[%r130+2176], %r338;
+	st.shared.b32 	[%r134+6144], %r333;
+	st.shared.b32 	[%r134+6272], %r334;
+	bar.sync 	0;
+	ld.shared.b16 	%rs17, [%r146];
+	ld.shared.b16 	%rs18, [%r148+256];
+	ld.shared.b16 	%rs19, [%r150+512];
+	ld.shared.b16 	%rs20, [%r152+768];
+	ld.shared.b16 	%rs21, [%r154];
+	ld.shared.b16 	%rs22, [%r156+256];
+	ld.shared.b16 	%rs23, [%r158+512];
+	ld.shared.b16 	%rs24, [%r160+768];
+	ld.shared.b16 	%rs25, [%r162];
+	ld.shared.b16 	%rs26, [%r164+256];
+	ld.shared.b16 	%rs27, [%r166+512];
+	ld.shared.b16 	%rs28, [%r168+768];
+	ld.shared.b16 	%rs29, [%r170];
+	ld.shared.b16 	%rs30, [%r172+256];
+	ld.shared.b16 	%rs31, [%r174+512];
+	ld.shared.b16 	%rs32, [%r176+768];
+	.loc	1 56 52                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:52
+	add.s32 	%r339, %r85, -8192;
+	.loc	1 56 31                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:31
+	mad.wide.s32 	%rd17, %r339, 4, %rd31;
+	.loc	1 56 90                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:90
+	// begin inline asm
+	mov.u64 %rd18, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd18, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r42, %r5;
+	mov.u32 %r43, %r5;
+	mov.u32 %r44, %r5;
+	mov.u32 %r45, %r5;
+	@%p6 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd17 + 0 ], %rd18;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd19, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd19, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r46, %r5;
+	mov.u32 %r47, %r5;
+	mov.u32 %r48, %r5;
+	mov.u32 %r49, %r5;
+	@%p6 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r46, %r47, %r48, %r49 }, [ %rd17 + 0 ], %rd19;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd20, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd20, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r50, %r5;
+	mov.u32 %r51, %r5;
+	mov.u32 %r52, %r5;
+	mov.u32 %r53, %r5;
+	@%p6 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r50, %r51, %r52, %r53 }, [ %rd17 + 0 ], %rd20;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd21, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd21, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r54, %r5;
+	mov.u32 %r55, %r5;
+	mov.u32 %r56, %r5;
+	mov.u32 %r57, %r5;
+	@%p6 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r54, %r55, %r56, %r57 }, [ %rd17 + 0 ], %rd21;
+	// end inline asm
+	.loc	1 58 21                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:58:21
+	div.full.f32 	%r340, %r42, %r193;
+	div.full.f32 	%r341, %r43, %r193;
+	div.full.f32 	%r342, %r44, %r193;
+	div.full.f32 	%r343, %r45, %r193;
+	div.full.f32 	%r344, %r46, %r193;
+	div.full.f32 	%r345, %r47, %r193;
+	div.full.f32 	%r346, %r48, %r193;
+	div.full.f32 	%r347, %r49, %r193;
+	div.full.f32 	%r348, %r50, %r193;
+	div.full.f32 	%r349, %r51, %r193;
+	div.full.f32 	%r350, %r52, %r193;
+	div.full.f32 	%r351, %r53, %r193;
+	div.full.f32 	%r352, %r54, %r193;
+	div.full.f32 	%r353, %r55, %r193;
+	div.full.f32 	%r354, %r56, %r193;
+	div.full.f32 	%r355, %r57, %r193;
+	.loc	1 60 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:60:20
+	add.f32 	%r356, %r340, 0f358637BD;
+	add.f32 	%r357, %r341, 0f358637BD;
+	add.f32 	%r358, %r342, 0f358637BD;
+	add.f32 	%r359, %r343, 0f358637BD;
+	add.f32 	%r360, %r344, 0f358637BD;
+	add.f32 	%r361, %r345, 0f358637BD;
+	add.f32 	%r362, %r346, 0f358637BD;
+	add.f32 	%r363, %r347, 0f358637BD;
+	add.f32 	%r364, %r348, 0f358637BD;
+	add.f32 	%r365, %r349, 0f358637BD;
+	add.f32 	%r366, %r350, 0f358637BD;
+	add.f32 	%r367, %r351, 0f358637BD;
+	add.f32 	%r368, %r352, 0f358637BD;
+	add.f32 	%r369, %r353, 0f358637BD;
+	add.f32 	%r370, %r354, 0f358637BD;
+	add.f32 	%r371, %r355, 0f358637BD;
+	.loc	1 61 28                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:61:28
+	rsqrt.approx.ftz.f32 	%r372, %r356;
+	rsqrt.approx.ftz.f32 	%r373, %r357;
+	rsqrt.approx.ftz.f32 	%r374, %r358;
+	rsqrt.approx.ftz.f32 	%r375, %r359;
+	rsqrt.approx.ftz.f32 	%r376, %r360;
+	rsqrt.approx.ftz.f32 	%r377, %r361;
+	rsqrt.approx.ftz.f32 	%r378, %r362;
+	rsqrt.approx.ftz.f32 	%r379, %r363;
+	rsqrt.approx.ftz.f32 	%r380, %r364;
+	rsqrt.approx.ftz.f32 	%r381, %r365;
+	rsqrt.approx.ftz.f32 	%r382, %r366;
+	rsqrt.approx.ftz.f32 	%r383, %r367;
+	rsqrt.approx.ftz.f32 	%r384, %r368;
+	rsqrt.approx.ftz.f32 	%r385, %r369;
+	rsqrt.approx.ftz.f32 	%r386, %r370;
+	rsqrt.approx.ftz.f32 	%r387, %r371;
+	.loc	1 23 21                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:23:21
+	setp.lt.s32 	%p15, %r83, 73728;
+	.loc	1 35 18                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:35:18
+	setp.lt.s32 	%p16, %r84, 8192;
+	.loc	1 54 134                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134
+	cvt.f32.bf16 	%r388, %rs32;
+	cvt.f32.bf16 	%r389, %rs31;
+	cvt.f32.bf16 	%r390, %rs30;
+	cvt.f32.bf16 	%r391, %rs29;
+	cvt.f32.bf16 	%r392, %rs28;
+	cvt.f32.bf16 	%r393, %rs27;
+	cvt.f32.bf16 	%r394, %rs26;
+	cvt.f32.bf16 	%r395, %rs25;
+	cvt.f32.bf16 	%r396, %rs24;
+	cvt.f32.bf16 	%r397, %rs23;
+	cvt.f32.bf16 	%r398, %rs22;
+	cvt.f32.bf16 	%r399, %rs21;
+	cvt.f32.bf16 	%r400, %rs20;
+	cvt.f32.bf16 	%r401, %rs19;
+	cvt.f32.bf16 	%r402, %rs18;
+	cvt.f32.bf16 	%r403, %rs17;
+	.loc	1 62 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:62:20
+	mul.f32 	%r404, %r372, %r403;
+	mul.f32 	%r405, %r373, %r402;
+	mul.f32 	%r406, %r374, %r401;
+	mul.f32 	%r407, %r375, %r400;
+	mul.f32 	%r408, %r376, %r399;
+	mul.f32 	%r409, %r377, %r398;
+	mul.f32 	%r410, %r378, %r397;
+	mul.f32 	%r411, %r379, %r396;
+	mul.f32 	%r412, %r380, %r395;
+	mul.f32 	%r413, %r381, %r394;
+	mul.f32 	%r414, %r382, %r393;
+	mul.f32 	%r415, %r383, %r392;
+	mul.f32 	%r416, %r384, %r391;
+	mul.f32 	%r417, %r385, %r390;
+	mul.f32 	%r418, %r386, %r389;
+	mul.f32 	%r419, %r387, %r388;
+	bar.sync 	0;
+	st.shared.b32 	[%r265], %r404;
+	st.shared.b32 	[%r267+512], %r405;
+	st.shared.b32 	[%r269+1024], %r406;
+	st.shared.b32 	[%r271+1536], %r407;
+	st.shared.b32 	[%r273], %r408;
+	st.shared.b32 	[%r275+512], %r409;
+	st.shared.b32 	[%r277+1024], %r410;
+	st.shared.b32 	[%r279+1536], %r411;
+	st.shared.b32 	[%r281], %r412;
+	st.shared.b32 	[%r283+512], %r413;
+	st.shared.b32 	[%r285+1024], %r414;
+	st.shared.b32 	[%r287+1536], %r415;
+	st.shared.b32 	[%r289], %r416;
+	st.shared.b32 	[%r291+512], %r417;
+	st.shared.b32 	[%r293+1024], %r418;
+	st.shared.b32 	[%r295+1536], %r419;
+	bar.sync 	0;
+	ld.shared.b32 	%r420, [%r304];
+	ld.shared.b32 	%r421, [%r304+128];
+	ld.shared.b32 	%r422, [%r304+256];
+	ld.shared.b32 	%r423, [%r304+384];
+	ld.shared.b32 	%r424, [%r310+8192];
+	ld.shared.b32 	%r425, [%r310+8320];
+	ld.shared.b32 	%r426, [%r310+8448];
+	ld.shared.b32 	%r427, [%r310+8576];
+	ld.shared.b32 	%r428, [%r316+4096];
+	ld.shared.b32 	%r429, [%r316+4224];
+	ld.shared.b32 	%r430, [%r316+4352];
+	ld.shared.b32 	%r431, [%r316+4480];
+	ld.shared.b32 	%r432, [%r322+12288];
+	ld.shared.b32 	%r433, [%r322+12416];
+	ld.shared.b32 	%r434, [%r322+12544];
+	ld.shared.b32 	%r435, [%r322+12672];
+	.loc	1 63 31                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:31
+	add.s64 	%rd22, %rd32, %rd34;
+	.loc	1 63 71                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:71
+	// begin inline asm
+	mov.u64 %rd23, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd23, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r58, %r5;
+	mov.u32 %r59, %r5;
+	mov.u32 %r60, %r5;
+	mov.u32 %r61, %r5;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r58, %r59, %r60, %r61 }, [ %rd22 + 0 ], %rd23;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd24, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd24, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r62, %r5;
+	mov.u32 %r63, %r5;
+	mov.u32 %r64, %r5;
+	mov.u32 %r65, %r5;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r62, %r63, %r64, %r65 }, [ %rd22 + 0 ], %rd24;
+	// end inline asm
+	.loc	1 70 34                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:34
+	shl.b32 	%r436, %r83, 7;
+	shl.b32 	%r437, %r84, 7;
+	.loc	1 70 30                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:30
+	add.s32 	%r438, %r436, %r92;
+	add.s32 	%r439, %r437, %r92;
+	.loc	1 70 25                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:25
+	mad.wide.s32 	%rd25, %r438, 2, %rd33;
+	mad.wide.s32 	%rd26, %r439, 2, %rd33;
+	.loc	1 70 54                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:54
+	and.pred 	%p7, %p8, %p15;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs33, %rs34}, %r26;
+	cvt.f32.bf16 	%r440, %rs33;
+	cvt.f32.bf16 	%r441, %rs34;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r442, %r311, %r441;
+	mul.f32 	%r443, %r305, %r440;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs35, %rs36}, %r58;
+	cvt.f32.bf16 	%r444, %rs35;
+	cvt.f32.bf16 	%r445, %rs36;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r446, %r424, %r445;
+	mul.f32 	%r447, %r420, %r444;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r448, %r443, %r447, %p10;
+	selp.f32 	%r449, %r442, %r446, %p10;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r66, %r449, %r448;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs37, %rs38}, %r27;
+	cvt.f32.bf16 	%r450, %rs37;
+	cvt.f32.bf16 	%r451, %rs38;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r452, %r312, %r451;
+	mul.f32 	%r453, %r306, %r450;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs39, %rs40}, %r59;
+	cvt.f32.bf16 	%r454, %rs39;
+	cvt.f32.bf16 	%r455, %rs40;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r456, %r425, %r455;
+	mul.f32 	%r457, %r421, %r454;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r458, %r453, %r457, %p10;
+	selp.f32 	%r459, %r452, %r456, %p10;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r67, %r459, %r458;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs41, %rs42}, %r28;
+	cvt.f32.bf16 	%r460, %rs41;
+	cvt.f32.bf16 	%r461, %rs42;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r462, %r313, %r461;
+	mul.f32 	%r463, %r307, %r460;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs43, %rs44}, %r60;
+	cvt.f32.bf16 	%r464, %rs43;
+	cvt.f32.bf16 	%r465, %rs44;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r466, %r426, %r465;
+	mul.f32 	%r467, %r422, %r464;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r468, %r463, %r467, %p10;
+	selp.f32 	%r469, %r462, %r466, %p10;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r68, %r469, %r468;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs45, %rs46}, %r29;
+	cvt.f32.bf16 	%r470, %rs45;
+	cvt.f32.bf16 	%r471, %rs46;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r472, %r314, %r471;
+	mul.f32 	%r473, %r308, %r470;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs47, %rs48}, %r61;
+	cvt.f32.bf16 	%r474, %rs47;
+	cvt.f32.bf16 	%r475, %rs48;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r476, %r427, %r475;
+	mul.f32 	%r477, %r423, %r474;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r478, %r473, %r477, %p10;
+	selp.f32 	%r479, %r472, %r476, %p10;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r69, %r479, %r478;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs49, %rs50}, %r30;
+	cvt.f32.bf16 	%r480, %rs49;
+	cvt.f32.bf16 	%r481, %rs50;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r482, %r323, %r481;
+	mul.f32 	%r483, %r317, %r480;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs51, %rs52}, %r62;
+	cvt.f32.bf16 	%r484, %rs51;
+	cvt.f32.bf16 	%r485, %rs52;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r486, %r432, %r485;
+	mul.f32 	%r487, %r428, %r484;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r488, %r483, %r487, %p16;
+	selp.f32 	%r489, %r482, %r486, %p16;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r70, %r489, %r488;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs53, %rs54}, %r31;
+	cvt.f32.bf16 	%r490, %rs53;
+	cvt.f32.bf16 	%r491, %rs54;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r492, %r324, %r491;
+	mul.f32 	%r493, %r318, %r490;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs55, %rs56}, %r63;
+	cvt.f32.bf16 	%r494, %rs55;
+	cvt.f32.bf16 	%r495, %rs56;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r496, %r433, %r495;
+	mul.f32 	%r497, %r429, %r494;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r498, %r493, %r497, %p16;
+	selp.f32 	%r499, %r492, %r496, %p16;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r71, %r499, %r498;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs57, %rs58}, %r32;
+	cvt.f32.bf16 	%r500, %rs57;
+	cvt.f32.bf16 	%r501, %rs58;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r502, %r325, %r501;
+	mul.f32 	%r503, %r319, %r500;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs59, %rs60}, %r64;
+	cvt.f32.bf16 	%r504, %rs59;
+	cvt.f32.bf16 	%r505, %rs60;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r506, %r434, %r505;
+	mul.f32 	%r507, %r430, %r504;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r508, %r503, %r507, %p16;
+	selp.f32 	%r509, %r502, %r506, %p16;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r72, %r509, %r508;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs61, %rs62}, %r33;
+	cvt.f32.bf16 	%r510, %rs61;
+	cvt.f32.bf16 	%r511, %rs62;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r512, %r326, %r511;
+	mul.f32 	%r513, %r320, %r510;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs63, %rs64}, %r65;
+	cvt.f32.bf16 	%r514, %rs63;
+	cvt.f32.bf16 	%r515, %rs64;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r516, %r435, %r515;
+	mul.f32 	%r517, %r431, %r514;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r518, %r513, %r517, %p16;
+	selp.f32 	%r519, %r512, %r516, %p16;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r73, %r519, %r518;
+	// begin inline asm
+	@%p7 st.global.v4.b32 [ %rd25 + 0 ], { %r66, %r67, %r68, %r69 };
+	// end inline asm
+	// begin inline asm
+	@%p7 st.global.v4.b32 [ %rd26 + 0 ], { %r70, %r71, %r72, %r73 };
+	// end inline asm
+	.loc	1 70 4                          // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 50
+.b8 104
+.b8 105
+.b8 106
+.b8 51
+.b8 104
+.b8 109
+.b8 108
+.b8 111
+.b8 117
+.b8 109
+.b8 120
+.b8 100
+.b8 109
+.b8 104
+.b8 117
+.b8 101
+.b8 122
+.b8 115
+.b8 121
+.b8 104
+.b8 107
+.b8 109
+.b8 110
+.b8 113
+.b8 103
+.b8 110
+.b8 102
+.b8 97
+.b8 53
+.b8 105
+.b8 118
+.b8 114
+.b8 101
+.b8 50
+.b8 55
+.b8 117
+.b8 111
+.b8 115
+.b8 121
+.b8 109
+.b8 97
+.b8 109
+.b8 51
+.b8 100
+.b8 114
+.b8 55
+.b8 97
+.b8 53
+.b8 120
+.b8 98
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 50
+.b8 104
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.source b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.source
new file mode 100644
index 0000000000000000000000000000000000000000..b94eb6d636bfed0194580b375716177991cbaafa
--- /dev/null
+++ b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.source
@@ -0,0 +1,415 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0)
+#loc99 = loc("in_ptr0"(#loc))
+#loc100 = loc("in_ptr1"(#loc))
+#loc101 = loc("in_ptr2"(#loc))
+#loc102 = loc("in_ptr3"(#loc))
+#loc103 = loc("in_ptr4"(#loc))
+#loc104 = loc("in_ptr5"(#loc))
+#loc105 = loc("out_ptr0"(#loc))
+#loc106 = loc("ynumel"(#loc))
+#loc107 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %ynumel_0 = arith.constant 73728 : i32 loc(#loc108)
+    %xnumel_1 = arith.constant 128 : i32 loc(#loc109)
+    %yoffset = tt.get_program_id y : i32 loc(#loc110)
+    %yoffset_2 = tt.get_program_id z : i32 loc(#loc111)
+    %yoffset_3 = tt.get_num_programs y : i32 loc(#loc112)
+    %yoffset_4 = arith.muli %yoffset_2, %yoffset_3 : i32 loc(#loc113)
+    %yoffset_5 = arith.addi %yoffset, %yoffset_4 : i32 loc(#loc114)
+    %yoffset_6 = arith.constant 64 : i32 loc(#loc115)
+    %yoffset_7 = arith.constant 64 : i32 loc(#loc115)
+    %yoffset_8 = arith.muli %yoffset_5, %yoffset_7 : i32 loc(#loc115)
+    %yindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc116)
+    %yindex_9 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc117)
+    %yindex_10 = tt.splat %yoffset_8 : i32 -> tensor<64x1xi32> loc(#loc118)
+    %yindex_11 = arith.addi %yindex_10, %yindex_9 : tensor<64x1xi32> loc(#loc118)
+    %ymask = arith.constant dense<73728> : tensor<64x1xi32> loc(#loc119)
+    %ymask_12 = arith.cmpi slt, %yindex_11, %ymask : tensor<64x1xi32> loc(#loc119)
+    %xoffset = tt.get_program_id x : i32 loc(#loc120)
+    %xoffset_13 = arith.constant 64 : i32 loc(#loc121)
+    %xoffset_14 = arith.constant 64 : i32 loc(#loc121)
+    %xoffset_15 = arith.muli %xoffset, %xoffset_14 : i32 loc(#loc121)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc122)
+    %xindex_16 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc123)
+    %xindex_17 = tt.splat %xoffset_15 : i32 -> tensor<1x64xi32> loc(#loc124)
+    %xindex_18 = arith.addi %xindex_17, %xindex_16 : tensor<1x64xi32> loc(#loc124)
+    %xmask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc125)
+    %xmask_19 = arith.cmpi slt, %xindex_18, %xmask : tensor<1x64xi32> loc(#loc125)
+    %y1 = arith.constant 32 : i32 loc(#loc126)
+    %y1_20 = arith.constant 32 : i32 loc(#loc126)
+    %y1_21 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc126)
+    %y1_22 = arith.divsi %yindex_11, %y1_21 : tensor<64x1xi32> loc(#loc126)
+    %y0 = arith.constant 32 : i32 loc(#loc127)
+    %y0_23 = arith.constant 32 : i32 loc(#loc127)
+    %y0_24 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc127)
+    %y0_25 = arith.remsi %yindex_11, %y0_24 : tensor<64x1xi32> loc(#loc127)
+    %tmp1 = arith.constant 0 : i64 loc(#loc128)
+    %tmp1_26 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc128)
+    %tmp2 = arith.extsi %y1_22 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc129)
+    %tmp2_27 = arith.constant dense<0> : tensor<64x1xi64> loc(#loc129)
+    %tmp2_28 = arith.cmpi sge, %tmp2, %tmp2_27 : tensor<64x1xi64> loc(#loc129)
+    %tmp3 = arith.constant 256 : i64 loc(#loc130)
+    %tmp3_29 = arith.constant dense<256> : tensor<1x1xi64> loc(#loc130)
+    %tmp4 = arith.extsi %y1_22 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc131)
+    %tmp4_30 = arith.constant dense<256> : tensor<64x1xi64> loc(#loc131)
+    %tmp4_31 = arith.cmpi slt, %tmp4, %tmp4_30 : tensor<64x1xi64> loc(#loc131)
+    %tmp5 = arith.constant 128 : i32 loc(#loc132)
+    %tmp5_32 = arith.constant 128 : i32 loc(#loc132)
+    %tmp5_33 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc132)
+    %tmp5_34 = arith.muli %tmp5_33, %y0_25 : tensor<64x1xi32> loc(#loc132)
+    %tmp5_35 = tt.broadcast %xindex_18 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc133)
+    %tmp5_36 = tt.broadcast %tmp5_34 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc133)
+    %tmp5_37 = arith.addi %tmp5_35, %tmp5_36 : tensor<64x64xi32> loc(#loc133)
+    %tmp5_38 = arith.constant 12288 : i32 loc(#loc134)
+    %tmp5_39 = arith.constant 12288 : i32 loc(#loc134)
+    %tmp5_40 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc134)
+    %tmp5_41 = arith.muli %tmp5_40, %y1_22 : tensor<64x1xi32> loc(#loc134)
+    %tmp5_42 = tt.broadcast %tmp5_41 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc135)
+    %tmp5_43 = arith.addi %tmp5_37, %tmp5_42 : tensor<64x64xi32> loc(#loc135)
+    %tmp5_44 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc136)
+    %tmp5_45 = tt.addptr %tmp5_44, %tmp5_43 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc136)
+    %tmp5_46 = tt.broadcast %tmp4_31 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc137)
+    %tmp5_47 = tt.broadcast %xmask_19 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc137)
+    %tmp5_48 = arith.andi %tmp5_46, %tmp5_47 : tensor<64x64xi1> loc(#loc137)
+    %tmp5_49 = tt.broadcast %ymask_12 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc138)
+    %tmp5_50 = arith.andi %tmp5_48, %tmp5_49 : tensor<64x64xi1> loc(#loc138)
+    %tmp5_51 = arith.constant 0.000000e+00 : f32 loc(#loc139)
+    %tmp5_52 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc139)
+    %tmp5_53 = arith.truncf %tmp5_52 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc139)
+    %tmp5_54 = tt.load %tmp5_45, %tmp5_50, %tmp5_53 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc139)
+    %tmp5_55 = arith.extf %tmp5_54 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc140)
+    %tmp7 = arith.constant 32 : i32 loc(#loc141)
+    %tmp7_56 = arith.constant 32 : i32 loc(#loc141)
+    %tmp7_57 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc141)
+    %tmp7_58 = arith.muli %tmp7_57, %y1_22 : tensor<64x1xi32> loc(#loc141)
+    %tmp7_59 = arith.addi %y0_25, %tmp7_58 : tensor<64x1xi32> loc(#loc142)
+    %tmp7_60 = tt.broadcast %tmp7_59 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc143)
+    %tmp7_61 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<64x64x!tt.ptr<f32>> loc(#loc144)
+    %tmp7_62 = tt.addptr %tmp7_61, %tmp7_60 : tensor<64x64x!tt.ptr<f32>>, tensor<64x64xi32> loc(#loc144)
+    %tmp7_63 = tt.broadcast %tmp4_31 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc145)
+    %tmp7_64 = tt.broadcast %xmask_19 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc145)
+    %tmp7_65 = arith.andi %tmp7_63, %tmp7_64 : tensor<64x64xi1> loc(#loc145)
+    %tmp7_66 = tt.broadcast %ymask_12 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc146)
+    %tmp7_67 = arith.andi %tmp7_65, %tmp7_66 : tensor<64x64xi1> loc(#loc146)
+    %tmp7_68 = arith.constant 0.000000e+00 : f32 loc(#loc147)
+    %tmp7_69 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc147)
+    %tmp7_70 = tt.load %tmp7_62, %tmp7_67, %tmp7_69 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<f32>> loc(#loc147)
+    %tmp8 = arith.constant 1.280000e+02 : f32 loc(#loc148)
+    %tmp9 = arith.constant dense<1.280000e+02> : tensor<64x64xf32> loc(#loc149)
+    %tmp9_71 = arith.divf %tmp7_70, %tmp9 : tensor<64x64xf32> loc(#loc149)
+    %tmp10 = arith.constant 9.99999997E-7 : f32 loc(#loc150)
+    %tmp11 = arith.constant dense<9.99999997E-7> : tensor<64x64xf32> loc(#loc151)
+    %tmp11_72 = arith.addf %tmp9_71, %tmp11 : tensor<64x64xf32> loc(#loc151)
+    %tmp12 = tt.extern_elementwise %tmp11_72 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x64xf32>) -> tensor<64x64xf32> loc(#loc152)
+    %tmp13 = arith.mulf %tmp5_55, %tmp12 : tensor<64x64xf32> loc(#loc153)
+    %tmp14 = tt.broadcast %xindex_18 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc154)
+    %tmp14_73 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc155)
+    %tmp14_74 = tt.addptr %tmp14_73, %tmp14 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc155)
+    %tmp14_75 = tt.broadcast %tmp4_31 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc156)
+    %tmp14_76 = tt.broadcast %xmask_19 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc156)
+    %tmp14_77 = arith.andi %tmp14_75, %tmp14_76 : tensor<64x64xi1> loc(#loc156)
+    %tmp14_78 = tt.broadcast %ymask_12 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc157)
+    %tmp14_79 = arith.andi %tmp14_77, %tmp14_78 : tensor<64x64xi1> loc(#loc157)
+    %tmp14_80 = arith.constant 0.000000e+00 : f32 loc(#loc158)
+    %tmp14_81 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc158)
+    %tmp14_82 = arith.truncf %tmp14_81 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc158)
+    %tmp14_83 = tt.load %tmp14_74, %tmp14_79, %tmp14_82 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc158)
+    %tmp14_84 = arith.extf %tmp14_83 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc159)
+    %tmp16 = arith.mulf %tmp13, %tmp14_84 : tensor<64x64xf32> loc(#loc160)
+    %tmp18 = arith.constant 0.000000e+00 : f32 loc(#loc161)
+    %tmp18_85 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc161)
+    %tmp19 = tt.broadcast %tmp4_31 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc162)
+    %tmp19_86 = arith.select %tmp19, %tmp16, %tmp18_85 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc162)
+    %tmp20 = arith.extsi %y1_22 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc163)
+    %tmp20_87 = arith.constant dense<256> : tensor<64x1xi64> loc(#loc163)
+    %tmp20_88 = arith.cmpi sge, %tmp20, %tmp20_87 : tensor<64x1xi64> loc(#loc163)
+    %tmp21 = arith.constant 2304 : i64 loc(#loc164)
+    %tmp21_89 = arith.constant dense<2304> : tensor<1x1xi64> loc(#loc164)
+    %tmp22 = arith.extsi %y1_22 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc165)
+    %tmp22_90 = arith.constant dense<2304> : tensor<64x1xi64> loc(#loc165)
+    %tmp22_91 = arith.cmpi slt, %tmp22, %tmp22_90 : tensor<64x1xi64> loc(#loc165)
+    %tmp23 = arith.constant 128 : i32 loc(#loc166)
+    %tmp23_92 = arith.constant 128 : i32 loc(#loc166)
+    %tmp23_93 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc166)
+    %tmp23_94 = arith.muli %tmp23_93, %y0_25 : tensor<64x1xi32> loc(#loc166)
+    %tmp23_95 = tt.broadcast %xindex_18 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc167)
+    %tmp23_96 = tt.broadcast %tmp23_94 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc167)
+    %tmp23_97 = arith.addi %tmp23_95, %tmp23_96 : tensor<64x64xi32> loc(#loc167)
+    %tmp23_98 = arith.constant -256 : i32 loc(#loc168)
+    %tmp23_99 = arith.constant -256 : i32 loc(#loc168)
+    %tmp23_100 = arith.constant dense<-256> : tensor<64x1xi32> loc(#loc168)
+    %tmp23_101 = arith.addi %tmp23_100, %y1_22 : tensor<64x1xi32> loc(#loc168)
+    %tmp23_102 = arith.constant 12288 : i32 loc(#loc169)
+    %tmp23_103 = arith.constant 12288 : i32 loc(#loc169)
+    %tmp23_104 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc169)
+    %tmp23_105 = arith.muli %tmp23_104, %tmp23_101 : tensor<64x1xi32> loc(#loc169)
+    %tmp23_106 = tt.broadcast %tmp23_105 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc170)
+    %tmp23_107 = arith.addi %tmp23_97, %tmp23_106 : tensor<64x64xi32> loc(#loc170)
+    %tmp23_108 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc171)
+    %tmp23_109 = tt.addptr %tmp23_108, %tmp23_107 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc171)
+    %tmp23_110 = tt.broadcast %tmp20_88 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc172)
+    %tmp23_111 = tt.broadcast %xmask_19 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc172)
+    %tmp23_112 = arith.andi %tmp23_110, %tmp23_111 : tensor<64x64xi1> loc(#loc172)
+    %tmp23_113 = tt.broadcast %ymask_12 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc173)
+    %tmp23_114 = arith.andi %tmp23_112, %tmp23_113 : tensor<64x64xi1> loc(#loc173)
+    %tmp23_115 = arith.constant 0.000000e+00 : f32 loc(#loc174)
+    %tmp23_116 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc174)
+    %tmp23_117 = arith.truncf %tmp23_116 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc174)
+    %tmp23_118 = tt.load %tmp23_109, %tmp23_114, %tmp23_117 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc174)
+    %tmp23_119 = arith.extf %tmp23_118 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc175)
+    %tmp25 = arith.constant -256 : i32 loc(#loc176)
+    %tmp25_120 = arith.constant -256 : i32 loc(#loc176)
+    %tmp25_121 = arith.constant dense<-256> : tensor<64x1xi32> loc(#loc176)
+    %tmp25_122 = arith.addi %tmp25_121, %y1_22 : tensor<64x1xi32> loc(#loc176)
+    %tmp25_123 = arith.constant 32 : i32 loc(#loc177)
+    %tmp25_124 = arith.constant 32 : i32 loc(#loc177)
+    %tmp25_125 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc177)
+    %tmp25_126 = arith.muli %tmp25_125, %tmp25_122 : tensor<64x1xi32> loc(#loc177)
+    %tmp25_127 = arith.addi %y0_25, %tmp25_126 : tensor<64x1xi32> loc(#loc178)
+    %tmp25_128 = tt.broadcast %tmp25_127 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc179)
+    %tmp25_129 = tt.splat %in_ptr4 : !tt.ptr<f32> -> tensor<64x64x!tt.ptr<f32>> loc(#loc180)
+    %tmp25_130 = tt.addptr %tmp25_129, %tmp25_128 : tensor<64x64x!tt.ptr<f32>>, tensor<64x64xi32> loc(#loc180)
+    %tmp25_131 = tt.broadcast %tmp20_88 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc181)
+    %tmp25_132 = tt.broadcast %xmask_19 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc181)
+    %tmp25_133 = arith.andi %tmp25_131, %tmp25_132 : tensor<64x64xi1> loc(#loc181)
+    %tmp25_134 = tt.broadcast %ymask_12 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc182)
+    %tmp25_135 = arith.andi %tmp25_133, %tmp25_134 : tensor<64x64xi1> loc(#loc182)
+    %tmp25_136 = arith.constant 0.000000e+00 : f32 loc(#loc183)
+    %tmp25_137 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc183)
+    %tmp25_138 = tt.load %tmp25_130, %tmp25_135, %tmp25_137 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<f32>> loc(#loc183)
+    %tmp26 = arith.constant 1.280000e+02 : f32 loc(#loc184)
+    %tmp27 = arith.constant dense<1.280000e+02> : tensor<64x64xf32> loc(#loc185)
+    %tmp27_139 = arith.divf %tmp25_138, %tmp27 : tensor<64x64xf32> loc(#loc185)
+    %tmp28 = arith.constant 9.99999997E-7 : f32 loc(#loc186)
+    %tmp29 = arith.constant dense<9.99999997E-7> : tensor<64x64xf32> loc(#loc187)
+    %tmp29_140 = arith.addf %tmp27_139, %tmp29 : tensor<64x64xf32> loc(#loc187)
+    %tmp30 = tt.extern_elementwise %tmp29_140 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x64xf32>) -> tensor<64x64xf32> loc(#loc188)
+    %tmp31 = arith.mulf %tmp23_119, %tmp30 : tensor<64x64xf32> loc(#loc189)
+    %tmp32 = tt.broadcast %xindex_18 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc190)
+    %tmp32_141 = tt.splat %in_ptr5 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc191)
+    %tmp32_142 = tt.addptr %tmp32_141, %tmp32 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc191)
+    %tmp32_143 = tt.broadcast %tmp20_88 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc192)
+    %tmp32_144 = tt.broadcast %xmask_19 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc192)
+    %tmp32_145 = arith.andi %tmp32_143, %tmp32_144 : tensor<64x64xi1> loc(#loc192)
+    %tmp32_146 = tt.broadcast %ymask_12 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc193)
+    %tmp32_147 = arith.andi %tmp32_145, %tmp32_146 : tensor<64x64xi1> loc(#loc193)
+    %tmp32_148 = arith.constant 0.000000e+00 : f32 loc(#loc194)
+    %tmp32_149 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc194)
+    %tmp32_150 = arith.truncf %tmp32_149 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc194)
+    %tmp32_151 = tt.load %tmp32_142, %tmp32_147, %tmp32_150 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc194)
+    %tmp32_152 = arith.extf %tmp32_151 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc195)
+    %tmp34 = arith.mulf %tmp31, %tmp32_152 : tensor<64x64xf32> loc(#loc196)
+    %tmp36 = arith.constant 0.000000e+00 : f32 loc(#loc197)
+    %tmp36_153 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc197)
+    %tmp37 = tt.broadcast %tmp20_88 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc198)
+    %tmp37_154 = arith.select %tmp37, %tmp34, %tmp36_153 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc198)
+    %tmp38 = tt.broadcast %tmp4_31 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc199)
+    %tmp38_155 = arith.select %tmp38, %tmp19_86, %tmp37_154 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc199)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc93)
+    %c128_i32_156 = arith.constant 128 : i32 loc(#loc93)
+    %cst = arith.constant dense<128> : tensor<64x1xi32> loc(#loc93)
+    %0 = arith.muli %cst, %yindex_11 : tensor<64x1xi32> loc(#loc93)
+    %1 = tt.broadcast %xindex_18 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc94)
+    %2 = tt.broadcast %0 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc94)
+    %3 = arith.addi %1, %2 : tensor<64x64xi32> loc(#loc94)
+    %4 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc95)
+    %5 = tt.addptr %4, %3 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc95)
+    %6 = tt.broadcast %xmask_19 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc96)
+    %7 = tt.broadcast %ymask_12 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc96)
+    %8 = arith.andi %6, %7 : tensor<64x64xi1> loc(#loc96)
+    %9 = arith.truncf %tmp38_155 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc97)
+    tt.store %5, %9, %8 : tensor<64x64x!tt.ptr<bf16>> loc(#loc97)
+    tt.return loc(#loc98)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":20:13)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:36)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:36)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":32:30)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":33:19)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":34:32)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:60)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:87)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:95)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":39:11)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":41:12)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:51)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:78)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:86)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":49:38)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":52:34)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":53:19)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:40)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:36)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:65)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:70)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:98)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:106)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":57:12)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":59:12)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:51)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:79)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:87)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":67:38)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4)
+#loc108 = loc("ynumel"(#loc1))
+#loc109 = loc("xnumel"(#loc2))
+#loc110 = loc("yoffset"(#loc3))
+#loc111 = loc("yoffset"(#loc4))
+#loc112 = loc("yoffset"(#loc5))
+#loc113 = loc("yoffset"(#loc6))
+#loc114 = loc("yoffset"(#loc7))
+#loc115 = loc("yoffset"(#loc8))
+#loc116 = loc("yindex"(#loc9))
+#loc117 = loc("yindex"(#loc10))
+#loc118 = loc("yindex"(#loc11))
+#loc119 = loc("ymask"(#loc12))
+#loc120 = loc("xoffset"(#loc13))
+#loc121 = loc("xoffset"(#loc14))
+#loc122 = loc("xindex"(#loc15))
+#loc123 = loc("xindex"(#loc16))
+#loc124 = loc("xindex"(#loc17))
+#loc125 = loc("xmask"(#loc18))
+#loc126 = loc("y1"(#loc19))
+#loc127 = loc("y0"(#loc20))
+#loc128 = loc("tmp1"(#loc21))
+#loc129 = loc("tmp2"(#loc22))
+#loc130 = loc("tmp3"(#loc23))
+#loc131 = loc("tmp4"(#loc24))
+#loc132 = loc("tmp5"(#loc25))
+#loc133 = loc("tmp5"(#loc26))
+#loc134 = loc("tmp5"(#loc27))
+#loc135 = loc("tmp5"(#loc28))
+#loc136 = loc("tmp5"(#loc29))
+#loc137 = loc("tmp5"(#loc30))
+#loc138 = loc("tmp5"(#loc31))
+#loc139 = loc("tmp5"(#loc32))
+#loc140 = loc("tmp5"(#loc33))
+#loc141 = loc("tmp7"(#loc34))
+#loc142 = loc("tmp7"(#loc35))
+#loc143 = loc("tmp7"(#loc36))
+#loc144 = loc("tmp7"(#loc37))
+#loc145 = loc("tmp7"(#loc38))
+#loc146 = loc("tmp7"(#loc39))
+#loc147 = loc("tmp7"(#loc40))
+#loc148 = loc("tmp8"(#loc41))
+#loc149 = loc("tmp9"(#loc42))
+#loc150 = loc("tmp10"(#loc43))
+#loc151 = loc("tmp11"(#loc44))
+#loc152 = loc("tmp12"(#loc45))
+#loc153 = loc("tmp13"(#loc46))
+#loc154 = loc("tmp14"(#loc47))
+#loc155 = loc("tmp14"(#loc48))
+#loc156 = loc("tmp14"(#loc49))
+#loc157 = loc("tmp14"(#loc50))
+#loc158 = loc("tmp14"(#loc51))
+#loc159 = loc("tmp14"(#loc52))
+#loc160 = loc("tmp16"(#loc53))
+#loc161 = loc("tmp18"(#loc54))
+#loc162 = loc("tmp19"(#loc55))
+#loc163 = loc("tmp20"(#loc56))
+#loc164 = loc("tmp21"(#loc57))
+#loc165 = loc("tmp22"(#loc58))
+#loc166 = loc("tmp23"(#loc59))
+#loc167 = loc("tmp23"(#loc60))
+#loc168 = loc("tmp23"(#loc61))
+#loc169 = loc("tmp23"(#loc62))
+#loc170 = loc("tmp23"(#loc63))
+#loc171 = loc("tmp23"(#loc64))
+#loc172 = loc("tmp23"(#loc65))
+#loc173 = loc("tmp23"(#loc66))
+#loc174 = loc("tmp23"(#loc67))
+#loc175 = loc("tmp23"(#loc68))
+#loc176 = loc("tmp25"(#loc69))
+#loc177 = loc("tmp25"(#loc70))
+#loc178 = loc("tmp25"(#loc71))
+#loc179 = loc("tmp25"(#loc72))
+#loc180 = loc("tmp25"(#loc73))
+#loc181 = loc("tmp25"(#loc74))
+#loc182 = loc("tmp25"(#loc75))
+#loc183 = loc("tmp25"(#loc76))
+#loc184 = loc("tmp26"(#loc77))
+#loc185 = loc("tmp27"(#loc78))
+#loc186 = loc("tmp28"(#loc79))
+#loc187 = loc("tmp29"(#loc80))
+#loc188 = loc("tmp30"(#loc81))
+#loc189 = loc("tmp31"(#loc82))
+#loc190 = loc("tmp32"(#loc83))
+#loc191 = loc("tmp32"(#loc84))
+#loc192 = loc("tmp32"(#loc85))
+#loc193 = loc("tmp32"(#loc86))
+#loc194 = loc("tmp32"(#loc87))
+#loc195 = loc("tmp32"(#loc88))
+#loc196 = loc("tmp34"(#loc89))
+#loc197 = loc("tmp36"(#loc90))
+#loc198 = loc("tmp37"(#loc91))
+#loc199 = loc("tmp38"(#loc92))
diff --git a/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..77ad0ab5b1f794f85b02769d464f0854ffadfa5a
--- /dev/null
+++ b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir
@@ -0,0 +1,287 @@
+#blocked = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 8], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [8, 1], order = [1, 0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0)
+#loc70 = loc("in_ptr0"(#loc))
+#loc71 = loc("in_ptr1"(#loc))
+#loc72 = loc("in_ptr2"(#loc))
+#loc73 = loc("in_ptr3"(#loc))
+#loc74 = loc("in_ptr4"(#loc))
+#loc75 = loc("in_ptr5"(#loc))
+#loc76 = loc("out_ptr0"(#loc))
+#loc77 = loc("ynumel"(#loc))
+#loc78 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<-256> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<-256> : tensor<64x1xi32, #blocked1> loc(#loc1)
+    %cst_1 = arith.constant dense<12288> : tensor<64x1xi32, #blocked1> loc(#loc1)
+    %cst_2 = arith.constant dense<128> : tensor<64x1xi32, #blocked1> loc(#loc1)
+    %cst_3 = arith.constant dense<256> : tensor<64x1xi64, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<256> : tensor<64x1xi64, #blocked1> loc(#loc1)
+    %cst_5 = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %cst_6 = arith.constant dense<32> : tensor<64x1xi32, #blocked1> loc(#loc1)
+    %cst_7 = arith.constant dense<128> : tensor<1x64xi32, #blocked> loc(#loc1)
+    %cst_8 = arith.constant dense<128> : tensor<1x64xi32, #blocked1> loc(#loc1)
+    %cst_9 = arith.constant dense<73728> : tensor<64x1xi32, #blocked> loc(#loc1)
+    %cst_10 = arith.constant dense<73728> : tensor<64x1xi32, #blocked1> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %cst_11 = arith.constant dense<0.000000e+00> : tensor<64x64xbf16, #blocked1> loc(#loc1)
+    %cst_12 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked> loc(#loc1)
+    %cst_13 = arith.constant dense<9.99999997E-7> : tensor<64x64xf32, #blocked> loc(#loc1)
+    %cst_14 = arith.constant dense<1.280000e+02> : tensor<64x64xf32, #blocked> loc(#loc1)
+    %cst_15 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked1> loc(#loc1)
+    %yoffset = tt.get_program_id y : i32 loc(#loc79)
+    %yoffset_16 = tt.get_program_id z : i32 loc(#loc80)
+    %yoffset_17 = tt.get_num_programs y : i32 loc(#loc81)
+    %yoffset_18 = arith.muli %yoffset_16, %yoffset_17 : i32 loc(#loc82)
+    %yoffset_19 = arith.addi %yoffset, %yoffset_18 : i32 loc(#loc83)
+    %yoffset_20 = arith.muli %yoffset_19, %c64_i32 : i32 loc(#loc84)
+    %yindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc85)
+    %yindex_21 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc85)
+    %yindex_22 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc85)
+    %yindex_23 = tt.expand_dims %yindex_21 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc85)
+    %yindex_24 = tt.splat %yoffset_20 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc86)
+    %yindex_25 = tt.splat %yoffset_20 : i32 -> tensor<64x1xi32, #blocked> loc(#loc86)
+    %yindex_26 = arith.addi %yindex_24, %yindex_22 : tensor<64x1xi32, #blocked1> loc(#loc86)
+    %yindex_27 = arith.addi %yindex_25, %yindex_23 : tensor<64x1xi32, #blocked> loc(#loc86)
+    %ymask = arith.cmpi slt, %yindex_26, %cst_10 : tensor<64x1xi32, #blocked1> loc(#loc87)
+    %ymask_28 = arith.cmpi slt, %yindex_27, %cst_9 : tensor<64x1xi32, #blocked> loc(#loc87)
+    %xoffset = tt.get_program_id x : i32 loc(#loc88)
+    %xoffset_29 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc89)
+    %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc90)
+    %xindex_30 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc90)
+    %xindex_31 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc90)
+    %xindex_32 = tt.expand_dims %xindex_30 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> loc(#loc90)
+    %xindex_33 = tt.splat %xoffset_29 : i32 -> tensor<1x64xi32, #blocked1> loc(#loc91)
+    %xindex_34 = tt.splat %xoffset_29 : i32 -> tensor<1x64xi32, #blocked> loc(#loc91)
+    %xindex_35 = arith.addi %xindex_33, %xindex_31 : tensor<1x64xi32, #blocked1> loc(#loc91)
+    %xindex_36 = arith.addi %xindex_34, %xindex_32 : tensor<1x64xi32, #blocked> loc(#loc91)
+    %xmask = arith.cmpi slt, %xindex_35, %cst_8 : tensor<1x64xi32, #blocked1> loc(#loc92)
+    %xmask_37 = arith.cmpi slt, %xindex_36, %cst_7 : tensor<1x64xi32, #blocked> loc(#loc92)
+    %y1 = arith.divsi %yindex_26, %cst_6 : tensor<64x1xi32, #blocked1> loc(#loc93)
+    %y1_38 = arith.divsi %yindex_27, %cst_5 : tensor<64x1xi32, #blocked> loc(#loc93)
+    %y0 = arith.remsi %yindex_26, %cst_6 : tensor<64x1xi32, #blocked1> loc(#loc94)
+    %y0_39 = arith.remsi %yindex_27, %cst_5 : tensor<64x1xi32, #blocked> loc(#loc94)
+    %tmp4 = arith.extsi %y1 : tensor<64x1xi32, #blocked1> to tensor<64x1xi64, #blocked1> loc(#loc95)
+    %tmp4_40 = arith.extsi %y1_38 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked> loc(#loc95)
+    %tmp4_41 = arith.cmpi slt, %tmp4, %cst_4 : tensor<64x1xi64, #blocked1> loc(#loc95)
+    %tmp4_42 = arith.cmpi slt, %tmp4_40, %cst_3 : tensor<64x1xi64, #blocked> loc(#loc95)
+    %tmp5 = arith.muli %y0, %cst_2 : tensor<64x1xi32, #blocked1> loc(#loc96)
+    %tmp5_43 = tt.broadcast %xindex_35 : tensor<1x64xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc97)
+    %tmp5_44 = tt.broadcast %tmp5 : tensor<64x1xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc97)
+    %tmp5_45 = arith.addi %tmp5_43, %tmp5_44 : tensor<64x64xi32, #blocked1> loc(#loc97)
+    %tmp5_46 = arith.muli %y1, %cst_1 : tensor<64x1xi32, #blocked1> loc(#loc98)
+    %tmp5_47 = tt.broadcast %tmp5_46 : tensor<64x1xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc99)
+    %tmp5_48 = arith.addi %tmp5_45, %tmp5_47 : tensor<64x64xi32, #blocked1> loc(#loc99)
+    %tmp5_49 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>, #blocked1> loc(#loc100)
+    %tmp5_50 = tt.addptr %tmp5_49, %tmp5_48 : tensor<64x64x!tt.ptr<bf16>, #blocked1>, tensor<64x64xi32, #blocked1> loc(#loc100)
+    %tmp5_51 = tt.broadcast %tmp4_41 : tensor<64x1xi1, #blocked1> -> tensor<64x64xi1, #blocked1> loc(#loc101)
+    %tmp5_52 = tt.broadcast %tmp4_42 : tensor<64x1xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc101)
+    %tmp5_53 = tt.broadcast %xmask : tensor<1x64xi1, #blocked1> -> tensor<64x64xi1, #blocked1> loc(#loc101)
+    %tmp5_54 = tt.broadcast %xmask_37 : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc101)
+    %tmp5_55 = arith.andi %tmp5_51, %tmp5_53 : tensor<64x64xi1, #blocked1> loc(#loc101)
+    %tmp5_56 = arith.andi %tmp5_52, %tmp5_54 : tensor<64x64xi1, #blocked> loc(#loc101)
+    %tmp5_57 = tt.broadcast %ymask : tensor<64x1xi1, #blocked1> -> tensor<64x64xi1, #blocked1> loc(#loc102)
+    %tmp5_58 = tt.broadcast %ymask_28 : tensor<64x1xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc102)
+    %tmp5_59 = arith.andi %tmp5_55, %tmp5_57 : tensor<64x64xi1, #blocked1> loc(#loc102)
+    %tmp5_60 = arith.andi %tmp5_56, %tmp5_58 : tensor<64x64xi1, #blocked> loc(#loc102)
+    %tmp5_61 = tt.load %tmp5_50, %tmp5_59, %cst_11 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>, #blocked1> loc(#loc103)
+    %tmp5_62 = ttg.convert_layout %tmp5_61 : tensor<64x64xbf16, #blocked1> -> tensor<64x64xbf16, #blocked> loc(#loc104)
+    %tmp5_63 = arith.extf %tmp5_62 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc104)
+    %tmp7 = arith.muli %y1_38, %cst_5 : tensor<64x1xi32, #blocked> loc(#loc105)
+    %tmp7_64 = arith.addi %y0_39, %tmp7 : tensor<64x1xi32, #blocked> loc(#loc106)
+    %tmp7_65 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>, #blocked> loc(#loc107)
+    %tmp7_66 = tt.addptr %tmp7_65, %tmp7_64 : tensor<64x1x!tt.ptr<f32>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc107)
+    %tmp7_67 = tt.broadcast %tmp7_66 : tensor<64x1x!tt.ptr<f32>, #blocked> -> tensor<64x64x!tt.ptr<f32>, #blocked> loc(#loc107)
+    %tmp7_68 = tt.load %tmp7_67, %tmp5_60, %cst_12 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<f32>, #blocked> loc(#loc108)
+    %tmp9 = arith.divf %tmp7_68, %cst_14 : tensor<64x64xf32, #blocked> loc(#loc109)
+    %tmp11 = arith.addf %tmp9, %cst_13 : tensor<64x64xf32, #blocked> loc(#loc110)
+    %tmp12 = tt.extern_elementwise %tmp11 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x64xf32, #blocked>) -> tensor<64x64xf32, #blocked> loc(#loc111)
+    %tmp13 = arith.mulf %tmp5_63, %tmp12 : tensor<64x64xf32, #blocked> loc(#loc112)
+    %tmp13_69 = ttg.convert_layout %tmp13 : tensor<64x64xf32, #blocked> -> tensor<64x64xf32, #blocked1> loc(#loc112)
+    %tmp14 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>, #blocked1> loc(#loc113)
+    %tmp14_70 = tt.addptr %tmp14, %xindex_35 : tensor<1x64x!tt.ptr<bf16>, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc113)
+    %tmp14_71 = tt.broadcast %tmp14_70 : tensor<1x64x!tt.ptr<bf16>, #blocked1> -> tensor<64x64x!tt.ptr<bf16>, #blocked1> loc(#loc113)
+    %tmp14_72 = tt.load %tmp14_71, %tmp5_59, %cst_11 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>, #blocked1> loc(#loc114)
+    %tmp14_73 = arith.extf %tmp14_72 : tensor<64x64xbf16, #blocked1> to tensor<64x64xf32, #blocked1> loc(#loc115)
+    %tmp16 = arith.mulf %tmp13_69, %tmp14_73 : tensor<64x64xf32, #blocked1> loc(#loc116)
+    %tmp20 = arith.cmpi sge, %tmp4, %cst_4 : tensor<64x1xi64, #blocked1> loc(#loc117)
+    %tmp20_74 = arith.cmpi sge, %tmp4_40, %cst_3 : tensor<64x1xi64, #blocked> loc(#loc117)
+    %tmp23 = arith.addi %y1, %cst_0 : tensor<64x1xi32, #blocked1> loc(#loc118)
+    %tmp23_75 = arith.addi %y1_38, %cst : tensor<64x1xi32, #blocked> loc(#loc118)
+    %tmp23_76 = arith.muli %tmp23, %cst_1 : tensor<64x1xi32, #blocked1> loc(#loc119)
+    %tmp23_77 = tt.broadcast %tmp23_76 : tensor<64x1xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc120)
+    %tmp23_78 = arith.addi %tmp5_45, %tmp23_77 : tensor<64x64xi32, #blocked1> loc(#loc120)
+    %tmp23_79 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>, #blocked1> loc(#loc121)
+    %tmp23_80 = tt.addptr %tmp23_79, %tmp23_78 : tensor<64x64x!tt.ptr<bf16>, #blocked1>, tensor<64x64xi32, #blocked1> loc(#loc121)
+    %tmp23_81 = tt.broadcast %tmp20 : tensor<64x1xi1, #blocked1> -> tensor<64x64xi1, #blocked1> loc(#loc122)
+    %tmp23_82 = tt.broadcast %tmp20_74 : tensor<64x1xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc122)
+    %tmp23_83 = arith.andi %tmp23_81, %tmp5_53 : tensor<64x64xi1, #blocked1> loc(#loc122)
+    %tmp23_84 = arith.andi %tmp23_82, %tmp5_54 : tensor<64x64xi1, #blocked> loc(#loc122)
+    %tmp23_85 = arith.andi %tmp23_83, %tmp5_57 : tensor<64x64xi1, #blocked1> loc(#loc123)
+    %tmp23_86 = arith.andi %tmp23_84, %tmp5_58 : tensor<64x64xi1, #blocked> loc(#loc123)
+    %tmp23_87 = tt.load %tmp23_80, %tmp23_85, %cst_11 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>, #blocked1> loc(#loc124)
+    %tmp23_88 = ttg.convert_layout %tmp23_87 : tensor<64x64xbf16, #blocked1> -> tensor<64x64xbf16, #blocked> loc(#loc125)
+    %tmp23_89 = arith.extf %tmp23_88 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc125)
+    %tmp25 = arith.muli %tmp23_75, %cst_5 : tensor<64x1xi32, #blocked> loc(#loc126)
+    %tmp25_90 = arith.addi %y0_39, %tmp25 : tensor<64x1xi32, #blocked> loc(#loc127)
+    %tmp25_91 = tt.splat %in_ptr4 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>, #blocked> loc(#loc128)
+    %tmp25_92 = tt.addptr %tmp25_91, %tmp25_90 : tensor<64x1x!tt.ptr<f32>, #blocked>, tensor<64x1xi32, #blocked> loc(#loc128)
+    %tmp25_93 = tt.broadcast %tmp25_92 : tensor<64x1x!tt.ptr<f32>, #blocked> -> tensor<64x64x!tt.ptr<f32>, #blocked> loc(#loc128)
+    %tmp25_94 = tt.load %tmp25_93, %tmp23_86, %cst_12 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<f32>, #blocked> loc(#loc129)
+    %tmp27 = arith.divf %tmp25_94, %cst_14 : tensor<64x64xf32, #blocked> loc(#loc130)
+    %tmp29 = arith.addf %tmp27, %cst_13 : tensor<64x64xf32, #blocked> loc(#loc131)
+    %tmp30 = tt.extern_elementwise %tmp29 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x64xf32, #blocked>) -> tensor<64x64xf32, #blocked> loc(#loc132)
+    %tmp31 = arith.mulf %tmp23_89, %tmp30 : tensor<64x64xf32, #blocked> loc(#loc133)
+    %tmp31_95 = ttg.convert_layout %tmp31 : tensor<64x64xf32, #blocked> -> tensor<64x64xf32, #blocked1> loc(#loc133)
+    %tmp32 = tt.splat %in_ptr5 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>, #blocked1> loc(#loc134)
+    %tmp32_96 = tt.addptr %tmp32, %xindex_35 : tensor<1x64x!tt.ptr<bf16>, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc134)
+    %tmp32_97 = tt.broadcast %tmp32_96 : tensor<1x64x!tt.ptr<bf16>, #blocked1> -> tensor<64x64x!tt.ptr<bf16>, #blocked1> loc(#loc134)
+    %tmp32_98 = tt.load %tmp32_97, %tmp23_85, %cst_11 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>, #blocked1> loc(#loc135)
+    %tmp32_99 = arith.extf %tmp32_98 : tensor<64x64xbf16, #blocked1> to tensor<64x64xf32, #blocked1> loc(#loc136)
+    %tmp34 = arith.mulf %tmp31_95, %tmp32_99 : tensor<64x64xf32, #blocked1> loc(#loc137)
+    %tmp37 = arith.select %tmp23_81, %tmp34, %cst_15 : tensor<64x64xi1, #blocked1>, tensor<64x64xf32, #blocked1> loc(#loc138)
+    %tmp38 = arith.select %tmp5_51, %tmp16, %tmp37 : tensor<64x64xi1, #blocked1>, tensor<64x64xf32, #blocked1> loc(#loc141)
+    %0 = arith.muli %yindex_26, %cst_2 : tensor<64x1xi32, #blocked1> loc(#loc64)
+    %1 = tt.broadcast %0 : tensor<64x1xi32, #blocked1> -> tensor<64x64xi32, #blocked1> loc(#loc65)
+    %2 = arith.addi %tmp5_43, %1 : tensor<64x64xi32, #blocked1> loc(#loc65)
+    %3 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>, #blocked1> loc(#loc66)
+    %4 = tt.addptr %3, %2 : tensor<64x64x!tt.ptr<bf16>, #blocked1>, tensor<64x64xi32, #blocked1> loc(#loc66)
+    %5 = arith.andi %tmp5_53, %tmp5_57 : tensor<64x64xi1, #blocked1> loc(#loc67)
+    %6 = arith.truncf %tmp38 : tensor<64x64xf32, #blocked1> to tensor<64x64xbf16, #blocked1> loc(#loc68)
+    tt.store %4, %6, %5 : tensor<64x64x!tt.ptr<bf16>, #blocked1> loc(#loc68)
+    tt.return loc(#loc69)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4)
+#loc79 = loc("yoffset"(#loc2))
+#loc80 = loc("yoffset"(#loc3))
+#loc81 = loc("yoffset"(#loc4))
+#loc82 = loc("yoffset"(#loc5))
+#loc83 = loc("yoffset"(#loc6))
+#loc84 = loc("yoffset"(#loc7))
+#loc85 = loc("yindex"(#loc8))
+#loc86 = loc("yindex"(#loc9))
+#loc87 = loc("ymask"(#loc10))
+#loc88 = loc("xoffset"(#loc11))
+#loc89 = loc("xoffset"(#loc12))
+#loc90 = loc("xindex"(#loc13))
+#loc91 = loc("xindex"(#loc14))
+#loc92 = loc("xmask"(#loc15))
+#loc93 = loc("y1"(#loc16))
+#loc94 = loc("y0"(#loc17))
+#loc95 = loc("tmp4"(#loc18))
+#loc96 = loc("tmp5"(#loc19))
+#loc97 = loc("tmp5"(#loc20))
+#loc98 = loc("tmp5"(#loc21))
+#loc99 = loc("tmp5"(#loc22))
+#loc100 = loc("tmp5"(#loc23))
+#loc101 = loc("tmp5"(#loc24))
+#loc102 = loc("tmp5"(#loc25))
+#loc103 = loc("tmp5"(#loc26))
+#loc104 = loc("tmp5"(#loc27))
+#loc105 = loc("tmp7"(#loc28))
+#loc106 = loc("tmp7"(#loc29))
+#loc107 = loc("tmp7"(#loc30))
+#loc108 = loc("tmp7"(#loc31))
+#loc109 = loc("tmp9"(#loc32))
+#loc110 = loc("tmp11"(#loc33))
+#loc111 = loc("tmp12"(#loc34))
+#loc112 = loc("tmp13"(#loc35))
+#loc113 = loc("tmp14"(#loc36))
+#loc114 = loc("tmp14"(#loc37))
+#loc115 = loc("tmp14"(#loc38))
+#loc116 = loc("tmp16"(#loc39))
+#loc117 = loc("tmp20"(#loc40))
+#loc118 = loc("tmp23"(#loc41))
+#loc119 = loc("tmp23"(#loc42))
+#loc120 = loc("tmp23"(#loc43))
+#loc121 = loc("tmp23"(#loc44))
+#loc122 = loc("tmp23"(#loc45))
+#loc123 = loc("tmp23"(#loc46))
+#loc124 = loc("tmp23"(#loc47))
+#loc125 = loc("tmp23"(#loc48))
+#loc126 = loc("tmp25"(#loc49))
+#loc127 = loc("tmp25"(#loc50))
+#loc128 = loc("tmp25"(#loc51))
+#loc129 = loc("tmp25"(#loc52))
+#loc130 = loc("tmp27"(#loc53))
+#loc131 = loc("tmp29"(#loc54))
+#loc132 = loc("tmp30"(#loc55))
+#loc133 = loc("tmp31"(#loc56))
+#loc134 = loc("tmp32"(#loc57))
+#loc135 = loc("tmp32"(#loc58))
+#loc136 = loc("tmp32"(#loc59))
+#loc137 = loc("tmp34"(#loc60))
+#loc138 = loc("tmp37"(#loc61))
+#loc139 = loc("tmp38"(#loc62))
+#loc140 = loc("tmp19"(#loc63))
+#loc141 = loc(fused[#loc139, #loc140])
diff --git a/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.ttir b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..cac973030c96375fbf414a39a695eb455cb30e8c
--- /dev/null
+++ b/triton/RNNMPWWZPRYLZDDP3QNL7R5SV7EYTG7WXIUJKWKAEGE4BUI424IA/triton_poi_fused__fused_rms_norm_cat_view_2.ttir
@@ -0,0 +1,252 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0)
+#loc71 = loc("in_ptr0"(#loc))
+#loc72 = loc("in_ptr1"(#loc))
+#loc73 = loc("in_ptr2"(#loc))
+#loc74 = loc("in_ptr3"(#loc))
+#loc75 = loc("in_ptr4"(#loc))
+#loc76 = loc("in_ptr5"(#loc))
+#loc77 = loc("out_ptr0"(#loc))
+#loc78 = loc("ynumel"(#loc))
+#loc79 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<64x64xbf16> loc(#loc1)
+    %cst_0 = arith.constant dense<-256> : tensor<64x1xi32> loc(#loc1)
+    %cst_1 = arith.constant dense<9.99999997E-7> : tensor<64x64xf32> loc(#loc1)
+    %cst_2 = arith.constant dense<1.280000e+02> : tensor<64x64xf32> loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<12288> : tensor<64x1xi32> loc(#loc1)
+    %cst_5 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1)
+    %cst_6 = arith.constant dense<256> : tensor<64x1xi64> loc(#loc1)
+    %cst_7 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc1)
+    %xmask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc80)
+    %ymask = arith.constant dense<73728> : tensor<64x1xi32> loc(#loc81)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %yoffset = tt.get_program_id y : i32 loc(#loc82)
+    %yoffset_8 = tt.get_program_id z : i32 loc(#loc83)
+    %yoffset_9 = tt.get_num_programs y : i32 loc(#loc84)
+    %yoffset_10 = arith.muli %yoffset_8, %yoffset_9 : i32 loc(#loc85)
+    %yoffset_11 = arith.addi %yoffset, %yoffset_10 : i32 loc(#loc86)
+    %yoffset_12 = arith.muli %yoffset_11, %c64_i32 : i32 loc(#loc87)
+    %yindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc88)
+    %yindex_13 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc89)
+    %yindex_14 = tt.splat %yoffset_12 : i32 -> tensor<64x1xi32> loc(#loc90)
+    %yindex_15 = arith.addi %yindex_14, %yindex_13 : tensor<64x1xi32> loc(#loc90)
+    %ymask_16 = arith.cmpi slt, %yindex_15, %ymask : tensor<64x1xi32> loc(#loc81)
+    %xoffset = tt.get_program_id x : i32 loc(#loc91)
+    %xoffset_17 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc92)
+    %xindex = tt.expand_dims %yindex {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc93)
+    %xindex_18 = tt.splat %xoffset_17 : i32 -> tensor<1x64xi32> loc(#loc94)
+    %xindex_19 = arith.addi %xindex_18, %xindex : tensor<1x64xi32> loc(#loc94)
+    %xmask_20 = arith.cmpi slt, %xindex_19, %xmask : tensor<1x64xi32> loc(#loc80)
+    %y1 = arith.divsi %yindex_15, %cst_7 : tensor<64x1xi32> loc(#loc95)
+    %y0 = arith.remsi %yindex_15, %cst_7 : tensor<64x1xi32> loc(#loc96)
+    %tmp4 = arith.extsi %y1 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc97)
+    %tmp4_21 = arith.cmpi slt, %tmp4, %cst_6 : tensor<64x1xi64> loc(#loc97)
+    %tmp5 = arith.muli %y0, %cst_5 : tensor<64x1xi32> loc(#loc98)
+    %tmp5_22 = tt.broadcast %xindex_19 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc99)
+    %tmp5_23 = tt.broadcast %tmp5 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc99)
+    %tmp5_24 = arith.addi %tmp5_22, %tmp5_23 : tensor<64x64xi32> loc(#loc99)
+    %tmp5_25 = arith.muli %y1, %cst_4 : tensor<64x1xi32> loc(#loc100)
+    %tmp5_26 = tt.broadcast %tmp5_25 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc101)
+    %tmp5_27 = arith.addi %tmp5_24, %tmp5_26 : tensor<64x64xi32> loc(#loc101)
+    %tmp5_28 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc102)
+    %tmp5_29 = tt.addptr %tmp5_28, %tmp5_27 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc102)
+    %tmp5_30 = tt.broadcast %tmp4_21 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc103)
+    %tmp5_31 = tt.broadcast %xmask_20 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc103)
+    %tmp5_32 = arith.andi %tmp5_30, %tmp5_31 : tensor<64x64xi1> loc(#loc103)
+    %tmp5_33 = tt.broadcast %ymask_16 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc104)
+    %tmp5_34 = arith.andi %tmp5_32, %tmp5_33 : tensor<64x64xi1> loc(#loc104)
+    %tmp5_35 = tt.load %tmp5_29, %tmp5_34, %cst evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc105)
+    %tmp5_36 = arith.extf %tmp5_35 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc106)
+    %tmp7 = arith.muli %y1, %cst_7 : tensor<64x1xi32> loc(#loc107)
+    %tmp7_37 = arith.addi %y0, %tmp7 : tensor<64x1xi32> loc(#loc108)
+    %tmp7_38 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc109)
+    %tmp7_39 = tt.addptr %tmp7_38, %tmp7_37 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc109)
+    %tmp7_40 = tt.broadcast %tmp7_39 : tensor<64x1x!tt.ptr<f32>> -> tensor<64x64x!tt.ptr<f32>> loc(#loc109)
+    %tmp7_41 = tt.load %tmp7_40, %tmp5_34, %cst_3 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<f32>> loc(#loc110)
+    %tmp9 = arith.divf %tmp7_41, %cst_2 : tensor<64x64xf32> loc(#loc111)
+    %tmp11 = arith.addf %tmp9, %cst_1 : tensor<64x64xf32> loc(#loc112)
+    %tmp12 = tt.extern_elementwise %tmp11 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x64xf32>) -> tensor<64x64xf32> loc(#loc113)
+    %tmp13 = arith.mulf %tmp5_36, %tmp12 : tensor<64x64xf32> loc(#loc114)
+    %tmp14 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>> loc(#loc115)
+    %tmp14_42 = tt.addptr %tmp14, %xindex_19 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc115)
+    %tmp14_43 = tt.broadcast %tmp14_42 : tensor<1x64x!tt.ptr<bf16>> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc115)
+    %tmp14_44 = tt.load %tmp14_43, %tmp5_34, %cst evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc116)
+    %tmp14_45 = arith.extf %tmp14_44 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc117)
+    %tmp16 = arith.mulf %tmp13, %tmp14_45 : tensor<64x64xf32> loc(#loc118)
+    %tmp19 = arith.select %tmp5_30, %tmp16, %cst_3 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc119)
+    %tmp20 = arith.cmpi sge, %tmp4, %cst_6 : tensor<64x1xi64> loc(#loc120)
+    %tmp23 = arith.addi %y1, %cst_0 : tensor<64x1xi32> loc(#loc121)
+    %tmp23_46 = arith.muli %tmp23, %cst_4 : tensor<64x1xi32> loc(#loc122)
+    %tmp23_47 = tt.broadcast %tmp23_46 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc123)
+    %tmp23_48 = arith.addi %tmp5_24, %tmp23_47 : tensor<64x64xi32> loc(#loc123)
+    %tmp23_49 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc124)
+    %tmp23_50 = tt.addptr %tmp23_49, %tmp23_48 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc124)
+    %tmp23_51 = tt.broadcast %tmp20 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc125)
+    %tmp23_52 = arith.andi %tmp23_51, %tmp5_31 : tensor<64x64xi1> loc(#loc125)
+    %tmp23_53 = arith.andi %tmp23_52, %tmp5_33 : tensor<64x64xi1> loc(#loc126)
+    %tmp23_54 = tt.load %tmp23_50, %tmp23_53, %cst evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc127)
+    %tmp23_55 = arith.extf %tmp23_54 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc128)
+    %tmp25 = arith.muli %tmp23, %cst_7 : tensor<64x1xi32> loc(#loc129)
+    %tmp25_56 = arith.addi %y0, %tmp25 : tensor<64x1xi32> loc(#loc130)
+    %tmp25_57 = tt.splat %in_ptr4 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>> loc(#loc131)
+    %tmp25_58 = tt.addptr %tmp25_57, %tmp25_56 : tensor<64x1x!tt.ptr<f32>>, tensor<64x1xi32> loc(#loc131)
+    %tmp25_59 = tt.broadcast %tmp25_58 : tensor<64x1x!tt.ptr<f32>> -> tensor<64x64x!tt.ptr<f32>> loc(#loc131)
+    %tmp25_60 = tt.load %tmp25_59, %tmp23_53, %cst_3 evictionPolicy = evict_last : tensor<64x64x!tt.ptr<f32>> loc(#loc132)
+    %tmp27 = arith.divf %tmp25_60, %cst_2 : tensor<64x64xf32> loc(#loc133)
+    %tmp29 = arith.addf %tmp27, %cst_1 : tensor<64x64xf32> loc(#loc134)
+    %tmp30 = tt.extern_elementwise %tmp29 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x64xf32>) -> tensor<64x64xf32> loc(#loc135)
+    %tmp31 = arith.mulf %tmp23_55, %tmp30 : tensor<64x64xf32> loc(#loc136)
+    %tmp32 = tt.splat %in_ptr5 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>> loc(#loc137)
+    %tmp32_61 = tt.addptr %tmp32, %xindex_19 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc137)
+    %tmp32_62 = tt.broadcast %tmp32_61 : tensor<1x64x!tt.ptr<bf16>> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc137)
+    %tmp32_63 = tt.load %tmp32_62, %tmp23_53, %cst evictionPolicy = evict_last : tensor<64x64x!tt.ptr<bf16>> loc(#loc138)
+    %tmp32_64 = arith.extf %tmp32_63 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc139)
+    %tmp34 = arith.mulf %tmp31, %tmp32_64 : tensor<64x64xf32> loc(#loc140)
+    %tmp37 = arith.select %tmp23_51, %tmp34, %cst_3 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc141)
+    %tmp38 = arith.select %tmp5_30, %tmp19, %tmp37 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc142)
+    %0 = arith.muli %yindex_15, %cst_5 : tensor<64x1xi32> loc(#loc65)
+    %1 = tt.broadcast %0 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc66)
+    %2 = arith.addi %tmp5_22, %1 : tensor<64x64xi32> loc(#loc66)
+    %3 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<64x64x!tt.ptr<bf16>> loc(#loc67)
+    %4 = tt.addptr %3, %2 : tensor<64x64x!tt.ptr<bf16>>, tensor<64x64xi32> loc(#loc67)
+    %5 = arith.andi %tmp5_31, %tmp5_33 : tensor<64x64xi1> loc(#loc68)
+    %6 = arith.truncf %tmp38 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc69)
+    tt.store %4, %6, %5 : tensor<64x64x!tt.ptr<bf16>> loc(#loc69)
+    tt.return loc(#loc70)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:36)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4)
+#loc80 = loc("xmask"(#loc2))
+#loc81 = loc("ymask"(#loc3))
+#loc82 = loc("yoffset"(#loc4))
+#loc83 = loc("yoffset"(#loc5))
+#loc84 = loc("yoffset"(#loc6))
+#loc85 = loc("yoffset"(#loc7))
+#loc86 = loc("yoffset"(#loc8))
+#loc87 = loc("yoffset"(#loc9))
+#loc88 = loc("yindex"(#loc10))
+#loc89 = loc("yindex"(#loc11))
+#loc90 = loc("yindex"(#loc12))
+#loc91 = loc("xoffset"(#loc13))
+#loc92 = loc("xoffset"(#loc14))
+#loc93 = loc("xindex"(#loc15))
+#loc94 = loc("xindex"(#loc16))
+#loc95 = loc("y1"(#loc17))
+#loc96 = loc("y0"(#loc18))
+#loc97 = loc("tmp4"(#loc19))
+#loc98 = loc("tmp5"(#loc20))
+#loc99 = loc("tmp5"(#loc21))
+#loc100 = loc("tmp5"(#loc22))
+#loc101 = loc("tmp5"(#loc23))
+#loc102 = loc("tmp5"(#loc24))
+#loc103 = loc("tmp5"(#loc25))
+#loc104 = loc("tmp5"(#loc26))
+#loc105 = loc("tmp5"(#loc27))
+#loc106 = loc("tmp5"(#loc28))
+#loc107 = loc("tmp7"(#loc29))
+#loc108 = loc("tmp7"(#loc30))
+#loc109 = loc("tmp7"(#loc31))
+#loc110 = loc("tmp7"(#loc32))
+#loc111 = loc("tmp9"(#loc33))
+#loc112 = loc("tmp11"(#loc34))
+#loc113 = loc("tmp12"(#loc35))
+#loc114 = loc("tmp13"(#loc36))
+#loc115 = loc("tmp14"(#loc37))
+#loc116 = loc("tmp14"(#loc38))
+#loc117 = loc("tmp14"(#loc39))
+#loc118 = loc("tmp16"(#loc40))
+#loc119 = loc("tmp19"(#loc41))
+#loc120 = loc("tmp20"(#loc42))
+#loc121 = loc("tmp23"(#loc43))
+#loc122 = loc("tmp23"(#loc44))
+#loc123 = loc("tmp23"(#loc45))
+#loc124 = loc("tmp23"(#loc46))
+#loc125 = loc("tmp23"(#loc47))
+#loc126 = loc("tmp23"(#loc48))
+#loc127 = loc("tmp23"(#loc49))
+#loc128 = loc("tmp23"(#loc50))
+#loc129 = loc("tmp25"(#loc51))
+#loc130 = loc("tmp25"(#loc52))
+#loc131 = loc("tmp25"(#loc53))
+#loc132 = loc("tmp25"(#loc54))
+#loc133 = loc("tmp27"(#loc55))
+#loc134 = loc("tmp29"(#loc56))
+#loc135 = loc("tmp30"(#loc57))
+#loc136 = loc("tmp31"(#loc58))
+#loc137 = loc("tmp32"(#loc59))
+#loc138 = loc("tmp32"(#loc60))
+#loc139 = loc("tmp32"(#loc61))
+#loc140 = loc("tmp34"(#loc62))
+#loc141 = loc("tmp37"(#loc63))
+#loc142 = loc("tmp38"(#loc64))
diff --git a/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..edd2db01880eb9998e9b112829861fe28f144dbf
--- /dev/null
+++ b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/__grp__triton_red_fused_add_mul_native_layer_norm_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused_add_mul_native_layer_norm_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.source", "triton_red_fused_add_mul_native_layer_norm_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.ttir", "triton_red_fused_add_mul_native_layer_norm_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir", "triton_red_fused_add_mul_native_layer_norm_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.llir", "triton_red_fused_add_mul_native_layer_norm_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.ptx", "triton_red_fused_add_mul_native_layer_norm_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.cubin", "triton_red_fused_add_mul_native_layer_norm_0.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.json"}}
\ No newline at end of file
diff --git a/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.cubin b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..961c6a7bdbc02e331d11ca186da91efe70a1fc3d
Binary files /dev/null and b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.cubin differ
diff --git a/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.json b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..e6e70c01ff7d9e4072991fcd6ad9e922ea68a17f
--- /dev/null
+++ b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.json
@@ -0,0 +1 @@
+{"hash": "9773cffdbdd0092f04505c242815e8708ad1ef44dbf578a3ce90af91077a0ba5", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 192, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_add_mul_native_layer_norm_0"}
\ No newline at end of file
diff --git a/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.llir b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..bf1cf2cba787c5bc98d995d084f32746a623f725
--- /dev/null
+++ b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.llir
@@ -0,0 +1,565 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external addrspace(3) global [0 x i8], align 16
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused_add_mul_native_layer_norm_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !5 {
+__nv_rsqrtf.exit:
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8
+  %9 = icmp samesign ult i32 %8, 2304, !dbg !9
+  %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %11 = shl nuw nsw i32 %10, 2, !dbg !10
+  %12 = and i32 %11, 2044, !dbg !10
+  %13 = shl i32 %8, 12, !dbg !11
+  %14 = or disjoint i32 %12, %13
+  %15 = sext i32 %14 to i64, !dbg !12
+  %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !13
+  %17 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14
+  %18 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %16, i64 %17, i1 %9) #6, !dbg !14
+  %19 = extractvalue { i32, i32 } %18, 1, !dbg !14
+  %20 = bitcast i32 %19 to <2 x bfloat>, !dbg !14
+  %21 = extractelement <2 x bfloat> %20, i64 1, !dbg !14
+  %22 = fpext bfloat %21 to float, !dbg !15
+  %23 = extractelement <2 x bfloat> %20, i64 0, !dbg !14
+  %24 = fpext bfloat %23 to float, !dbg !15
+  %25 = extractvalue { i32, i32 } %18, 0, !dbg !14
+  %26 = bitcast i32 %25 to <2 x bfloat>, !dbg !14
+  %27 = extractelement <2 x bfloat> %26, i64 1, !dbg !14
+  %28 = fpext bfloat %27 to float, !dbg !15
+  %29 = extractelement <2 x bfloat> %26, i64 0, !dbg !14
+  %30 = fpext bfloat %29 to float, !dbg !15
+  %31 = select i1 %9, float %30, float 0.000000e+00, !dbg !16
+  %32 = select i1 %9, float %28, float 0.000000e+00, !dbg !16
+  %33 = select i1 %9, float %24, float 0.000000e+00, !dbg !16
+  %34 = select i1 %9, float %22, float 0.000000e+00, !dbg !16
+  %35 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !13
+  %36 = getelementptr i8, ptr addrspace(1) %35, i64 4096, !dbg !13
+  %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14
+  %38 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %36, i64 %37, i1 %9) #6, !dbg !14
+  %39 = extractvalue { i32, i32 } %38, 0, !dbg !14
+  %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !14
+  %41 = extractelement <2 x bfloat> %40, i64 0, !dbg !14
+  %42 = fpext bfloat %41 to float, !dbg !15
+  %43 = fsub float %42, %31, !dbg !17
+  %44 = select i1 %9, float 2.000000e+00, float 1.000000e+00, !dbg !22
+  %45 = tail call float @llvm.nvvm.div.full(float %43, float %44), !dbg !23
+  %46 = fadd float %31, %45, !dbg !24
+  %47 = fsub float %42, %46, !dbg !25
+  %48 = fmul float %43, %47, !dbg !26
+  %49 = fadd float %48, 0.000000e+00, !dbg !27
+  %50 = extractelement <2 x bfloat> %40, i64 1, !dbg !14
+  %51 = fpext bfloat %50 to float, !dbg !15
+  %52 = fsub float %51, %32, !dbg !17
+  %53 = tail call float @llvm.nvvm.div.full(float %52, float %44), !dbg !23
+  %54 = fadd float %32, %53, !dbg !24
+  %55 = fsub float %51, %54, !dbg !25
+  %56 = fmul float %52, %55, !dbg !26
+  %57 = fadd float %56, 0.000000e+00, !dbg !27
+  %58 = extractvalue { i32, i32 } %38, 1, !dbg !14
+  %59 = bitcast i32 %58 to <2 x bfloat>, !dbg !14
+  %60 = extractelement <2 x bfloat> %59, i64 0, !dbg !14
+  %61 = fpext bfloat %60 to float, !dbg !15
+  %62 = fsub float %61, %33, !dbg !17
+  %63 = tail call float @llvm.nvvm.div.full(float %62, float %44), !dbg !23
+  %64 = fadd float %33, %63, !dbg !24
+  %65 = fsub float %61, %64, !dbg !25
+  %66 = fmul float %62, %65, !dbg !26
+  %67 = fadd float %66, 0.000000e+00, !dbg !27
+  %68 = extractelement <2 x bfloat> %59, i64 1, !dbg !14
+  %69 = fpext bfloat %68 to float, !dbg !15
+  %70 = fsub float %69, %34, !dbg !17
+  %71 = tail call float @llvm.nvvm.div.full(float %70, float %44), !dbg !23
+  %72 = fadd float %34, %71, !dbg !24
+  %73 = fsub float %69, %72, !dbg !25
+  %74 = fmul float %70, %73, !dbg !26
+  %75 = fadd float %74, 0.000000e+00, !dbg !27
+  %76 = select i1 %9, float %46, float 0.000000e+00, !dbg !16
+  %77 = select i1 %9, float %54, float 0.000000e+00, !dbg !16
+  %78 = select i1 %9, float %64, float 0.000000e+00, !dbg !16
+  %79 = select i1 %9, float %72, float 0.000000e+00, !dbg !16
+  %80 = select i1 %9, float %67, float 0.000000e+00, !dbg !28
+  %81 = select i1 %9, float %75, float 0.000000e+00, !dbg !28
+  %82 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !22
+  %83 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !22
+  %84 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !22
+  %85 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !22
+  %86 = and i32 %10, 511, !dbg !10
+  %87 = and i32 %10, 31, !dbg !10
+  %88 = lshr i32 %86, 5, !dbg !10
+  %89 = fsub float %77, %76, !dbg !29
+  %90 = select i1 %9, float 4.000000e+00, float 0.000000e+00, !dbg !32
+  %91 = fcmp oeq float %90, 0.000000e+00, !dbg !33
+  %92 = tail call float @llvm.nvvm.div.full(float %83, float %90), !dbg !34
+  %93 = select i1 %91, float 0.000000e+00, float %92, !dbg !35
+  %94 = fmul float %89, %93, !dbg !36
+  %95 = fadd float %76, %94, !dbg !37
+  %96 = fadd float %49, %57, !dbg !38
+  %97 = select i1 %9, float %96, float 0.000000e+00, !dbg !38
+  %98 = fmul float %89, %89, !dbg !39
+  %99 = fmul float %98, %82, !dbg !40
+  %100 = fmul float %99, %93, !dbg !41
+  %101 = fadd float %97, %100, !dbg !42
+  %102 = fsub float %78, %95, !dbg !29
+  %103 = select i1 %9, float 6.000000e+00, float 0.000000e+00, !dbg !32
+  %104 = fcmp oeq float %103, 0.000000e+00, !dbg !33
+  %105 = tail call float @llvm.nvvm.div.full(float %84, float %103), !dbg !34
+  %106 = select i1 %104, float 0.000000e+00, float %105, !dbg !35
+  %107 = fmul float %106, %102, !dbg !36
+  %108 = fadd float %95, %107, !dbg !37
+  %109 = fadd float %80, %101, !dbg !38
+  %110 = fmul float %102, %102, !dbg !39
+  %111 = fmul float %90, %110, !dbg !40
+  %112 = fmul float %106, %111, !dbg !41
+  %113 = fadd float %109, %112, !dbg !42
+  %114 = fsub float %79, %108, !dbg !29
+  %115 = select i1 %9, float 8.000000e+00, float 0.000000e+00, !dbg !32
+  %116 = fcmp oeq float %115, 0.000000e+00, !dbg !33
+  %117 = tail call float @llvm.nvvm.div.full(float %85, float %115), !dbg !34
+  %118 = select i1 %116, float 0.000000e+00, float %117, !dbg !35
+  %119 = fmul float %118, %114, !dbg !36
+  %120 = fadd float %108, %119, !dbg !37
+  %121 = fadd float %81, %113, !dbg !38
+  %122 = fmul float %114, %114, !dbg !39
+  %123 = fmul float %103, %122, !dbg !40
+  %124 = fmul float %118, %123, !dbg !41
+  %125 = fadd float %121, %124, !dbg !42
+  %126 = bitcast float %120 to i32, !dbg !30
+  %127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 16, i32 31), !dbg !30
+  %128 = bitcast i32 %127 to float, !dbg !30
+  %129 = bitcast float %125 to i32, !dbg !30
+  %130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 16, i32 31), !dbg !30
+  %131 = bitcast i32 %130 to float, !dbg !30
+  %132 = bitcast float %115 to i32, !dbg !30
+  %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 16, i32 31), !dbg !30
+  %134 = bitcast i32 %133 to float, !dbg !30
+  %135 = fsub float %128, %120, !dbg !29
+  %136 = fadd float %115, %134, !dbg !32
+  %137 = fcmp oeq float %136, 0.000000e+00, !dbg !33
+  %138 = tail call float @llvm.nvvm.div.full(float %134, float %136), !dbg !34
+  %139 = select i1 %137, float 0.000000e+00, float %138, !dbg !35
+  %140 = fmul float %139, %135, !dbg !36
+  %141 = fadd float %120, %140, !dbg !37
+  %142 = fadd float %125, %131, !dbg !38
+  %143 = fmul float %135, %135, !dbg !39
+  %144 = fmul float %115, %143, !dbg !40
+  %145 = fmul float %139, %144, !dbg !41
+  %146 = fadd float %142, %145, !dbg !42
+  %147 = bitcast float %141 to i32, !dbg !30
+  %148 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %147, i32 8, i32 31), !dbg !30
+  %149 = bitcast i32 %148 to float, !dbg !30
+  %150 = bitcast float %146 to i32, !dbg !30
+  %151 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %150, i32 8, i32 31), !dbg !30
+  %152 = bitcast i32 %151 to float, !dbg !30
+  %153 = bitcast float %136 to i32, !dbg !30
+  %154 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 8, i32 31), !dbg !30
+  %155 = bitcast i32 %154 to float, !dbg !30
+  %156 = fsub float %149, %141, !dbg !29
+  %157 = fadd float %136, %155, !dbg !32
+  %158 = fcmp oeq float %157, 0.000000e+00, !dbg !33
+  %159 = tail call float @llvm.nvvm.div.full(float %155, float %157), !dbg !34
+  %160 = select i1 %158, float 0.000000e+00, float %159, !dbg !35
+  %161 = fmul float %156, %160, !dbg !36
+  %162 = fadd float %141, %161, !dbg !37
+  %163 = fadd float %146, %152, !dbg !38
+  %164 = fmul float %156, %156, !dbg !39
+  %165 = fmul float %136, %164, !dbg !40
+  %166 = fmul float %160, %165, !dbg !41
+  %167 = fadd float %163, %166, !dbg !42
+  %168 = bitcast float %162 to i32, !dbg !30
+  %169 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %168, i32 4, i32 31), !dbg !30
+  %170 = bitcast i32 %169 to float, !dbg !30
+  %171 = bitcast float %167 to i32, !dbg !30
+  %172 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %171, i32 4, i32 31), !dbg !30
+  %173 = bitcast i32 %172 to float, !dbg !30
+  %174 = bitcast float %157 to i32, !dbg !30
+  %175 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %174, i32 4, i32 31), !dbg !30
+  %176 = bitcast i32 %175 to float, !dbg !30
+  %177 = fsub float %170, %162, !dbg !29
+  %178 = fadd float %157, %176, !dbg !32
+  %179 = fcmp oeq float %178, 0.000000e+00, !dbg !33
+  %180 = tail call float @llvm.nvvm.div.full(float %176, float %178), !dbg !34
+  %181 = select i1 %179, float 0.000000e+00, float %180, !dbg !35
+  %182 = fmul float %177, %181, !dbg !36
+  %183 = fadd float %162, %182, !dbg !37
+  %184 = fadd float %167, %173, !dbg !38
+  %185 = fmul float %177, %177, !dbg !39
+  %186 = fmul float %157, %185, !dbg !40
+  %187 = fmul float %181, %186, !dbg !41
+  %188 = fadd float %184, %187, !dbg !42
+  %189 = bitcast float %183 to i32, !dbg !30
+  %190 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %189, i32 2, i32 31), !dbg !30
+  %191 = bitcast i32 %190 to float, !dbg !30
+  %192 = bitcast float %188 to i32, !dbg !30
+  %193 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %192, i32 2, i32 31), !dbg !30
+  %194 = bitcast i32 %193 to float, !dbg !30
+  %195 = bitcast float %178 to i32, !dbg !30
+  %196 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %195, i32 2, i32 31), !dbg !30
+  %197 = bitcast i32 %196 to float, !dbg !30
+  %198 = fsub float %191, %183, !dbg !29
+  %199 = fadd float %178, %197, !dbg !32
+  %200 = fcmp oeq float %199, 0.000000e+00, !dbg !33
+  %201 = tail call float @llvm.nvvm.div.full(float %197, float %199), !dbg !34
+  %202 = select i1 %200, float 0.000000e+00, float %201, !dbg !35
+  %203 = fmul float %198, %202, !dbg !36
+  %204 = fadd float %183, %203, !dbg !37
+  %205 = fadd float %188, %194, !dbg !38
+  %206 = fmul float %198, %198, !dbg !39
+  %207 = fmul float %178, %206, !dbg !40
+  %208 = fmul float %202, %207, !dbg !41
+  %209 = fadd float %205, %208, !dbg !42
+  %210 = bitcast float %204 to i32, !dbg !30
+  %211 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %210, i32 1, i32 31), !dbg !30
+  %212 = bitcast i32 %211 to float, !dbg !30
+  %213 = bitcast float %209 to i32, !dbg !30
+  %214 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %213, i32 1, i32 31), !dbg !30
+  %215 = bitcast i32 %214 to float, !dbg !30
+  %216 = bitcast float %199 to i32, !dbg !30
+  %217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 1, i32 31), !dbg !30
+  %218 = bitcast i32 %217 to float, !dbg !30
+  %219 = fsub float %212, %204, !dbg !29
+  %220 = fadd float %199, %218, !dbg !32
+  %221 = fcmp oeq float %220, 0.000000e+00, !dbg !33
+  %222 = tail call float @llvm.nvvm.div.full(float %218, float %220), !dbg !34
+  %223 = select i1 %221, float 0.000000e+00, float %222, !dbg !35
+  %224 = fmul float %219, %223, !dbg !36
+  %225 = fadd float %204, %224, !dbg !37
+  %226 = fadd float %209, %215, !dbg !38
+  %227 = fmul float %219, %219, !dbg !39
+  %228 = fmul float %199, %227, !dbg !40
+  %229 = fmul float %223, %228, !dbg !41
+  %230 = fadd float %226, %229, !dbg !42
+  %231 = icmp eq i32 %87, 0, !dbg !30
+  %232 = getelementptr float, ptr addrspace(3) @global_smem, i32 %88, !dbg !30
+  %233 = bitcast float %225 to <1 x i32>, !dbg !30
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %232, <1 x i32> %233, i1 %231) #6, !dbg !30
+  %234 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %88, !dbg !30
+  %235 = bitcast float %230 to <1 x i32>, !dbg !30
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %234, <1 x i32> %235, i1 %231) #6, !dbg !30
+  %236 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %88, !dbg !30
+  %237 = bitcast float %220 to <1 x i32>, !dbg !30
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %236, <1 x i32> %237, i1 %231) #6, !dbg !30
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !30
+  %238 = icmp samesign ult i32 %86, 16, !dbg !30
+  %239 = getelementptr float, ptr addrspace(3) @global_smem, i32 %86, !dbg !30
+  %240 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %239, i1 %238) #6, !dbg !30
+  %241 = bitcast i32 %240 to float, !dbg !30
+  %242 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %86, !dbg !30
+  %243 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %242, i1 %238) #6, !dbg !30
+  %244 = bitcast i32 %243 to float, !dbg !30
+  %245 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %86, !dbg !30
+  %246 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %245, i1 %238) #6, !dbg !30
+  %247 = bitcast i32 %246 to float, !dbg !30
+  %248 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %240, i32 8, i32 31), !dbg !30
+  %249 = bitcast i32 %248 to float, !dbg !30
+  %250 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %243, i32 8, i32 31), !dbg !30
+  %251 = bitcast i32 %250 to float, !dbg !30
+  %252 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %246, i32 8, i32 31), !dbg !30
+  %253 = bitcast i32 %252 to float, !dbg !30
+  %254 = fsub float %249, %241, !dbg !29
+  %255 = fadd float %247, %253, !dbg !32
+  %256 = fcmp oeq float %255, 0.000000e+00, !dbg !33
+  %257 = tail call float @llvm.nvvm.div.full(float %253, float %255), !dbg !34
+  %258 = select i1 %256, float 0.000000e+00, float %257, !dbg !35
+  %259 = fmul float %254, %258, !dbg !36
+  %260 = fadd float %259, %241, !dbg !37
+  %261 = fadd float %244, %251, !dbg !38
+  %262 = fmul float %254, %254, !dbg !39
+  %263 = fmul float %262, %247, !dbg !40
+  %264 = fmul float %263, %258, !dbg !41
+  %265 = fadd float %261, %264, !dbg !42
+  %266 = bitcast float %260 to i32, !dbg !30
+  %267 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %266, i32 4, i32 31), !dbg !30
+  %268 = bitcast i32 %267 to float, !dbg !30
+  %269 = bitcast float %265 to i32, !dbg !30
+  %270 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %269, i32 4, i32 31), !dbg !30
+  %271 = bitcast i32 %270 to float, !dbg !30
+  %272 = bitcast float %255 to i32, !dbg !30
+  %273 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 4, i32 31), !dbg !30
+  %274 = bitcast i32 %273 to float, !dbg !30
+  %275 = fsub float %268, %260, !dbg !29
+  %276 = fadd float %255, %274, !dbg !32
+  %277 = fcmp oeq float %276, 0.000000e+00, !dbg !33
+  %278 = tail call float @llvm.nvvm.div.full(float %274, float %276), !dbg !34
+  %279 = select i1 %277, float 0.000000e+00, float %278, !dbg !35
+  %280 = fmul float %275, %279, !dbg !36
+  %281 = fadd float %260, %280, !dbg !37
+  %282 = fadd float %265, %271, !dbg !38
+  %283 = fmul float %275, %275, !dbg !39
+  %284 = fmul float %255, %283, !dbg !40
+  %285 = fmul float %279, %284, !dbg !41
+  %286 = fadd float %282, %285, !dbg !42
+  %287 = bitcast float %281 to i32, !dbg !30
+  %288 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %287, i32 2, i32 31), !dbg !30
+  %289 = bitcast i32 %288 to float, !dbg !30
+  %290 = bitcast float %286 to i32, !dbg !30
+  %291 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %290, i32 2, i32 31), !dbg !30
+  %292 = bitcast i32 %291 to float, !dbg !30
+  %293 = bitcast float %276 to i32, !dbg !30
+  %294 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %293, i32 2, i32 31), !dbg !30
+  %295 = bitcast i32 %294 to float, !dbg !30
+  %296 = fsub float %289, %281, !dbg !29
+  %297 = fadd float %276, %295, !dbg !32
+  %298 = fcmp oeq float %297, 0.000000e+00, !dbg !33
+  %299 = tail call float @llvm.nvvm.div.full(float %295, float %297), !dbg !34
+  %300 = select i1 %298, float 0.000000e+00, float %299, !dbg !35
+  %301 = fmul float %296, %300, !dbg !36
+  %302 = fadd float %281, %301, !dbg !37
+  %303 = fadd float %286, %292, !dbg !38
+  %304 = fmul float %296, %296, !dbg !39
+  %305 = fmul float %276, %304, !dbg !40
+  %306 = fmul float %300, %305, !dbg !41
+  %307 = fadd float %303, %306, !dbg !42
+  %308 = bitcast float %302 to i32, !dbg !30
+  %309 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %308, i32 1, i32 31), !dbg !30
+  %310 = bitcast i32 %309 to float, !dbg !30
+  %311 = bitcast float %307 to i32, !dbg !30
+  %312 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %311, i32 1, i32 31), !dbg !30
+  %313 = bitcast i32 %312 to float, !dbg !30
+  %314 = bitcast float %297 to i32, !dbg !30
+  %315 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %314, i32 1, i32 31), !dbg !30
+  %316 = bitcast i32 %315 to float, !dbg !30
+  %317 = fsub float %310, %302, !dbg !29
+  %318 = fadd float %297, %316, !dbg !32
+  %319 = fcmp oeq float %318, 0.000000e+00, !dbg !33
+  %320 = tail call float @llvm.nvvm.div.full(float %316, float %318), !dbg !34
+  %321 = select i1 %319, float 0.000000e+00, float %320, !dbg !35
+  %322 = fmul float %317, %321, !dbg !36
+  %323 = fadd float %302, %322, !dbg !37
+  %324 = fadd float %307, %313, !dbg !38
+  %325 = fmul float %317, %317, !dbg !39
+  %326 = fmul float %297, %325, !dbg !40
+  %327 = fmul float %321, %326, !dbg !41
+  %328 = fadd float %324, %327, !dbg !42
+  %329 = and i32 %10, 15, !dbg !30
+  %330 = icmp eq i32 %329, 0, !dbg !30
+  %331 = and i1 %238, %330, !dbg !30
+  %332 = bitcast float %323 to <1 x i32>, !dbg !30
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %239, <1 x i32> %332, i1 %331) #6, !dbg !30
+  %333 = bitcast float %328 to <1 x i32>, !dbg !30
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %242, <1 x i32> %333, i1 %331) #6, !dbg !30
+  %334 = bitcast float %318 to <1 x i32>, !dbg !30
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %245, <1 x i32> %334, i1 %331) #6, !dbg !30
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !30
+  %335 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !30
+  %336 = load float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !30
+  %337 = tail call float @llvm.nvvm.div.full(float %336, float 4.096000e+03), !dbg !43
+  %338 = fadd float %337, 0x3EB0C6F7A0000000, !dbg !44
+  %339 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45
+  %340 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45
+  %341 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45
+  %342 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45
+  %.not.i15 = icmp eq i32 %342, 0, !dbg !45
+  br i1 %.not.i15, label %345, label %343, !dbg !45
+
+343:                                              ; preds = %__nv_rsqrtf.exit
+  %344 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %338), !dbg !45
+  br label %__nv_rsqrtf.exit17, !dbg !45
+
+345:                                              ; preds = %__nv_rsqrtf.exit
+  %346 = tail call float @llvm.nvvm.rsqrt.approx.f(float %338), !dbg !45
+  br label %__nv_rsqrtf.exit17, !dbg !45
+
+__nv_rsqrtf.exit17:                               ; preds = %343, %345
+  %.0.i16 = phi float [ %344, %343 ], [ %346, %345 ], !dbg !45
+  %347 = zext nneg i32 %12 to i64, !dbg !46
+  %348 = sext i32 %13 to i64, !dbg !46
+  %349 = getelementptr bfloat, ptr addrspace(1) %1, i64 %347, !dbg !47
+  %350 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !48
+  %351 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %349, i64 %350, i1 true) #6, !dbg !48
+  %352 = extractvalue { i32, i32 } %351, 0, !dbg !48
+  %353 = bitcast i32 %352 to <2 x bfloat>, !dbg !48
+  %354 = extractvalue { i32, i32 } %351, 1, !dbg !48
+  %355 = bitcast i32 %354 to <2 x bfloat>, !dbg !48
+  %356 = or disjoint i64 %347, %348, !dbg !49
+  %357 = getelementptr bfloat, ptr addrspace(1) %0, i64 %356, !dbg !50
+  %358 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !51
+  %359 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %357, i64 %358, i1 %9) #6, !dbg !51
+  %360 = extractvalue { i32, i32 } %359, 0, !dbg !51
+  %361 = bitcast i32 %360 to <2 x bfloat>, !dbg !51
+  %362 = extractvalue { i32, i32 } %359, 1, !dbg !51
+  %363 = bitcast i32 %362 to <2 x bfloat>, !dbg !51
+  %364 = getelementptr bfloat, ptr addrspace(1) %2, i64 %347, !dbg !52
+  %365 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !53
+  %366 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %364, i64 %365, i1 true) #6, !dbg !53
+  %367 = extractvalue { i32, i32 } %366, 0, !dbg !53
+  %368 = bitcast i32 %367 to <2 x bfloat>, !dbg !53
+  %369 = extractvalue { i32, i32 } %366, 1, !dbg !53
+  %370 = bitcast i32 %369 to <2 x bfloat>, !dbg !53
+  %371 = getelementptr bfloat, ptr addrspace(1) %3, i64 %356, !dbg !54
+  %372 = fpext <2 x bfloat> %353 to <2 x float>, !dbg !55
+  %373 = fpext <2 x bfloat> %361 to <2 x float>, !dbg !56
+  %374 = fpext <2 x bfloat> %368 to <2 x float>, !dbg !57
+  %375 = fadd <2 x float> %372, splat (float 1.000000e+00), !dbg !58
+  %376 = insertelement <2 x float> poison, float %335, i64 0, !dbg !59
+  %377 = shufflevector <2 x float> %376, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !59
+  %378 = fsub <2 x float> %373, %377, !dbg !59
+  %379 = insertelement <2 x float> poison, float %.0.i16, i64 0, !dbg !60
+  %380 = shufflevector <2 x float> %379, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !60
+  %381 = fmul <2 x float> %380, %378, !dbg !60
+  %382 = fmul <2 x float> %375, %381, !dbg !61
+  %383 = fadd <2 x float> %382, %374, !dbg !62
+  %384 = fptrunc <2 x float> %383 to <2 x bfloat>, !dbg !63
+  %385 = fpext <2 x bfloat> %355 to <2 x float>, !dbg !55
+  %386 = fpext <2 x bfloat> %363 to <2 x float>, !dbg !56
+  %387 = fpext <2 x bfloat> %370 to <2 x float>, !dbg !57
+  %388 = fadd <2 x float> %385, splat (float 1.000000e+00), !dbg !58
+  %389 = fsub <2 x float> %386, %377, !dbg !59
+  %390 = fmul <2 x float> %380, %389, !dbg !60
+  %391 = fmul <2 x float> %388, %390, !dbg !61
+  %392 = fadd <2 x float> %391, %387, !dbg !62
+  %393 = fptrunc <2 x float> %392 to <2 x bfloat>, !dbg !63
+  %394 = bitcast <2 x bfloat> %384 to i32, !dbg !63
+  %395 = bitcast <2 x bfloat> %393 to i32, !dbg !63
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %394, i32 %395, ptr addrspace(1) %371, i1 %9) #6, !dbg !63
+  %396 = or disjoint i64 %347, 2048, !dbg !64
+  %397 = getelementptr bfloat, ptr addrspace(1) %1, i64 %396, !dbg !47
+  %398 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !48
+  %399 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %397, i64 %398, i1 true) #6, !dbg !48
+  %400 = extractvalue { i32, i32 } %399, 0, !dbg !48
+  %401 = bitcast i32 %400 to <2 x bfloat>, !dbg !48
+  %402 = extractvalue { i32, i32 } %399, 1, !dbg !48
+  %403 = bitcast i32 %402 to <2 x bfloat>, !dbg !48
+  %404 = or disjoint i64 %396, %348, !dbg !49
+  %405 = getelementptr bfloat, ptr addrspace(1) %0, i64 %404, !dbg !50
+  %406 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !51
+  %407 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %405, i64 %406, i1 %9) #6, !dbg !51
+  %408 = extractvalue { i32, i32 } %407, 0, !dbg !51
+  %409 = bitcast i32 %408 to <2 x bfloat>, !dbg !51
+  %410 = extractvalue { i32, i32 } %407, 1, !dbg !51
+  %411 = bitcast i32 %410 to <2 x bfloat>, !dbg !51
+  %412 = getelementptr bfloat, ptr addrspace(1) %2, i64 %396, !dbg !52
+  %413 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !53
+  %414 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %412, i64 %413, i1 true) #6, !dbg !53
+  %415 = extractvalue { i32, i32 } %414, 0, !dbg !53
+  %416 = bitcast i32 %415 to <2 x bfloat>, !dbg !53
+  %417 = extractvalue { i32, i32 } %414, 1, !dbg !53
+  %418 = bitcast i32 %417 to <2 x bfloat>, !dbg !53
+  %419 = getelementptr bfloat, ptr addrspace(1) %3, i64 %404, !dbg !54
+  %420 = fpext <2 x bfloat> %401 to <2 x float>, !dbg !55
+  %421 = fpext <2 x bfloat> %409 to <2 x float>, !dbg !56
+  %422 = fpext <2 x bfloat> %416 to <2 x float>, !dbg !57
+  %423 = fadd <2 x float> %420, splat (float 1.000000e+00), !dbg !58
+  %424 = fsub <2 x float> %421, %377, !dbg !59
+  %425 = fmul <2 x float> %380, %424, !dbg !60
+  %426 = fmul <2 x float> %423, %425, !dbg !61
+  %427 = fadd <2 x float> %426, %422, !dbg !62
+  %428 = fptrunc <2 x float> %427 to <2 x bfloat>, !dbg !63
+  %429 = fpext <2 x bfloat> %403 to <2 x float>, !dbg !55
+  %430 = fpext <2 x bfloat> %411 to <2 x float>, !dbg !56
+  %431 = fpext <2 x bfloat> %418 to <2 x float>, !dbg !57
+  %432 = fadd <2 x float> %429, splat (float 1.000000e+00), !dbg !58
+  %433 = fsub <2 x float> %430, %377, !dbg !59
+  %434 = fmul <2 x float> %380, %433, !dbg !60
+  %435 = fmul <2 x float> %432, %434, !dbg !61
+  %436 = fadd <2 x float> %435, %431, !dbg !62
+  %437 = fptrunc <2 x float> %436 to <2 x bfloat>, !dbg !63
+  %438 = bitcast <2 x bfloat> %428 to i32, !dbg !63
+  %439 = bitcast <2 x bfloat> %437 to i32, !dbg !63
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %438, i32 %439, ptr addrspace(1) %419, i1 %9) #6, !dbg !63
+  ret void, !dbg !65
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #2
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #2
+
+attributes #0 = { nounwind "nvvm.reqntid"="512" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #4 = { convergent nocallback nounwind }
+attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!5 = distinct !DISubprogram(name: "triton_red_fused_add_mul_native_layer_norm_0", linkageName: "triton_red_fused_add_mul_native_layer_norm_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 23, column: 28, scope: !5)
+!9 = !DILocation(line: 25, column: 21, scope: !5)
+!10 = !DILocation(line: 26, column: 37, scope: !5)
+!11 = !DILocation(line: 38, column: 46, scope: !5)
+!12 = !DILocation(line: 32, column: 43, scope: !5)
+!13 = !DILocation(line: 38, column: 34, scope: !5)
+!14 = !DILocation(line: 38, column: 51, scope: !5)
+!15 = !DILocation(line: 38, column: 112, scope: !5)
+!16 = !DILocation(line: 44, column: 62, scope: !5)
+!17 = !DILocation(line: 222, column: 24, scope: !18, inlinedAt: !20)
+!18 = distinct !DILexicalBlockFile(scope: !5, file: !19, discriminator: 0)
+!19 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime")
+!20 = !DILocation(line: 42, column: 51, scope: !21)
+!21 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0)
+!22 = !DILocation(line: 46, column: 66, scope: !5)
+!23 = !DILocation(line: 224, column: 34, scope: !18, inlinedAt: !20)
+!24 = !DILocation(line: 224, column: 26, scope: !18, inlinedAt: !20)
+!25 = !DILocation(line: 225, column: 39, scope: !18, inlinedAt: !20)
+!26 = !DILocation(line: 225, column: 31, scope: !18, inlinedAt: !20)
+!27 = !DILocation(line: 225, column: 22, scope: !18, inlinedAt: !20)
+!28 = !DILocation(line: 45, column: 58, scope: !5)
+!29 = !DILocation(line: 231, column: 21, scope: !18, inlinedAt: !30)
+!30 = !DILocation(line: 243, column: 46, scope: !18, inlinedAt: !31)
+!31 = !DILocation(line: 47, column: 79, scope: !21)
+!32 = !DILocation(line: 232, column: 28, scope: !18, inlinedAt: !30)
+!33 = !DILocation(line: 233, column: 39, scope: !18, inlinedAt: !30)
+!34 = !DILocation(line: 233, column: 60, scope: !18, inlinedAt: !30)
+!35 = !DILocation(line: 233, column: 49, scope: !18, inlinedAt: !30)
+!36 = !DILocation(line: 235, column: 25, scope: !18, inlinedAt: !30)
+!37 = !DILocation(line: 235, column: 17, scope: !18, inlinedAt: !30)
+!38 = !DILocation(line: 236, column: 15, scope: !18, inlinedAt: !30)
+!39 = !DILocation(line: 236, column: 30, scope: !18, inlinedAt: !30)
+!40 = !DILocation(line: 236, column: 38, scope: !18, inlinedAt: !30)
+!41 = !DILocation(line: 236, column: 49, scope: !18, inlinedAt: !30)
+!42 = !DILocation(line: 236, column: 22, scope: !18, inlinedAt: !30)
+!43 = !DILocation(line: 65, column: 24, scope: !5)
+!44 = !DILocation(line: 67, column: 24, scope: !5)
+!45 = !DILocation(line: 68, column: 32, scope: !5)
+!46 = !DILocation(line: 51, column: 43, scope: !5)
+!47 = !DILocation(line: 57, column: 34, scope: !5)
+!48 = !DILocation(line: 57, column: 41, scope: !5)
+!49 = !DILocation(line: 58, column: 42, scope: !5)
+!50 = !DILocation(line: 58, column: 35, scope: !5)
+!51 = !DILocation(line: 58, column: 52, scope: !5)
+!52 = !DILocation(line: 59, column: 35, scope: !5)
+!53 = !DILocation(line: 59, column: 42, scope: !5)
+!54 = !DILocation(line: 73, column: 29, scope: !5)
+!55 = !DILocation(line: 57, column: 94, scope: !5)
+!56 = !DILocation(line: 58, column: 114, scope: !5)
+!57 = !DILocation(line: 59, column: 95, scope: !5)
+!58 = !DILocation(line: 61, column: 23, scope: !5)
+!59 = !DILocation(line: 63, column: 24, scope: !5)
+!60 = !DILocation(line: 69, column: 24, scope: !5)
+!61 = !DILocation(line: 71, column: 24, scope: !5)
+!62 = !DILocation(line: 72, column: 24, scope: !5)
+!63 = !DILocation(line: 73, column: 53, scope: !5)
+!64 = !DILocation(line: 52, column: 31, scope: !5)
+!65 = !DILocation(line: 51, column: 4, scope: !5)
diff --git a/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.ptx b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..0814d80a5b656399575e5e4401221d643f43d572
--- /dev/null
+++ b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.ptx
@@ -0,0 +1,1089 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused_add_mul_native_layer_norm_0 // -- Begin function triton_red_fused_add_mul_native_layer_norm_0
+.extern .shared .align 16 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
+                                        // @triton_red_fused_add_mul_native_layer_norm_0
+.visible .entry triton_red_fused_add_mul_native_layer_norm_0(
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_1,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_3,
+	.param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_4,
+	.param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_5,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_6,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_7
+)
+.reqntid 512
+{
+	.reg .pred 	%p<19>;
+	.reg .b16 	%rs<33>;
+	.reg .b32 	%r<282>;
+	.reg .b64 	%rd<28>;
+	.loc	1 18 0                          // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:18:0
+
+// %bb.0:                               // %__nv_rsqrtf.exit
+	ld.param.b64 	%rd19, [triton_red_fused_add_mul_native_layer_norm_0_param_0];
+	ld.param.b64 	%rd20, [triton_red_fused_add_mul_native_layer_norm_0_param_1];
+$L__tmp0:
+	.loc	1 23 28                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:23:28
+	mov.u32 	%r37, %ctaid.x;
+	.loc	1 25 21                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:25:21
+	setp.lt.u32 	%p1, %r37, 2304;
+	ld.param.b64 	%rd21, [triton_red_fused_add_mul_native_layer_norm_0_param_2];
+	ld.param.b64 	%rd22, [triton_red_fused_add_mul_native_layer_norm_0_param_3];
+	.loc	1 26 37                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:26:37
+	mov.u32 	%r38, %tid.x;
+	shl.b32 	%r39, %r38, 2;
+	and.b32 	%r40, %r39, 2044;
+	.loc	1 38 46                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:46
+	shl.b32 	%r41, %r37, 12;
+	or.b32 	%r42, %r40, %r41;
+	.loc	1 38 34                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:34
+	mad.wide.s32 	%rd1, %r42, 2, %rd19;
+	.loc	1 38 51                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:51
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r3, 0;
+	// begin inline asm
+	mov.u32 %r1, %r3;
+	mov.u32 %r2, %r3;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	mov.b32 	{%rs1, %rs2}, %r2;
+	.loc	1 38 112                        // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:112
+	cvt.f32.bf16 	%r43, %rs2;
+	cvt.f32.bf16 	%r44, %rs1;
+	.loc	1 38 51                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:51
+	mov.b32 	{%rs3, %rs4}, %r1;
+	.loc	1 38 112                        // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:112
+	cvt.f32.bf16 	%r45, %rs4;
+	cvt.f32.bf16 	%r46, %rs3;
+	.loc	1 44 62                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:44:62
+	selp.f32 	%r47, %r46, 0f00000000, %p1;
+	selp.f32 	%r48, %r45, 0f00000000, %p1;
+	selp.f32 	%r49, %r44, 0f00000000, %p1;
+	selp.f32 	%r50, %r43, 0f00000000, %p1;
+	.loc	1 38 34                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:34
+	add.s64 	%rd3, %rd1, 4096;
+	.loc	1 38 51                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:51
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r4, %r3;
+	mov.u32 %r5, %r3;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r4, %r5 }, [ %rd3 + 0 ], %rd4;
+	// end inline asm
+	mov.b32 	{%rs5, %rs6}, %r4;
+	.loc	1 38 112                        // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:112
+	cvt.f32.bf16 	%r51, %rs5;
+$L__tmp1:
+	.loc	2 222 24                        // triton_helpers.py:222:24 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ]
+	sub.f32 	%r52, %r51, %r47;
+$L__tmp2:
+	.loc	1 46 66                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:46:66
+	selp.f32 	%r53, 0f40000000, 0f3F800000, %p1;
+$L__tmp3:
+	.loc	2 224 34                        // triton_helpers.py:224:34 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ]
+	div.full.f32 	%r54, %r52, %r53;
+	.loc	2 224 26                        // triton_helpers.py:224:26 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ]
+	add.f32 	%r55, %r47, %r54;
+	.loc	2 225 39                        // triton_helpers.py:225:39 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ]
+	sub.f32 	%r56, %r51, %r55;
+	.loc	2 225 22                        // triton_helpers.py:225:22 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ]
+	fma.rn.f32 	%r57, %r52, %r56, 0f00000000;
+$L__tmp4:
+	.loc	1 38 112                        // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:112
+	cvt.f32.bf16 	%r58, %rs6;
+$L__tmp5:
+	.loc	2 222 24                        // triton_helpers.py:222:24 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ]
+	sub.f32 	%r59, %r58, %r48;
+	.loc	2 224 34                        // triton_helpers.py:224:34 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ]
+	div.full.f32 	%r60, %r59, %r53;
+	.loc	2 224 26                        // triton_helpers.py:224:26 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ]
+	add.f32 	%r61, %r48, %r60;
+	.loc	2 225 39                        // triton_helpers.py:225:39 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ]
+	sub.f32 	%r62, %r58, %r61;
+	.loc	2 225 22                        // triton_helpers.py:225:22 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ]
+	fma.rn.f32 	%r63, %r59, %r62, 0f00000000;
+$L__tmp6:
+	.loc	1 38 51                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:51
+	mov.b32 	{%rs7, %rs8}, %r5;
+	.loc	1 38 112                        // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:112
+	cvt.f32.bf16 	%r64, %rs7;
+$L__tmp7:
+	.loc	2 222 24                        // triton_helpers.py:222:24 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ]
+	sub.f32 	%r65, %r64, %r49;
+	.loc	2 224 34                        // triton_helpers.py:224:34 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ]
+	div.full.f32 	%r66, %r65, %r53;
+	.loc	2 224 26                        // triton_helpers.py:224:26 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ]
+	add.f32 	%r67, %r49, %r66;
+	.loc	2 225 39                        // triton_helpers.py:225:39 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ]
+	sub.f32 	%r68, %r64, %r67;
+	.loc	2 225 22                        // triton_helpers.py:225:22 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ]
+	fma.rn.f32 	%r69, %r65, %r68, 0f00000000;
+$L__tmp8:
+	.loc	1 38 112                        // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:112
+	cvt.f32.bf16 	%r70, %rs8;
+$L__tmp9:
+	.loc	2 222 24                        // triton_helpers.py:222:24 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ]
+	sub.f32 	%r71, %r70, %r50;
+	.loc	2 224 34                        // triton_helpers.py:224:34 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ]
+	div.full.f32 	%r72, %r71, %r53;
+	.loc	2 224 26                        // triton_helpers.py:224:26 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ]
+	add.f32 	%r73, %r50, %r72;
+	.loc	2 225 39                        // triton_helpers.py:225:39 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ]
+	sub.f32 	%r74, %r70, %r73;
+	.loc	2 225 22                        // triton_helpers.py:225:22 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:42:51 ]
+	fma.rn.f32 	%r75, %r71, %r74, 0f00000000;
+$L__tmp10:
+	.loc	1 44 62                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:44:62
+	selp.f32 	%r76, %r55, 0f00000000, %p1;
+	selp.f32 	%r77, %r61, 0f00000000, %p1;
+	selp.f32 	%r78, %r67, 0f00000000, %p1;
+	selp.f32 	%r79, %r73, 0f00000000, %p1;
+	.loc	1 45 58                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:45:58
+	selp.f32 	%r80, %r69, 0f00000000, %p1;
+	selp.f32 	%r81, %r75, 0f00000000, %p1;
+	.loc	1 46 66                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:46:66
+	selp.f32 	%r82, 0f40000000, 0f00000000, %p1;
+	.loc	1 26 37                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:26:37
+	and.b32 	%r83, %r38, 511;
+	and.b32 	%r84, %r38, 31;
+$L__tmp11:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r85, %r77, %r76;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r86, 0f40800000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p6, %r86, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r87, %r82, %r86;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r88, 0f00000000, %r87, %p6;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r89, %r85, %r88, %r76;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r90, %r57, %r63;
+	selp.f32 	%r91, %r90, 0f00000000, %p1;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r92, %r85, %r85;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r93, %r92, %r82;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r94, %r93, %r88, %r91;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r95, %r78, %r89;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r96, 0f40C00000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p7, %r96, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r97, %r82, %r96;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r98, 0f00000000, %r97, %p7;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r99, %r98, %r95, %r89;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r100, %r80, %r94;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r101, %r95, %r95;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r102, %r86, %r101;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r103, %r98, %r102, %r100;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r104, %r79, %r99;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r105, 0f41000000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p8, %r105, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r106, %r82, %r105;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r107, 0f00000000, %r106, %p8;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r108, %r107, %r104, %r99;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r109, %r81, %r103;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r110, %r104, %r104;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r111, %r96, %r110;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r112, %r107, %r111, %r109;
+$L__tmp12:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ]
+	shfl.sync.bfly.b32 	%r113, %r108, 16, 31, -1;
+	shfl.sync.bfly.b32 	%r114, %r112, 16, 31, -1;
+	shfl.sync.bfly.b32 	%r115, %r105, 16, 31, -1;
+$L__tmp13:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r116, %r113, %r108;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r117, %r105, %r115;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p9, %r117, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r118, %r115, %r117;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r119, 0f00000000, %r118, %p9;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r120, %r119, %r116, %r108;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r121, %r112, %r114;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r122, %r116, %r116;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r123, %r105, %r122;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r124, %r119, %r123, %r121;
+$L__tmp14:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ]
+	shfl.sync.bfly.b32 	%r125, %r120, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r126, %r124, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r127, %r117, 8, 31, -1;
+$L__tmp15:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r128, %r125, %r120;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r129, %r117, %r127;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p10, %r129, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r130, %r127, %r129;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r131, 0f00000000, %r130, %p10;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r132, %r128, %r131, %r120;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r133, %r124, %r126;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r134, %r128, %r128;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r135, %r117, %r134;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r136, %r131, %r135, %r133;
+$L__tmp16:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ]
+	shfl.sync.bfly.b32 	%r137, %r132, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r138, %r136, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r139, %r129, 4, 31, -1;
+$L__tmp17:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r140, %r137, %r132;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r141, %r129, %r139;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p11, %r141, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r142, %r139, %r141;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r143, 0f00000000, %r142, %p11;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r144, %r140, %r143, %r132;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r145, %r136, %r138;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r146, %r140, %r140;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r147, %r129, %r146;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r148, %r143, %r147, %r145;
+$L__tmp18:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ]
+	shfl.sync.bfly.b32 	%r149, %r144, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r150, %r148, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r151, %r141, 2, 31, -1;
+$L__tmp19:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r152, %r149, %r144;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r153, %r141, %r151;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p12, %r153, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r154, %r151, %r153;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r155, 0f00000000, %r154, %p12;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r156, %r152, %r155, %r144;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r157, %r148, %r150;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r158, %r152, %r152;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r159, %r141, %r158;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r160, %r155, %r159, %r157;
+$L__tmp20:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ]
+	shfl.sync.bfly.b32 	%r161, %r156, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r162, %r160, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r163, %r153, 1, 31, -1;
+$L__tmp21:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r164, %r161, %r156;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r11, %r153, %r163;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p13, %r11, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r165, %r163, %r11;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r166, 0f00000000, %r165, %p13;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r7, %r164, %r166, %r156;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r167, %r160, %r162;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r168, %r164, %r164;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r169, %r153, %r168;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r9, %r166, %r169, %r167;
+$L__tmp22:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ]
+	setp.eq.b32 	%p2, %r84, 0;
+	shr.u32 	%r170, %r38, 3;
+	and.b32 	%r171, %r170, 60;
+	mov.b32 	%r172, global_smem;
+	add.s32 	%r6, %r172, %r171;
+	// begin inline asm
+	@%p2 st.shared.b32 [ %r6 + 0 ], %r7;
+	// end inline asm
+	add.s32 	%r8, %r6, 64;
+	// begin inline asm
+	@%p2 st.shared.b32 [ %r8 + 0 ], %r9;
+	// end inline asm
+	add.s32 	%r10, %r6, 128;
+	// begin inline asm
+	@%p2 st.shared.b32 [ %r10 + 0 ], %r11;
+	// end inline asm
+	bar.sync 	0;
+	setp.lt.u32 	%p3, %r83, 16;
+	shl.b32 	%r173, %r83, 2;
+	add.s32 	%r13, %r172, %r173;
+	// begin inline asm
+	@%p3 ld.shared.b32 %r12, [ %r13 + 0 ];
+	// end inline asm
+	add.s32 	%r15, %r13, 64;
+	// begin inline asm
+	@%p3 ld.shared.b32 %r14, [ %r15 + 0 ];
+	// end inline asm
+	add.s32 	%r17, %r13, 128;
+	// begin inline asm
+	@%p3 ld.shared.b32 %r16, [ %r17 + 0 ];
+	// end inline asm
+	shfl.sync.bfly.b32 	%r174, %r12, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r175, %r14, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r176, %r16, 8, 31, -1;
+$L__tmp23:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r177, %r174, %r12;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r178, %r16, %r176;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p14, %r178, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r179, %r176, %r178;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r180, 0f00000000, %r179, %p14;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r181, %r177, %r180, %r12;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r182, %r14, %r175;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r183, %r177, %r177;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r184, %r183, %r16;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r185, %r184, %r180, %r182;
+$L__tmp24:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ]
+	shfl.sync.bfly.b32 	%r186, %r181, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r187, %r185, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r188, %r178, 4, 31, -1;
+$L__tmp25:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r189, %r186, %r181;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r190, %r178, %r188;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p15, %r190, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r191, %r188, %r190;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r192, 0f00000000, %r191, %p15;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r193, %r189, %r192, %r181;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r194, %r185, %r187;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r195, %r189, %r189;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r196, %r178, %r195;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r197, %r192, %r196, %r194;
+$L__tmp26:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ]
+	shfl.sync.bfly.b32 	%r198, %r193, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r199, %r197, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r200, %r190, 2, 31, -1;
+$L__tmp27:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r201, %r198, %r193;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r202, %r190, %r200;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p16, %r202, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r203, %r200, %r202;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r204, 0f00000000, %r203, %p16;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r205, %r201, %r204, %r193;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r206, %r197, %r199;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r207, %r201, %r201;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r208, %r190, %r207;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r209, %r204, %r208, %r206;
+$L__tmp28:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ]
+	shfl.sync.bfly.b32 	%r210, %r205, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r211, %r209, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r212, %r202, 1, 31, -1;
+$L__tmp29:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r213, %r210, %r205;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r20, %r202, %r212;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p17, %r20, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r214, %r212, %r20;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r215, 0f00000000, %r214, %p17;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r18, %r213, %r215, %r205;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r216, %r209, %r211;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r217, %r213, %r213;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r218, %r202, %r217;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r19, %r215, %r218, %r216;
+$L__tmp30:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ]
+	and.b32 	%r219, %r38, 15;
+	setp.eq.b32 	%p18, %r219, 0;
+	and.pred 	%p4, %p3, %p18;
+	// begin inline asm
+	@%p4 st.shared.b32 [ %r13 + 0 ], %r18;
+	// end inline asm
+	// begin inline asm
+	@%p4 st.shared.b32 [ %r15 + 0 ], %r19;
+	// end inline asm
+	// begin inline asm
+	@%p4 st.shared.b32 [ %r17 + 0 ], %r20;
+	// end inline asm
+	bar.sync 	0;
+	ld.shared.b32 	%r220, [global_smem];
+	ld.shared.b32 	%r221, [global_smem+64];
+	mov.b32 	%r222, 0f45800000;
+$L__tmp31:
+	.loc	1 65 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:65:24
+	div.full.f32 	%r223, %r221, %r222;
+	.loc	1 67 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:67:24
+	add.f32 	%r224, %r223, 0f358637BD;
+	.loc	1 68 32                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:68:32
+	rsqrt.approx.ftz.f32 	%r225, %r224;
+	.loc	1 51 43                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:51:43
+	cvt.u64.u32 	%rd23, %r40;
+	cvt.s64.s32 	%rd24, %r41;
+	.loc	1 57 34                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:34
+	mul.wide.u32 	%rd25, %r40, 2;
+	add.s64 	%rd5, %rd20, %rd25;
+	.loc	1 57 41                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:41
+	// begin inline asm
+	mov.u64 %rd6, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0;
+	// end inline asm
+	mov.pred 	%p5, -1;
+	// begin inline asm
+	mov.u32 %r21, %r3;
+	mov.u32 %r22, %r3;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r21, %r22 }, [ %rd5 + 0 ], %rd6;
+	// end inline asm
+	.loc	1 58 42                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:42
+	or.b64 	%rd26, %rd23, %rd24;
+	.loc	1 58 35                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:35
+	shl.b64 	%rd27, %rd26, 1;
+	add.s64 	%rd7, %rd19, %rd27;
+	.loc	1 58 52                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:52
+	// begin inline asm
+	mov.u64 %rd8, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd8, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r23, %r3;
+	mov.u32 %r24, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r23, %r24 }, [ %rd7 + 0 ], %rd8;
+	// end inline asm
+	.loc	1 59 35                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:35
+	add.s64 	%rd9, %rd21, %rd25;
+	.loc	1 59 42                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:42
+	// begin inline asm
+	mov.u64 %rd10, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd10, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r25, %r3;
+	mov.u32 %r26, %r3;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r25, %r26 }, [ %rd9 + 0 ], %rd10;
+	// end inline asm
+	.loc	1 73 29                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:73:29
+	add.s64 	%rd11, %rd22, %rd27;
+	.loc	1 57 94                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:94
+	mov.b32 	{%rs9, %rs10}, %r21;
+	cvt.f32.bf16 	%r226, %rs9;
+	cvt.f32.bf16 	%r227, %rs10;
+	.loc	1 58 114                        // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:114
+	mov.b32 	{%rs11, %rs12}, %r23;
+	cvt.f32.bf16 	%r228, %rs12;
+	cvt.f32.bf16 	%r229, %rs11;
+	.loc	1 59 95                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:95
+	mov.b32 	{%rs13, %rs14}, %r25;
+	cvt.f32.bf16 	%r230, %rs14;
+	cvt.f32.bf16 	%r231, %rs13;
+	.loc	1 61 23                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:61:23
+	add.f32 	%r232, %r227, 0f3F800000;
+	add.f32 	%r233, %r226, 0f3F800000;
+	.loc	1 63 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:63:24
+	sub.f32 	%r234, %r229, %r220;
+	sub.f32 	%r235, %r228, %r220;
+	.loc	1 69 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:69:24
+	mul.f32 	%r236, %r225, %r235;
+	mul.f32 	%r237, %r225, %r234;
+	.loc	1 72 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:72:24
+	fma.rn.f32 	%r238, %r233, %r237, %r231;
+	fma.rn.f32 	%r239, %r232, %r236, %r230;
+	.loc	1 73 53                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:73:53
+	cvt.rn.bf16x2.f32 	%r27, %r239, %r238;
+	.loc	1 57 94                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:94
+	mov.b32 	{%rs15, %rs16}, %r22;
+	cvt.f32.bf16 	%r240, %rs15;
+	cvt.f32.bf16 	%r241, %rs16;
+	.loc	1 58 114                        // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:114
+	mov.b32 	{%rs17, %rs18}, %r24;
+	cvt.f32.bf16 	%r242, %rs18;
+	cvt.f32.bf16 	%r243, %rs17;
+	.loc	1 59 95                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:95
+	mov.b32 	{%rs19, %rs20}, %r26;
+	cvt.f32.bf16 	%r244, %rs20;
+	cvt.f32.bf16 	%r245, %rs19;
+	.loc	1 61 23                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:61:23
+	add.f32 	%r246, %r241, 0f3F800000;
+	add.f32 	%r247, %r240, 0f3F800000;
+	.loc	1 63 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:63:24
+	sub.f32 	%r248, %r243, %r220;
+	sub.f32 	%r249, %r242, %r220;
+	.loc	1 69 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:69:24
+	mul.f32 	%r250, %r225, %r249;
+	mul.f32 	%r251, %r225, %r248;
+	.loc	1 72 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:72:24
+	fma.rn.f32 	%r252, %r247, %r251, %r245;
+	fma.rn.f32 	%r253, %r246, %r250, %r244;
+	.loc	1 73 53                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:73:53
+	cvt.rn.bf16x2.f32 	%r28, %r253, %r252;
+	// begin inline asm
+	@%p1 st.global.v2.b32 [ %rd11 + 0 ], { %r27, %r28 };
+	// end inline asm
+	.loc	1 57 34                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:34
+	add.s64 	%rd12, %rd5, 4096;
+	.loc	1 57 41                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:41
+	// begin inline asm
+	mov.u64 %rd13, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd13, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r29, %r3;
+	mov.u32 %r30, %r3;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r29, %r30 }, [ %rd12 + 0 ], %rd13;
+	// end inline asm
+	.loc	1 58 35                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:35
+	add.s64 	%rd14, %rd7, 4096;
+	.loc	1 58 52                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:52
+	// begin inline asm
+	mov.u64 %rd15, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd15, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r31, %r3;
+	mov.u32 %r32, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r31, %r32 }, [ %rd14 + 0 ], %rd15;
+	// end inline asm
+	.loc	1 59 35                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:35
+	add.s64 	%rd16, %rd9, 4096;
+	.loc	1 59 42                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:42
+	// begin inline asm
+	mov.u64 %rd17, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r33, %r3;
+	mov.u32 %r34, %r3;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r33, %r34 }, [ %rd16 + 0 ], %rd17;
+	// end inline asm
+	.loc	1 73 29                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:73:29
+	add.s64 	%rd18, %rd11, 4096;
+	.loc	1 57 94                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:94
+	mov.b32 	{%rs21, %rs22}, %r29;
+	cvt.f32.bf16 	%r254, %rs21;
+	cvt.f32.bf16 	%r255, %rs22;
+	.loc	1 58 114                        // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:114
+	mov.b32 	{%rs23, %rs24}, %r31;
+	cvt.f32.bf16 	%r256, %rs24;
+	cvt.f32.bf16 	%r257, %rs23;
+	.loc	1 59 95                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:95
+	mov.b32 	{%rs25, %rs26}, %r33;
+	cvt.f32.bf16 	%r258, %rs26;
+	cvt.f32.bf16 	%r259, %rs25;
+	.loc	1 61 23                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:61:23
+	add.f32 	%r260, %r255, 0f3F800000;
+	add.f32 	%r261, %r254, 0f3F800000;
+	.loc	1 63 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:63:24
+	sub.f32 	%r262, %r257, %r220;
+	sub.f32 	%r263, %r256, %r220;
+	.loc	1 69 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:69:24
+	mul.f32 	%r264, %r225, %r263;
+	mul.f32 	%r265, %r225, %r262;
+	.loc	1 72 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:72:24
+	fma.rn.f32 	%r266, %r261, %r265, %r259;
+	fma.rn.f32 	%r267, %r260, %r264, %r258;
+	.loc	1 73 53                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:73:53
+	cvt.rn.bf16x2.f32 	%r35, %r267, %r266;
+	.loc	1 57 94                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:94
+	mov.b32 	{%rs27, %rs28}, %r30;
+	cvt.f32.bf16 	%r268, %rs27;
+	cvt.f32.bf16 	%r269, %rs28;
+	.loc	1 58 114                        // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:114
+	mov.b32 	{%rs29, %rs30}, %r32;
+	cvt.f32.bf16 	%r270, %rs30;
+	cvt.f32.bf16 	%r271, %rs29;
+	.loc	1 59 95                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:95
+	mov.b32 	{%rs31, %rs32}, %r34;
+	cvt.f32.bf16 	%r272, %rs32;
+	cvt.f32.bf16 	%r273, %rs31;
+	.loc	1 61 23                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:61:23
+	add.f32 	%r274, %r269, 0f3F800000;
+	add.f32 	%r275, %r268, 0f3F800000;
+	.loc	1 63 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:63:24
+	sub.f32 	%r276, %r271, %r220;
+	sub.f32 	%r277, %r270, %r220;
+	.loc	1 69 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:69:24
+	mul.f32 	%r278, %r225, %r277;
+	mul.f32 	%r279, %r225, %r276;
+	.loc	1 72 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:72:24
+	fma.rn.f32 	%r280, %r275, %r279, %r273;
+	fma.rn.f32 	%r281, %r274, %r278, %r272;
+	.loc	1 73 53                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:73:53
+	cvt.rn.bf16x2.f32 	%r36, %r281, %r280;
+	// begin inline asm
+	@%p1 st.global.v2.b32 [ %rd18 + 0 ], { %r35, %r36 };
+	// end inline asm
+	.loc	1 51 4                          // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:51:4
+	ret;
+$L__tmp32:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 367                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x168 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 112
+.b8 103
+.b8 101
+.b8 115
+.b8 122
+.b8 104
+.b8 52
+.b8 110
+.b8 112
+.b8 121
+.b8 110
+.b8 121
+.b8 55
+.b8 117
+.b8 50
+.b8 113
+.b8 120
+.b8 108
+.b8 107
+.b8 116
+.b8 112
+.b8 118
+.b8 50
+.b8 121
+.b8 50
+.b8 120
+.b8 100
+.b8 103
+.b8 103
+.b8 122
+.b8 121
+.b8 108
+.b8 53
+.b8 111
+.b8 112
+.b8 111
+.b8 121
+.b8 51
+.b8 111
+.b8 114
+.b8 117
+.b8 113
+.b8 115
+.b8 113
+.b8 101
+.b8 116
+.b8 52
+.b8 112
+.b8 53
+.b8 101
+.b8 107
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 112
+.b8 103
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x2f DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 97
+.b8 100
+.b8 100
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 110
+.b8 97
+.b8 116
+.b8 105
+.b8 118
+.b8 101
+.b8 95
+.b8 108
+.b8 97
+.b8 121
+.b8 101
+.b8 114
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x113:0x5f DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x128:0x18 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp10                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 42                                  // DW_AT_call_line
+.b8 51                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x140:0x31 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp11                          // DW_AT_low_pc
+.b64 $L__tmp31                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 47                                  // DW_AT_call_line
+.b8 79                                  // DW_AT_call_column
+.b8 4                                   // Abbrev [4] 0x158:0x18 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp11                          // DW_AT_low_pc
+.b64 $L__tmp30                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 243                                 // DW_AT_call_line
+.b8 46                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.source b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..ccafe4a89244b01843849b5d6816b92e0e34f71e
--- /dev/null
+++ b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.source
@@ -0,0 +1,420 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":18:0)
+#loc72 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":216:0)
+#loc85 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":133:0)
+#loc89 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":242:0)
+#loc91 = loc(unknown)
+#loc94 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":230:0)
+#loc109 = loc("in_ptr0"(#loc))
+#loc110 = loc("in_ptr1"(#loc))
+#loc111 = loc("in_ptr2"(#loc))
+#loc112 = loc("out_ptr2"(#loc))
+#loc113 = loc("xnumel"(#loc))
+#loc114 = loc("r0_numel"(#loc))
+#loc171 = loc("value"(#loc72))
+#loc172 = loc("mean"(#loc72))
+#loc173 = loc("m2"(#loc72))
+#loc174 = loc("weight"(#loc72))
+#loc175 = loc("first_iteration"(#loc72))
+#loc185 = loc("input"(#loc85))
+#loc186 = loc("mean"(#loc89))
+#loc187 = loc("m2"(#loc89))
+#loc188 = loc("weight"(#loc89))
+#loc189 = loc("mean_1"(#loc94))
+#loc190 = loc("m2_1"(#loc94))
+#loc191 = loc("weight_1"(#loc94))
+#loc192 = loc("mean_2"(#loc94))
+#loc193 = loc("m2_2"(#loc94))
+#loc194 = loc("weight_2"(#loc94))
+#loc201 = loc("new_mean"(#loc171))
+module {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 2304 : i32 loc(#loc115)
+    %r0_numel_1 = arith.constant 4096 : i32 loc(#loc116)
+    %xoffset = tt.get_program_id x : i32 loc(#loc117)
+    %xoffset_2 = arith.constant 1 : i32 loc(#loc118)
+    %xoffset_3 = arith.constant 1 : i32 loc(#loc118)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc118)
+    %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc119)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc120)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc121)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc121)
+    %xmask = arith.constant dense<2304> : tensor<1x1xi32> loc(#loc122)
+    %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc122)
+    %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc123)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc124)
+    %tmp3_mean = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc125)
+    %tmp3_m2 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc126)
+    %tmp3_weight = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc127)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc14)
+    %c2048_i32 = arith.constant 2048 : i32 loc(#loc14)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14)
+    %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc14)
+    %3 = ub.poison : i32 loc(#loc14)
+    %tmp3_weight_10:3 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%tmp3_mean_13 = %tmp3_mean, %tmp3_m2_14 = %tmp3_m2, %tmp3_weight_15 = %tmp3_weight) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc129)
+      %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc129)
+      %r0_mask = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc130)
+      %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x2048xi32> loc(#loc130)
+      %tmp0 = arith.constant 4096 : i32 loc(#loc131)
+      %tmp0_18 = arith.constant 4096 : i32 loc(#loc131)
+      %tmp0_19 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc131)
+      %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc131)
+      %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc132)
+      %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x2048xi32> loc(#loc132)
+      %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc133)
+      %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc133)
+      %tmp0_25 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc134)
+      %tmp0_26 = arith.andi %r0_mask_17, %tmp0_25 : tensor<1x2048xi1> loc(#loc134)
+      %tmp0_27 = arith.constant 0.000000e+00 : f32 loc(#loc135)
+      %tmp0_28 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc135)
+      %tmp0_29 = arith.truncf %tmp0_28 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc135)
+      %tmp0_30 = tt.load %tmp0_24, %tmp0_26, %tmp0_29 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc135)
+      %tmp0_31 = arith.extf %tmp0_30 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc136)
+      %c0_i32_32 = arith.constant 0 : i32 loc(#loc23)
+      %9 = arith.cmpi eq, %r0_offset, %c0_i32_32 : i32 loc(#loc23)
+      %10:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_u1__(%tmp0_31, %tmp3_mean_13, %tmp3_m2_14, %tmp3_weight_15, %9) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>, i1) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) loc(#loc24)
+      %tmp3_mean_33 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc137)
+      %tmp3_mean_34 = arith.andi %r0_mask_17, %tmp3_mean_33 : tensor<1x2048xi1> loc(#loc137)
+      %tmp3_mean_35 = arith.select %tmp3_mean_34, %10#0, %tmp3_mean_13 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc138)
+      %tmp3_m2_36 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc139)
+      %tmp3_m2_37 = arith.andi %r0_mask_17, %tmp3_m2_36 : tensor<1x2048xi1> loc(#loc139)
+      %tmp3_m2_38 = arith.select %tmp3_m2_37, %10#1, %tmp3_m2_14 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc140)
+      %tmp3_weight_39 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc141)
+      %tmp3_weight_40 = arith.andi %r0_mask_17, %tmp3_weight_39 : tensor<1x2048xi1> loc(#loc141)
+      %tmp3_weight_41 = arith.select %tmp3_weight_40, %10#2, %tmp3_weight_15 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc142)
+      scf.yield %tmp3_mean_35, %tmp3_m2_38, %tmp3_weight_41 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc31)
+    } loc(#loc207)
+    %4:3 = tt.call @"torch._inductor.runtime.triton_helpers.welford__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_1_"(%tmp3_weight_10#0, %tmp3_weight_10#1, %tmp3_weight_10#2) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc32)
+    %tmp3 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc143)
+    %tmp7 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc144)
+    %tmp8 = tt.expand_dims %4#2 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc145)
+    %c0_i32_11 = arith.constant 0 : i32 loc(#loc36)
+    %c2048_i32_12 = arith.constant 2048 : i32 loc(#loc36)
+    %5 = arith.bitcast %c0_i32_11 : i32 to i32 loc(#loc36)
+    %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc36)
+    %7 = arith.bitcast %c2048_i32_12 : i32 to i32 loc(#loc36)
+    %8 = ub.poison : i32 loc(#loc36)
+    scf.for %r0_offset = %5 to %6 step %7  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc146)
+      %r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc146)
+      %r0_mask = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc147)
+      %r0_mask_14 = arith.cmpi slt, %r0_index_13, %r0_mask : tensor<1x2048xi32> loc(#loc147)
+      %tmp9 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc148)
+      %tmp9_15 = tt.addptr %tmp9, %r0_index_13 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc148)
+      %tmp9_16 = arith.constant 0.000000e+00 : f32 loc(#loc149)
+      %tmp9_17 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc149)
+      %tmp9_18 = arith.truncf %tmp9_17 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc149)
+      %tmp9_19 = tt.load %tmp9_15, %r0_mask_14, %tmp9_18 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc149)
+      %tmp9_20 = arith.extf %tmp9_19 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc150)
+      %tmp12 = arith.constant 4096 : i32 loc(#loc151)
+      %tmp12_21 = arith.constant 4096 : i32 loc(#loc151)
+      %tmp12_22 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc151)
+      %tmp12_23 = arith.muli %tmp12_22, %xindex_7 : tensor<1x1xi32> loc(#loc151)
+      %tmp12_24 = tt.broadcast %tmp12_23 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc152)
+      %tmp12_25 = arith.addi %r0_index_13, %tmp12_24 : tensor<1x2048xi32> loc(#loc152)
+      %tmp12_26 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc153)
+      %tmp12_27 = tt.addptr %tmp12_26, %tmp12_25 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc153)
+      %tmp12_28 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc154)
+      %tmp12_29 = arith.andi %r0_mask_14, %tmp12_28 : tensor<1x2048xi1> loc(#loc154)
+      %tmp12_30 = arith.constant 0.000000e+00 : f32 loc(#loc155)
+      %tmp12_31 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc155)
+      %tmp12_32 = arith.truncf %tmp12_31 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc155)
+      %tmp12_33 = tt.load %tmp12_27, %tmp12_29, %tmp12_32 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>> loc(#loc155)
+      %tmp12_34 = arith.extf %tmp12_33 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc156)
+      %tmp23 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc157)
+      %tmp23_35 = tt.addptr %tmp23, %r0_index_13 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc157)
+      %tmp23_36 = arith.constant 0.000000e+00 : f32 loc(#loc158)
+      %tmp23_37 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc158)
+      %tmp23_38 = arith.truncf %tmp23_37 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc158)
+      %tmp23_39 = tt.load %tmp23_35, %r0_mask_14, %tmp23_38 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc158)
+      %tmp23_40 = arith.extf %tmp23_39 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc159)
+      %tmp10 = arith.constant 1.000000e+00 : f32 loc(#loc160)
+      %tmp11 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc161)
+      %tmp11_41 = arith.addf %tmp9_20, %tmp11 : tensor<1x2048xf32> loc(#loc161)
+      %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc162)
+      %tmp14_42 = arith.subf %tmp12_34, %tmp14 : tensor<1x2048xf32> loc(#loc162)
+      %tmp15 = arith.constant 4.096000e+03 : f32 loc(#loc163)
+      %tmp16 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc164)
+      %tmp16_43 = arith.divf %tmp7, %tmp16 : tensor<1x1xf32> loc(#loc164)
+      %tmp17 = arith.constant 9.99999997E-7 : f32 loc(#loc165)
+      %tmp18 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc166)
+      %tmp18_44 = arith.addf %tmp16_43, %tmp18 : tensor<1x1xf32> loc(#loc166)
+      %tmp19 = tt.extern_elementwise %tmp18_44 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc167)
+      %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc168)
+      %tmp20_45 = arith.mulf %tmp14_42, %tmp20 : tensor<1x2048xf32> loc(#loc168)
+      %tmp22 = arith.mulf %tmp11_41, %tmp20_45 : tensor<1x2048xf32> loc(#loc169)
+      %tmp24 = arith.addf %tmp22, %tmp23_40 : tensor<1x2048xf32> loc(#loc170)
+      %c4096_i32 = arith.constant 4096 : i32 loc(#loc62)
+      %c4096_i32_46 = arith.constant 4096 : i32 loc(#loc62)
+      %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc62)
+      %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc62)
+      %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc63)
+      %11 = arith.addi %r0_index_13, %10 : tensor<1x2048xi32> loc(#loc63)
+      %12 = tt.splat %out_ptr2 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc64)
+      %13 = tt.addptr %12, %11 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc64)
+      %14 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc65)
+      %15 = arith.andi %r0_mask_14, %14 : tensor<1x2048xi1> loc(#loc65)
+      %16 = arith.truncf %tmp24 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc66)
+      tt.store %13, %16, %15 : tensor<1x2048x!tt.ptr<bf16>> loc(#loc66)
+    } loc(#loc36)
+    tt.return loc(#loc67)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() -> tensor<1x2048xf32> attributes {noinline = false} {
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc69)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc69)
+    tt.return %cst_0 : tensor<1x2048xf32> loc(#loc70)
+  ^bb1:  // no predecessors
+    %0 = ub.poison : tensor<1x2048xf32> loc(#loc71)
+    tt.return %0 : tensor<1x2048xf32> loc(#loc71)
+  } loc(#loc68)
+  tt.func private @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_fp32S1_2048S_u1__(%new_mean: tensor<1x2048xf32> loc("new_mean"(#loc171)), %mean: tensor<1x2048xf32> loc("mean"(#loc72)), %m2: tensor<1x2048xf32> loc("m2"(#loc72)), %weight: tensor<1x2048xf32> loc("weight"(#loc72)), %first_iteration: i1 loc("first_iteration"(#loc72))) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) attributes {noinline = false} {
+    %0:3 = scf.if %first_iteration -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) {
+      %new_weight = arith.constant 1.000000e+00 : f32 loc(#loc176)
+      %new_weight_0 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc202)
+      %new_m2 = tt.call @triton.language.standard.zeros_like__fp32S1_2048S__(%m2) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc203)
+      scf.yield %new_m2, %new_mean, %new_weight_0 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc203)
+    } else {
+      %delta = arith.subf %new_mean, %mean : tensor<1x2048xf32> loc(#loc178)
+      %new_weight = arith.constant 1 : i32 loc(#loc179)
+      %new_weight_0 = arith.constant 1.000000e+00 : f32 loc(#loc179)
+      %new_weight_1 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc179)
+      %new_weight_2 = arith.addf %weight, %new_weight_1 : tensor<1x2048xf32> loc(#loc204)
+      %new_mean_3 = arith.divf %delta, %new_weight_2 : tensor<1x2048xf32> loc(#loc180)
+      %new_mean_4 = arith.addf %mean, %new_mean_3 : tensor<1x2048xf32> loc(#loc205)
+      %new_m2 = arith.subf %new_mean, %new_mean_4 : tensor<1x2048xf32> loc(#loc182)
+      %new_m2_5 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32> loc(#loc183)
+      %new_m2_6 = arith.addf %m2, %new_m2_5 : tensor<1x2048xf32> loc(#loc206)
+      scf.yield %new_m2_6, %new_mean_4, %new_weight_2 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc184)
+    } loc(#loc73)
+    tt.return %0#1, %0#0, %0#2 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc83)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1x2048xf32> loc(#loc84)
+    %2 = ub.poison : tensor<1x2048xf32> loc(#loc84)
+    %3 = ub.poison : tensor<1x2048xf32> loc(#loc84)
+    tt.return %1, %2, %3 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc84)
+  } loc(#loc72)
+  tt.func private @triton.language.standard.zeros_like__fp32S1_2048S__(%input: tensor<1x2048xf32> loc("input"(#loc85))) -> tensor<1x2048xf32> attributes {noinline = false} {
+    %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc86)
+    tt.return %0 : tensor<1x2048xf32> loc(#loc87)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1x2048xf32> loc(#loc88)
+    tt.return %1 : tensor<1x2048xf32> loc(#loc88)
+  } loc(#loc85)
+  tt.func private @"torch._inductor.runtime.triton_helpers.welford__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_1_"(%mean: tensor<1x2048xf32> loc("mean"(#loc89)), %m2: tensor<1x2048xf32> loc("m2"(#loc89)), %weight: tensor<1x2048xf32> loc("weight"(#loc89))) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} {
+    %0:3 = "tt.reduce"(%mean, %m2, %weight) <{axis = 1 : i32}> ({
+    ^bb0(%arg3: f32 loc(unknown), %arg4: f32 loc(unknown), %arg5: f32 loc(unknown), %arg6: f32 loc(unknown), %arg7: f32 loc(unknown), %arg8: f32 loc(unknown)):
+      %4:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (f32, f32, f32, f32, f32, f32) -> (f32, f32, f32) loc(#loc90)
+      tt.reduce.return %4#0, %4#1, %4#2 : f32, f32, f32 loc(#loc90)
+    }) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc90)
+    tt.return %0#0, %0#1, %0#2 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc92)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1xf32> loc(#loc93)
+    %2 = ub.poison : tensor<1xf32> loc(#loc93)
+    %3 = ub.poison : tensor<1xf32> loc(#loc93)
+    tt.return %1, %2, %3 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc93)
+  } loc(#loc89)
+  tt.func private @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%mean_1: f32 loc("mean_1"(#loc94)), %m2_1: f32 loc("m2_1"(#loc94)), %weight_1: f32 loc("weight_1"(#loc94)), %mean_2: f32 loc("mean_2"(#loc94)), %m2_2: f32 loc("m2_2"(#loc94)), %weight_2: f32 loc("weight_2"(#loc94))) -> (f32, f32, f32) attributes {noinline = false} {
+    %delta = arith.subf %mean_2, %mean_1 : f32 loc(#loc195)
+    %new_weight = arith.addf %weight_1, %weight_2 : f32 loc(#loc196)
+    %w2_over_w = arith.constant 0.000000e+00 : f32 loc(#loc197)
+    %w2_over_w_0 = arith.cmpf oeq, %new_weight, %w2_over_w : f32 loc(#loc197)
+    %w2_over_w_1 = arith.divf %weight_2, %new_weight : f32 loc(#loc198)
+    %w2_over_w_2 = arith.constant 0.000000e+00 : f32 loc(#loc199)
+    %w2_over_w_3 = arith.constant 0.000000e+00 : f32 loc(#loc199)
+    %w2_over_w_4 = arith.select %w2_over_w_0, %w2_over_w_3, %w2_over_w_1 : f32 loc(#loc199)
+    %0 = arith.mulf %delta, %w2_over_w_4 : f32 loc(#loc100)
+    %1 = arith.addf %mean_1, %0 : f32 loc(#loc101)
+    %2 = arith.addf %m2_1, %m2_2 : f32 loc(#loc102)
+    %3 = arith.mulf %delta, %delta : f32 loc(#loc103)
+    %4 = arith.mulf %3, %weight_1 : f32 loc(#loc104)
+    %5 = arith.mulf %4, %w2_over_w_4 : f32 loc(#loc105)
+    %6 = arith.addf %2, %5 : f32 loc(#loc106)
+    tt.return %1, %6, %new_weight : f32, f32, f32 loc(#loc107)
+  ^bb1:  // no predecessors
+    %7 = ub.poison : f32 loc(#loc108)
+    %8 = ub.poison : f32 loc(#loc108)
+    %9 = ub.poison : f32 loc(#loc108)
+    tt.return %7, %8, %9 : f32, f32, f32 loc(#loc108)
+  } loc(#loc94)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":25:21)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":29:45)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":30:43)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":31:47)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:46)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:34)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:61)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:51)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:112)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":42:62)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":42:51)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":44:39)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":44:62)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":45:37)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":45:58)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":46:41)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":46:66)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":46:8)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":47:79)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":48:16)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":49:16)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":50:16)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":51:43)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":52:31)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":53:29)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:34)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:41)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:94)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:47)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:42)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:35)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:62)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:52)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:114)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:35)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:42)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:95)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":60:16)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":61:23)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":63:24)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":64:16)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":65:24)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":66:16)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":67:24)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":68:32)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":69:24)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":71:24)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":72:24)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:41)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:36)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:29)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:63)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:53)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":51:4)
+#loc68 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":120:0)
+#loc69 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:31)
+#loc70 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:11)
+#loc71 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:4)
+#loc73 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7)
+#loc74 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":218:46)
+#loc75 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31)
+#loc76 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24)
+#loc77 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30)
+#loc78 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34)
+#loc79 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26)
+#loc80 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39)
+#loc81 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31)
+#loc82 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22)
+#loc83 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:11)
+#loc84 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:4)
+#loc86 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:30)
+#loc87 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:11)
+#loc88 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:4)
+#loc90 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc92 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:11)
+#loc93 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:4)
+#loc95 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc96 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc97 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc98 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc99 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc100 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc101 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc102 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc103 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc104 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc105 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc106 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc107 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:11)
+#loc108 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:4)
+#loc115 = loc("xnumel"(#loc1))
+#loc116 = loc("r0_numel"(#loc2))
+#loc117 = loc("xoffset"(#loc3))
+#loc118 = loc("xoffset"(#loc4))
+#loc119 = loc("xindex"(#loc5))
+#loc120 = loc("xindex"(#loc6))
+#loc121 = loc("xindex"(#loc7))
+#loc122 = loc("xmask"(#loc8))
+#loc123 = loc("r0_base"(#loc9))
+#loc124 = loc("r0_base"(#loc10))
+#loc125 = loc("tmp3_mean"(#loc11))
+#loc126 = loc("tmp3_m2"(#loc12))
+#loc127 = loc("tmp3_weight"(#loc13))
+#loc128 = loc("tmp3_mean"(#loc14))
+#loc129 = loc("r0_index"(#loc15))
+#loc130 = loc("r0_mask"(#loc16))
+#loc131 = loc("tmp0"(#loc17))
+#loc132 = loc("tmp0"(#loc18))
+#loc133 = loc("tmp0"(#loc19))
+#loc134 = loc("tmp0"(#loc20))
+#loc135 = loc("tmp0"(#loc21))
+#loc136 = loc("tmp0"(#loc22))
+#loc137 = loc("tmp3_mean"(#loc25))
+#loc138 = loc("tmp3_mean"(#loc26))
+#loc139 = loc("tmp3_m2"(#loc27))
+#loc140 = loc("tmp3_m2"(#loc28))
+#loc141 = loc("tmp3_weight"(#loc29))
+#loc142 = loc("tmp3_weight"(#loc30))
+#loc143 = loc("tmp3"(#loc33))
+#loc144 = loc("tmp7"(#loc34))
+#loc145 = loc("tmp8"(#loc35))
+#loc146 = loc("r0_index"(#loc37))
+#loc147 = loc("r0_mask"(#loc38))
+#loc148 = loc("tmp9"(#loc39))
+#loc149 = loc("tmp9"(#loc40))
+#loc150 = loc("tmp9"(#loc41))
+#loc151 = loc("tmp12"(#loc42))
+#loc152 = loc("tmp12"(#loc43))
+#loc153 = loc("tmp12"(#loc44))
+#loc154 = loc("tmp12"(#loc45))
+#loc155 = loc("tmp12"(#loc46))
+#loc156 = loc("tmp12"(#loc47))
+#loc157 = loc("tmp23"(#loc48))
+#loc158 = loc("tmp23"(#loc49))
+#loc159 = loc("tmp23"(#loc50))
+#loc160 = loc("tmp10"(#loc51))
+#loc161 = loc("tmp11"(#loc52))
+#loc162 = loc("tmp14"(#loc53))
+#loc163 = loc("tmp15"(#loc54))
+#loc164 = loc("tmp16"(#loc55))
+#loc165 = loc("tmp17"(#loc56))
+#loc166 = loc("tmp18"(#loc57))
+#loc167 = loc("tmp19"(#loc58))
+#loc168 = loc("tmp20"(#loc59))
+#loc169 = loc("tmp22"(#loc60))
+#loc170 = loc("tmp24"(#loc61))
+#loc176 = loc("new_weight"(#loc74))
+#loc177 = loc("new_m2"(#loc75))
+#loc178 = loc("delta"(#loc76))
+#loc179 = loc("new_weight"(#loc77))
+#loc180 = loc("new_mean"(#loc78))
+#loc181 = loc("new_mean"(#loc79))
+#loc182 = loc("new_m2"(#loc80))
+#loc183 = loc("new_m2"(#loc81))
+#loc184 = loc("new_m2"(#loc82))
+#loc195 = loc("delta"(#loc95))
+#loc196 = loc("new_weight"(#loc96))
+#loc197 = loc("w2_over_w"(#loc97))
+#loc198 = loc("w2_over_w"(#loc98))
+#loc199 = loc("w2_over_w"(#loc99))
+#loc200 = loc("tmp3_m2"(#loc128))
+#loc202 = loc("new_weight"(#loc176))
+#loc203 = loc("new_m2"(#loc177))
+#loc204 = loc("new_weight"(#loc179))
+#loc205 = loc("new_mean"(#loc181))
+#loc206 = loc("new_m2"(#loc184))
+#loc207 = loc("tmp3_weight"(#loc200))
diff --git a/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..01c28ba9fcb2607a2b5cfdbf5812a231e6550f88
--- /dev/null
+++ b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.ttgir
@@ -0,0 +1,261 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":18:0)
+#loc1 = loc(unknown)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":47:79)
+#loc70 = loc("in_ptr0"(#loc))
+#loc71 = loc("in_ptr1"(#loc))
+#loc72 = loc("in_ptr2"(#loc))
+#loc73 = loc("out_ptr2"(#loc))
+#loc74 = loc("xnumel"(#loc))
+#loc75 = loc("r0_numel"(#loc))
+#loc101 = loc(callsite(#loc1 at #loc30))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4096> : tensor<1x2048xi32, #blocked> loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked> loc(#loc1)
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
+    %c2048_i32 = arith.constant 2048 : i32 loc(#loc1)
+    %c2304_i32 = arith.constant 2304 : i32 loc(#loc1)
+    %cst_1 = arith.constant 0.000000e+00 : f32 loc(#loc1)
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<4.096000e+03> : tensor<1x1xf32, #blocked> loc(#loc1)
+    %cst_5 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc76)
+    %xmask = arith.cmpi slt, %xoffset, %c2304_i32 : i32 loc(#loc77)
+    %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc78)
+    %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc78)
+    %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc79)
+    %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc130)
+    %tmp0_8 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc81)
+    %tmp0_9 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc131)
+    %tmp3_weight:3 = scf.for %tmp3_weight_10 = %c0_i32 to %c4096_i32 step %c2048_i32 iter_args(%arg7 = %cst_2, %arg8 = %cst_2, %arg9 = %cst_2) -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>)  : i32 {
+      %r0_index = tt.splat %tmp3_weight_10 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc84)
+      %r0_index_11 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32, #blocked> loc(#loc84)
+      %r0_mask = arith.cmpi slt, %r0_index_11, %cst : tensor<1x2048xi32, #blocked> loc(#loc85)
+      %tmp0_12 = arith.addi %r0_index_11, %tmp0_7 : tensor<1x2048xi32, #blocked> loc(#loc80)
+      %tmp0_13 = tt.addptr %tmp0_8, %tmp0_12 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc81)
+      %tmp0_14 = arith.andi %r0_mask, %tmp0_9 : tensor<1x2048xi1, #blocked> loc(#loc82)
+      %tmp0_15 = tt.load %tmp0_13, %tmp0_14, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc86)
+      %tmp0_16 = arith.extf %tmp0_15 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc87)
+      %2 = arith.cmpi eq, %tmp3_weight_10, %c0_i32 : i32 loc(#loc14)
+      %3:3 = scf.if %2 -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) {
+        scf.yield %cst_2, %tmp0_16, %cst_5 : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc155)
+      } else {
+        %delta = arith.subf %tmp0_16, %arg7 : tensor<1x2048xf32, #blocked> loc(#loc134)
+        %new_weight = arith.addf %arg9, %cst_5 : tensor<1x2048xf32, #blocked> loc(#loc156)
+        %new_mean = arith.divf %delta, %new_weight : tensor<1x2048xf32, #blocked> loc(#loc136)
+        %new_mean_18 = arith.addf %arg7, %new_mean : tensor<1x2048xf32, #blocked> loc(#loc157)
+        %new_m2 = arith.subf %tmp0_16, %new_mean_18 : tensor<1x2048xf32, #blocked> loc(#loc138)
+        %new_m2_19 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32, #blocked> loc(#loc139)
+        %new_m2_20 = arith.addf %arg8, %new_m2_19 : tensor<1x2048xf32, #blocked> loc(#loc158)
+        scf.yield %new_m2_20, %new_mean_18, %new_weight : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc141)
+      } loc(#loc88)
+      %tmp3_mean = arith.select %tmp0_14, %3#1, %arg7 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc97)
+      %tmp3_m2 = arith.select %tmp0_14, %3#0, %arg8 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc98)
+      %tmp3_weight_17 = arith.select %tmp0_14, %3#2, %arg9 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc99)
+      scf.yield %tmp3_mean, %tmp3_m2, %tmp3_weight_17 : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc28)
+    } loc(#loc154)
+    %0:3 = "tt.reduce"(%tmp3_weight#0, %tmp3_weight#1, %tmp3_weight#2) <{axis = 1 : i32}> ({
+    ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc30)), %arg7: f32 loc(callsite(#loc1 at #loc30)), %arg8: f32 loc(callsite(#loc1 at #loc30)), %arg9: f32 loc(callsite(#loc1 at #loc30)), %arg10: f32 loc(callsite(#loc1 at #loc30)), %arg11: f32 loc(callsite(#loc1 at #loc30))):
+      %delta = arith.subf %arg9, %arg6 : f32 loc(#loc142)
+      %new_weight = arith.addf %arg8, %arg11 : f32 loc(#loc143)
+      %w2_over_w = arith.cmpf oeq, %new_weight, %cst_1 : f32 loc(#loc144)
+      %w2_over_w_10 = arith.divf %arg11, %new_weight : f32 loc(#loc145)
+      %w2_over_w_11 = arith.select %w2_over_w, %cst_1, %w2_over_w_10 : f32 loc(#loc146)
+      %2 = arith.mulf %delta, %w2_over_w_11 : f32 loc(#loc147)
+      %3 = arith.addf %arg6, %2 : f32 loc(#loc148)
+      %4 = arith.addf %arg7, %arg10 : f32 loc(#loc149)
+      %5 = arith.mulf %delta, %delta : f32 loc(#loc150)
+      %6 = arith.mulf %5, %arg8 : f32 loc(#loc151)
+      %7 = arith.mulf %6, %w2_over_w_11 : f32 loc(#loc152)
+      %8 = arith.addf %4, %7 : f32 loc(#loc153)
+      tt.reduce.return %3, %8, %new_weight : f32, f32, f32 loc(#loc100)
+    }) : (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc100)
+    %tmp3 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc107)
+    %tmp7 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc108)
+    %tmp9 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc109)
+    %tmp23 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc110)
+    %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc111)
+    %tmp16 = arith.divf %tmp7, %cst_4 : tensor<1x1xf32, #blocked> loc(#loc112)
+    %tmp18 = arith.addf %tmp16, %cst_3 : tensor<1x1xf32, #blocked> loc(#loc113)
+    %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked>) -> tensor<1x1xf32, #blocked> loc(#loc114)
+    %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc115)
+    %1 = tt.splat %out_ptr2 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc52)
+    scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32, #blocked> loc(#loc116)
+      %r0_index_10 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32, #blocked> loc(#loc116)
+      %r0_mask = arith.cmpi slt, %r0_index_10, %cst : tensor<1x2048xi32, #blocked> loc(#loc117)
+      %tmp9_11 = tt.addptr %tmp9, %r0_index_10 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc109)
+      %tmp9_12 = tt.load %tmp9_11, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc118)
+      %tmp9_13 = arith.extf %tmp9_12 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc119)
+      %tmp12 = arith.addi %r0_index_10, %tmp0_7 : tensor<1x2048xi32, #blocked> loc(#loc120)
+      %tmp12_14 = tt.addptr %tmp0_8, %tmp12 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc121)
+      %tmp12_15 = arith.andi %r0_mask, %tmp0_9 : tensor<1x2048xi1, #blocked> loc(#loc122)
+      %tmp12_16 = tt.load %tmp12_14, %tmp12_15, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc123)
+      %tmp12_17 = arith.extf %tmp12_16 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc124)
+      %tmp23_18 = tt.addptr %tmp23, %r0_index_10 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc110)
+      %tmp23_19 = tt.load %tmp23_18, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc125)
+      %tmp23_20 = arith.extf %tmp23_19 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc126)
+      %tmp11 = arith.addf %tmp9_13, %cst_5 : tensor<1x2048xf32, #blocked> loc(#loc127)
+      %tmp14_21 = arith.subf %tmp12_17, %tmp14 : tensor<1x2048xf32, #blocked> loc(#loc111)
+      %tmp20_22 = arith.mulf %tmp14_21, %tmp20 : tensor<1x2048xf32, #blocked> loc(#loc115)
+      %tmp22 = arith.mulf %tmp11, %tmp20_22 : tensor<1x2048xf32, #blocked> loc(#loc128)
+      %tmp24 = arith.addf %tmp22, %tmp23_20 : tensor<1x2048xf32, #blocked> loc(#loc129)
+      %2 = tt.addptr %1, %tmp12 : tensor<1x2048x!tt.ptr<bf16>, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc52)
+      %3 = arith.truncf %tmp24 : tensor<1x2048xf32, #blocked> to tensor<1x2048xbf16, #blocked> loc(#loc68)
+      tt.store %2, %3, %tmp12_15 : tensor<1x2048x!tt.ptr<bf16>, #blocked> loc(#loc68)
+    } loc(#loc53)
+    tt.return loc(#loc69)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":25:21)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":26:37)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:46)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:41)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:34)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:61)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":32:43)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":33:31)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":34:29)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:51)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:112)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":42:62)
+#loc15 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":42:51)
+#loc17 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31)
+#loc18 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24)
+#loc19 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34)
+#loc21 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39)
+#loc23 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":44:62)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":45:58)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":46:66)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":46:8)
+#loc29 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc35 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc37 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc38 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc39 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc40 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc41 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc42 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":48:16)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":49:16)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:34)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:35)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":63:24)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":65:24)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":67:24)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":68:32)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":69:24)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:29)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":51:43)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":52:31)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":53:29)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:41)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:94)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:42)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:35)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:62)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:52)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:114)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:42)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:95)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":61:23)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":71:24)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":72:24)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:53)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":51:4)
+#loc76 = loc("xoffset"(#loc2))
+#loc77 = loc("xmask"(#loc3))
+#loc78 = loc("r0_base"(#loc4))
+#loc79 = loc("tmp0"(#loc5))
+#loc80 = loc("tmp0"(#loc6))
+#loc81 = loc("tmp0"(#loc7))
+#loc82 = loc("tmp0"(#loc8))
+#loc83 = loc("tmp3_mean"(#loc9))
+#loc84 = loc("r0_index"(#loc10))
+#loc85 = loc("r0_mask"(#loc11))
+#loc86 = loc("tmp0"(#loc12))
+#loc87 = loc("tmp0"(#loc13))
+#loc88 = loc(callsite(#loc15 at #loc16))
+#loc89 = loc("new_m2"(#loc17))
+#loc90 = loc("delta"(#loc18))
+#loc91 = loc("new_weight"(#loc19))
+#loc92 = loc("new_mean"(#loc20))
+#loc93 = loc("new_mean"(#loc21))
+#loc94 = loc("new_m2"(#loc22))
+#loc95 = loc("new_m2"(#loc23))
+#loc96 = loc("new_m2"(#loc24))
+#loc97 = loc("tmp3_mean"(#loc25))
+#loc98 = loc("tmp3_m2"(#loc26))
+#loc99 = loc("tmp3_weight"(#loc27))
+#loc100 = loc(callsite(#loc29 at #loc30))
+#loc102 = loc("delta"(#loc31))
+#loc103 = loc("new_weight"(#loc32))
+#loc104 = loc("w2_over_w"(#loc33))
+#loc105 = loc("w2_over_w"(#loc34))
+#loc106 = loc("w2_over_w"(#loc35))
+#loc107 = loc("tmp3"(#loc43))
+#loc108 = loc("tmp7"(#loc44))
+#loc109 = loc("tmp9"(#loc45))
+#loc110 = loc("tmp23"(#loc46))
+#loc111 = loc("tmp14"(#loc47))
+#loc112 = loc("tmp16"(#loc48))
+#loc113 = loc("tmp18"(#loc49))
+#loc114 = loc("tmp19"(#loc50))
+#loc115 = loc("tmp20"(#loc51))
+#loc116 = loc("r0_index"(#loc54))
+#loc117 = loc("r0_mask"(#loc55))
+#loc118 = loc("tmp9"(#loc56))
+#loc119 = loc("tmp9"(#loc57))
+#loc120 = loc("tmp12"(#loc58))
+#loc121 = loc("tmp12"(#loc59))
+#loc122 = loc("tmp12"(#loc60))
+#loc123 = loc("tmp12"(#loc61))
+#loc124 = loc("tmp12"(#loc62))
+#loc125 = loc("tmp23"(#loc63))
+#loc126 = loc("tmp23"(#loc64))
+#loc127 = loc("tmp11"(#loc65))
+#loc128 = loc("tmp22"(#loc66))
+#loc129 = loc("tmp24"(#loc67))
+#loc130 = loc(fused[#loc80, #loc79])
+#loc131 = loc(fused[#loc82, #loc77])
+#loc132 = loc("tmp3_m2"(#loc83))
+#loc133 = loc("new_m2"(#loc89))
+#loc134 = loc(callsite(#loc90 at #loc16))
+#loc135 = loc("new_weight"(#loc91))
+#loc136 = loc(callsite(#loc92 at #loc16))
+#loc137 = loc("new_mean"(#loc93))
+#loc138 = loc(callsite(#loc94 at #loc16))
+#loc139 = loc(callsite(#loc95 at #loc16))
+#loc140 = loc("new_m2"(#loc96))
+#loc141 = loc(callsite(#loc96 at #loc16))
+#loc142 = loc(callsite(#loc102 at #loc100))
+#loc143 = loc(callsite(#loc103 at #loc100))
+#loc144 = loc(callsite(#loc104 at #loc100))
+#loc145 = loc(callsite(#loc105 at #loc100))
+#loc146 = loc(callsite(#loc106 at #loc100))
+#loc147 = loc(callsite(#loc36 at #loc100))
+#loc148 = loc(callsite(#loc37 at #loc100))
+#loc149 = loc(callsite(#loc38 at #loc100))
+#loc150 = loc(callsite(#loc39 at #loc100))
+#loc151 = loc(callsite(#loc40 at #loc100))
+#loc152 = loc(callsite(#loc41 at #loc100))
+#loc153 = loc(callsite(#loc42 at #loc100))
+#loc154 = loc("tmp3_weight"(#loc132))
+#loc155 = loc(callsite(#loc133 at #loc16))
+#loc156 = loc(callsite(#loc135 at #loc16))
+#loc157 = loc(callsite(#loc137 at #loc16))
+#loc158 = loc(callsite(#loc140 at #loc16))
diff --git a/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.ttir b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..0def56ed8ffd5e33b836ac76b5836d67fdbc827f
--- /dev/null
+++ b/triton/S5Z477N52AES6BCQLQSCQFPIOCFND32E3P2XRI6OSCXZCB32BOSQ/triton_red_fused_add_mul_native_layer_norm_0.ttir
@@ -0,0 +1,270 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":18:0)
+#loc2 = loc(unknown)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":47:79)
+#loc72 = loc("in_ptr0"(#loc))
+#loc73 = loc("in_ptr1"(#loc))
+#loc74 = loc("in_ptr2"(#loc))
+#loc75 = loc("out_ptr2"(#loc))
+#loc76 = loc("xnumel"(#loc))
+#loc77 = loc("r0_numel"(#loc))
+#loc79 = loc(callsite(#loc2 at #loc3))
+module {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xmask = arith.constant 2304 : i32 loc(#loc78)
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc79)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc2)
+    %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16> loc(#loc2)
+    %c2048_i32 = arith.constant 2048 : i32 loc(#loc2)
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc2)
+    %cst_2 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc2)
+    %cst_3 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc2)
+    %cst_4 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc2)
+    %cst_5 = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc2)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc2)
+    %xoffset = tt.get_program_id x : i32 loc(#loc80)
+    %xmask_6 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc78)
+    %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc81)
+    %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc82)
+    %tmp3_weight:3 = scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32 iter_args(%tmp3_mean = %cst_0, %tmp3_m2 = %cst_0, %tmp3_weight_8 = %cst_0) -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc84)
+      %r0_index_9 = arith.addi %r0_index, %r0_base_7 : tensor<1x2048xi32> loc(#loc84)
+      %r0_mask = arith.cmpi slt, %r0_index_9, %cst_5 : tensor<1x2048xi32> loc(#loc85)
+      %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc86)
+      %tmp0_10 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32> loc(#loc135)
+      %tmp0_11 = arith.addi %r0_index_9, %tmp0_10 : tensor<1x2048xi32> loc(#loc87)
+      %tmp0_12 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc88)
+      %tmp0_13 = tt.addptr %tmp0_12, %tmp0_11 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc88)
+      %tmp0_14 = tt.splat %xmask_6 : i1 -> tensor<1x2048xi1> loc(#loc136)
+      %tmp0_15 = arith.andi %r0_mask, %tmp0_14 : tensor<1x2048xi1> loc(#loc89)
+      %tmp0_16 = tt.load %tmp0_13, %tmp0_15, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc90)
+      %tmp0_17 = arith.extf %tmp0_16 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc91)
+      %1 = arith.cmpi eq, %r0_offset, %c0_i32 : i32 loc(#loc16)
+      %2:3 = scf.if %1 -> (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) {
+        scf.yield %cst_0, %tmp0_17, %cst_4 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc161)
+      } else {
+        %delta = arith.subf %tmp0_17, %tmp3_mean : tensor<1x2048xf32> loc(#loc138)
+        %new_weight = arith.addf %tmp3_weight_8, %cst_4 : tensor<1x2048xf32> loc(#loc162)
+        %new_mean = arith.divf %delta, %new_weight : tensor<1x2048xf32> loc(#loc140)
+        %new_mean_21 = arith.addf %tmp3_mean, %new_mean : tensor<1x2048xf32> loc(#loc163)
+        %new_m2 = arith.subf %tmp0_17, %new_mean_21 : tensor<1x2048xf32> loc(#loc142)
+        %new_m2_22 = arith.mulf %delta, %new_m2 : tensor<1x2048xf32> loc(#loc143)
+        %new_m2_23 = arith.addf %tmp3_m2, %new_m2_22 : tensor<1x2048xf32> loc(#loc164)
+        scf.yield %new_m2_23, %new_mean_21, %new_weight : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc145)
+      } loc(#loc92)
+      %tmp3_mean_18 = arith.select %tmp0_15, %2#1, %tmp3_mean : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc101)
+      %tmp3_m2_19 = arith.select %tmp0_15, %2#0, %tmp3_m2 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc102)
+      %tmp3_weight_20 = arith.select %tmp0_15, %2#2, %tmp3_weight_8 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc103)
+      scf.yield %tmp3_mean_18, %tmp3_m2_19, %tmp3_weight_20 : tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc30)
+    } loc(#loc160)
+    %0:3 = "tt.reduce"(%tmp3_weight#0, %tmp3_weight#1, %tmp3_weight#2) <{axis = 1 : i32}> ({
+    ^bb0(%arg6: f32 loc(callsite(#loc2 at #loc3)), %arg7: f32 loc(callsite(#loc2 at #loc3)), %arg8: f32 loc(callsite(#loc2 at #loc3)), %arg9: f32 loc(callsite(#loc2 at #loc3)), %arg10: f32 loc(callsite(#loc2 at #loc3)), %arg11: f32 loc(callsite(#loc2 at #loc3))):
+      %delta = arith.subf %arg9, %arg6 : f32 loc(#loc146)
+      %new_weight = arith.addf %arg8, %arg11 : f32 loc(#loc147)
+      %w2_over_w = arith.cmpf oeq, %new_weight, %cst : f32 loc(#loc148)
+      %w2_over_w_8 = arith.divf %arg11, %new_weight : f32 loc(#loc149)
+      %w2_over_w_9 = arith.select %w2_over_w, %cst, %w2_over_w_8 : f32 loc(#loc150)
+      %1 = arith.mulf %delta, %w2_over_w_9 : f32 loc(#loc151)
+      %2 = arith.addf %arg6, %1 : f32 loc(#loc152)
+      %3 = arith.addf %arg7, %arg10 : f32 loc(#loc153)
+      %4 = arith.mulf %delta, %delta : f32 loc(#loc154)
+      %5 = arith.mulf %4, %arg8 : f32 loc(#loc155)
+      %6 = arith.mulf %5, %w2_over_w_9 : f32 loc(#loc156)
+      %7 = arith.addf %3, %6 : f32 loc(#loc157)
+      tt.reduce.return %2, %7, %new_weight : f32, f32, f32 loc(#loc104)
+    }) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc104)
+    %tmp3 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc110)
+    %tmp7 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc111)
+    scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc112)
+      %r0_index_8 = arith.addi %r0_index, %r0_base_7 : tensor<1x2048xi32> loc(#loc112)
+      %r0_mask = arith.cmpi slt, %r0_index_8, %cst_5 : tensor<1x2048xi32> loc(#loc113)
+      %tmp9 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc114)
+      %tmp9_9 = tt.addptr %tmp9, %r0_index_8 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc114)
+      %tmp9_10 = tt.load %tmp9_9, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc115)
+      %tmp9_11 = arith.extf %tmp9_10 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc116)
+      %tmp12 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc117)
+      %tmp12_12 = tt.splat %tmp12 : i32 -> tensor<1x2048xi32> loc(#loc158)
+      %tmp12_13 = arith.addi %r0_index_8, %tmp12_12 : tensor<1x2048xi32> loc(#loc118)
+      %tmp12_14 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc119)
+      %tmp12_15 = tt.addptr %tmp12_14, %tmp12_13 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc119)
+      %tmp12_16 = tt.splat %xmask_6 : i1 -> tensor<1x2048xi1> loc(#loc159)
+      %tmp12_17 = arith.andi %r0_mask, %tmp12_16 : tensor<1x2048xi1> loc(#loc120)
+      %tmp12_18 = tt.load %tmp12_15, %tmp12_17, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr<bf16>> loc(#loc121)
+      %tmp12_19 = arith.extf %tmp12_18 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc122)
+      %tmp23 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc123)
+      %tmp23_20 = tt.addptr %tmp23, %r0_index_8 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc123)
+      %tmp23_21 = tt.load %tmp23_20, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr<bf16>> loc(#loc124)
+      %tmp23_22 = arith.extf %tmp23_21 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc125)
+      %tmp11 = arith.addf %tmp9_11, %cst_4 : tensor<1x2048xf32> loc(#loc126)
+      %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc127)
+      %tmp14_23 = arith.subf %tmp12_19, %tmp14 : tensor<1x2048xf32> loc(#loc127)
+      %tmp16 = arith.divf %tmp7, %cst_3 : tensor<1x1xf32> loc(#loc128)
+      %tmp18 = arith.addf %tmp16, %cst_2 : tensor<1x1xf32> loc(#loc129)
+      %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc130)
+      %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc131)
+      %tmp20_24 = arith.mulf %tmp14_23, %tmp20 : tensor<1x2048xf32> loc(#loc131)
+      %tmp22 = arith.mulf %tmp11, %tmp20_24 : tensor<1x2048xf32> loc(#loc132)
+      %tmp24 = arith.addf %tmp22, %tmp23_22 : tensor<1x2048xf32> loc(#loc133)
+      %1 = tt.splat %out_ptr2 : !tt.ptr<bf16> -> tensor<1x2048x!tt.ptr<bf16>> loc(#loc69)
+      %2 = tt.addptr %1, %tmp12_13 : tensor<1x2048x!tt.ptr<bf16>>, tensor<1x2048xi32> loc(#loc69)
+      %3 = arith.truncf %tmp24 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc70)
+      tt.store %2, %3, %tmp12_17 : tensor<1x2048x!tt.ptr<bf16>> loc(#loc70)
+    } loc(#loc46)
+    tt.return loc(#loc71)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":25:21)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":23:28)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":26:27)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":32:43)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":33:31)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":34:29)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:46)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:41)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:34)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:61)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:51)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:112)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":42:62)
+#loc17 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":42:51)
+#loc19 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24)
+#loc21 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34)
+#loc23 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31)
+#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":44:62)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":45:58)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":46:66)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":46:8)
+#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc35 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc37 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc38 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc39 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc40 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc41 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc42 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc43 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":48:16)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":49:16)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":51:43)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":52:31)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":53:29)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:34)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:41)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:94)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:47)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:42)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:35)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:62)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:52)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:114)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:35)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:42)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:95)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":61:23)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":63:24)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":65:24)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":67:24)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":68:32)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":69:24)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":71:24)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":72:24)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:29)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:53)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":51:4)
+#loc78 = loc("xmask"(#loc1))
+#loc80 = loc("xoffset"(#loc4))
+#loc81 = loc("r0_base"(#loc5))
+#loc82 = loc("r0_base"(#loc6))
+#loc83 = loc("tmp3_mean"(#loc7))
+#loc84 = loc("r0_index"(#loc8))
+#loc85 = loc("r0_mask"(#loc9))
+#loc86 = loc("tmp0"(#loc10))
+#loc87 = loc("tmp0"(#loc11))
+#loc88 = loc("tmp0"(#loc12))
+#loc89 = loc("tmp0"(#loc13))
+#loc90 = loc("tmp0"(#loc14))
+#loc91 = loc("tmp0"(#loc15))
+#loc92 = loc(callsite(#loc17 at #loc18))
+#loc93 = loc("new_m2"(#loc19))
+#loc94 = loc("delta"(#loc20))
+#loc95 = loc("new_weight"(#loc21))
+#loc96 = loc("new_mean"(#loc22))
+#loc97 = loc("new_mean"(#loc23))
+#loc98 = loc("new_m2"(#loc24))
+#loc99 = loc("new_m2"(#loc25))
+#loc100 = loc("new_m2"(#loc26))
+#loc101 = loc("tmp3_mean"(#loc27))
+#loc102 = loc("tmp3_m2"(#loc28))
+#loc103 = loc("tmp3_weight"(#loc29))
+#loc104 = loc(callsite(#loc31 at #loc3))
+#loc105 = loc("delta"(#loc32))
+#loc106 = loc("new_weight"(#loc33))
+#loc107 = loc("w2_over_w"(#loc34))
+#loc108 = loc("w2_over_w"(#loc35))
+#loc109 = loc("w2_over_w"(#loc36))
+#loc110 = loc("tmp3"(#loc44))
+#loc111 = loc("tmp7"(#loc45))
+#loc112 = loc("r0_index"(#loc47))
+#loc113 = loc("r0_mask"(#loc48))
+#loc114 = loc("tmp9"(#loc49))
+#loc115 = loc("tmp9"(#loc50))
+#loc116 = loc("tmp9"(#loc51))
+#loc117 = loc("tmp12"(#loc52))
+#loc118 = loc("tmp12"(#loc53))
+#loc119 = loc("tmp12"(#loc54))
+#loc120 = loc("tmp12"(#loc55))
+#loc121 = loc("tmp12"(#loc56))
+#loc122 = loc("tmp12"(#loc57))
+#loc123 = loc("tmp23"(#loc58))
+#loc124 = loc("tmp23"(#loc59))
+#loc125 = loc("tmp23"(#loc60))
+#loc126 = loc("tmp11"(#loc61))
+#loc127 = loc("tmp14"(#loc62))
+#loc128 = loc("tmp16"(#loc63))
+#loc129 = loc("tmp18"(#loc64))
+#loc130 = loc("tmp19"(#loc65))
+#loc131 = loc("tmp20"(#loc66))
+#loc132 = loc("tmp22"(#loc67))
+#loc133 = loc("tmp24"(#loc68))
+#loc134 = loc("tmp3_m2"(#loc83))
+#loc135 = loc(fused[#loc87, #loc86])
+#loc136 = loc(fused[#loc89, #loc78])
+#loc137 = loc("new_m2"(#loc93))
+#loc138 = loc(callsite(#loc94 at #loc18))
+#loc139 = loc("new_weight"(#loc95))
+#loc140 = loc(callsite(#loc96 at #loc18))
+#loc141 = loc("new_mean"(#loc97))
+#loc142 = loc(callsite(#loc98 at #loc18))
+#loc143 = loc(callsite(#loc99 at #loc18))
+#loc144 = loc("new_m2"(#loc100))
+#loc145 = loc(callsite(#loc100 at #loc18))
+#loc146 = loc(callsite(#loc105 at #loc104))
+#loc147 = loc(callsite(#loc106 at #loc104))
+#loc148 = loc(callsite(#loc107 at #loc104))
+#loc149 = loc(callsite(#loc108 at #loc104))
+#loc150 = loc(callsite(#loc109 at #loc104))
+#loc151 = loc(callsite(#loc37 at #loc104))
+#loc152 = loc(callsite(#loc38 at #loc104))
+#loc153 = loc(callsite(#loc39 at #loc104))
+#loc154 = loc(callsite(#loc40 at #loc104))
+#loc155 = loc(callsite(#loc41 at #loc104))
+#loc156 = loc(callsite(#loc42 at #loc104))
+#loc157 = loc(callsite(#loc43 at #loc104))
+#loc158 = loc(fused[#loc118, #loc117])
+#loc159 = loc(fused[#loc120, #loc78])
+#loc160 = loc("tmp3_weight"(#loc134))
+#loc161 = loc(callsite(#loc137 at #loc18))
+#loc162 = loc(callsite(#loc139 at #loc18))
+#loc163 = loc(callsite(#loc141 at #loc18))
+#loc164 = loc(callsite(#loc144 at #loc18))
diff --git a/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/__grp__triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/__grp__triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..b440f6895dc388cb40c3999ebd3b8ee37e8c8fd5
--- /dev/null
+++ b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/__grp__triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.source", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttir", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttgir", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.llir", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ptx", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.cubin": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.cubin", "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json"}}
\ No newline at end of file
diff --git a/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.cubin b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..41afac6714899d52b2b8f5db21d1797580c8e5f9
Binary files /dev/null and b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.cubin differ
diff --git a/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..9d4aac61231b80282dbad2829700cf1cb258cc6b
--- /dev/null
+++ b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.json
@@ -0,0 +1 @@
+{"hash": "92745eb484786459cdf592edad39c8cc23d2be7dc5a548b0b179d760173da87e", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1"}
\ No newline at end of file
diff --git a/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.llir b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.llir
new file mode 100644
index 0000000000000000000000000000000000000000..5331a368fe28413c50f3abcc954b469f66ad1578
--- /dev/null
+++ b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.llir
@@ -0,0 +1,123 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 {
+  %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %8 = shl i32 %7, 10, !dbg !8
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %10 = shl nuw nsw i32 %9, 3, !dbg !9
+  %11 = and i32 %10, 1016, !dbg !9
+  %12 = or disjoint i32 %11, %8, !dbg !10
+  %13 = or i32 %8, %10, !dbg !9
+  %14 = or disjoint i32 %13, 1, !dbg !10
+  %15 = or disjoint i32 %13, 2, !dbg !10
+  %16 = or disjoint i32 %13, 3, !dbg !10
+  %17 = or disjoint i32 %13, 4, !dbg !10
+  %18 = or disjoint i32 %13, 5, !dbg !10
+  %19 = or disjoint i32 %13, 6, !dbg !10
+  %20 = or disjoint i32 %13, 7, !dbg !10
+  %21 = sdiv i32 %12, 128, !dbg !11
+  %22 = mul i32 %21, 128, !dbg !12
+  %.decomposed = sub i32 %12, %22, !dbg !12
+  %23 = srem i32 %14, 128, !dbg !12
+  %24 = srem i32 %15, 128, !dbg !12
+  %25 = srem i32 %16, 128, !dbg !12
+  %26 = srem i32 %17, 128, !dbg !12
+  %27 = srem i32 %18, 128, !dbg !12
+  %28 = srem i32 %19, 128, !dbg !12
+  %29 = srem i32 %20, 128, !dbg !12
+  %30 = srem i32 %21, 2304, !dbg !13
+  %31 = sdiv i32 %12, 294912, !dbg !14
+  %32 = shl nsw i32 %31, 7, !dbg !15
+  %33 = add nsw i32 %32, %.decomposed, !dbg !16
+  %34 = add nsw i32 %32, %23, !dbg !16
+  %35 = add nsw i32 %32, %24, !dbg !16
+  %36 = add nsw i32 %32, %25, !dbg !16
+  %37 = add nsw i32 %32, %26, !dbg !16
+  %38 = add nsw i32 %32, %27, !dbg !16
+  %39 = add nsw i32 %32, %28, !dbg !16
+  %40 = add nsw i32 %32, %29, !dbg !16
+  %41 = sext i32 %30 to i64, !dbg !17
+  %42 = mul i64 %2, %41, !dbg !17
+  %43 = sext i32 %33 to i64, !dbg !18
+  %44 = sext i32 %34 to i64, !dbg !18
+  %45 = sext i32 %35 to i64, !dbg !18
+  %46 = sext i32 %36 to i64, !dbg !18
+  %47 = sext i32 %37 to i64, !dbg !18
+  %48 = sext i32 %38 to i64, !dbg !18
+  %49 = sext i32 %39 to i64, !dbg !18
+  %50 = sext i32 %40 to i64, !dbg !18
+  %51 = getelementptr bfloat, ptr addrspace(1) %0, i64 %42, !dbg !19
+  %52 = getelementptr bfloat, ptr addrspace(1) %51, i64 %43, !dbg !19
+  %53 = getelementptr bfloat, ptr addrspace(1) %51, i64 %44, !dbg !19
+  %54 = getelementptr bfloat, ptr addrspace(1) %51, i64 %45, !dbg !19
+  %55 = getelementptr bfloat, ptr addrspace(1) %51, i64 %46, !dbg !19
+  %56 = getelementptr bfloat, ptr addrspace(1) %51, i64 %47, !dbg !19
+  %57 = getelementptr bfloat, ptr addrspace(1) %51, i64 %48, !dbg !19
+  %58 = getelementptr bfloat, ptr addrspace(1) %51, i64 %49, !dbg !19
+  %59 = getelementptr bfloat, ptr addrspace(1) %51, i64 %50, !dbg !19
+  %60 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09ld.global.b16 { $0 }, [ $1 + 0 ];", "=c,l"(ptr addrspace(1) %52) #2, !dbg !20
+  %61 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09ld.global.b16 { $0 }, [ $1 + 0 ];", "=c,l"(ptr addrspace(1) %53) #2, !dbg !20
+  %62 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09ld.global.b16 { $0 }, [ $1 + 0 ];", "=c,l"(ptr addrspace(1) %54) #2, !dbg !20
+  %63 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09ld.global.b16 { $0 }, [ $1 + 0 ];", "=c,l"(ptr addrspace(1) %55) #2, !dbg !20
+  %64 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09ld.global.b16 { $0 }, [ $1 + 0 ];", "=c,l"(ptr addrspace(1) %56) #2, !dbg !20
+  %65 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09ld.global.b16 { $0 }, [ $1 + 0 ];", "=c,l"(ptr addrspace(1) %57) #2, !dbg !20
+  %66 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09ld.global.b16 { $0 }, [ $1 + 0 ];", "=c,l"(ptr addrspace(1) %58) #2, !dbg !20
+  %67 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09ld.global.b16 { $0 }, [ $1 + 0 ];", "=c,l"(ptr addrspace(1) %59) #2, !dbg !20
+  %68 = sext i32 %12 to i64, !dbg !21
+  %69 = getelementptr bfloat, ptr addrspace(1) %1, i64 %68, !dbg !21
+  %70 = insertelement <2 x i16> poison, i16 %60, i64 0, !dbg !22
+  %71 = insertelement <2 x i16> %70, i16 %61, i64 1, !dbg !22
+  %72 = bitcast <2 x i16> %71 to i32, !dbg !22
+  %73 = insertelement <2 x i16> poison, i16 %62, i64 0, !dbg !22
+  %74 = insertelement <2 x i16> %73, i16 %63, i64 1, !dbg !22
+  %75 = bitcast <2 x i16> %74 to i32, !dbg !22
+  %76 = insertelement <2 x i16> poison, i16 %64, i64 0, !dbg !22
+  %77 = insertelement <2 x i16> %76, i16 %65, i64 1, !dbg !22
+  %78 = bitcast <2 x i16> %77 to i32, !dbg !22
+  %79 = insertelement <2 x i16> poison, i16 %66, i64 0, !dbg !22
+  %80 = insertelement <2 x i16> %79, i16 %67, i64 1, !dbg !22
+  %81 = bitcast <2 x i16> %80 to i32, !dbg !22
+  tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %72, i32 %75, i32 %78, i32 %81, ptr addrspace(1) %69) #2, !dbg !22
+  ret void, !dbg !23
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1", linkageName: "triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 20, column: 28, scope: !4)
+!8 = !DILocation(line: 20, column: 33, scope: !4)
+!9 = !DILocation(line: 21, column: 36, scope: !4)
+!10 = !DILocation(line: 21, column: 23, scope: !4)
+!11 = !DILocation(line: 24, column: 21, scope: !4)
+!12 = !DILocation(line: 23, column: 19, scope: !4)
+!13 = !DILocation(line: 24, column: 28, scope: !4)
+!14 = !DILocation(line: 25, column: 19, scope: !4)
+!15 = !DILocation(line: 27, column: 39, scope: !4)
+!16 = !DILocation(line: 27, column: 35, scope: !4)
+!17 = !DILocation(line: 27, column: 48, scope: !4)
+!18 = !DILocation(line: 27, column: 44, scope: !4)
+!19 = !DILocation(line: 27, column: 30, scope: !4)
+!20 = !DILocation(line: 27, column: 53, scope: !4)
+!21 = !DILocation(line: 28, column: 25, scope: !4)
+!22 = !DILocation(line: 28, column: 36, scope: !4)
+!23 = !DILocation(line: 28, column: 4, scope: !4)
diff --git a/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ptx b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..dbcbbe9ecfff9d5358bf42cde236b49b4e68b7e3
--- /dev/null
+++ b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ptx
@@ -0,0 +1,412 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1 // -- Begin function triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1
+                                        // @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1
+.visible .entry triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1(
+	.param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_1,
+	.param .u64 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_2,
+	.param .u32 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_4,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_5
+)
+.reqntid 128
+{
+	.reg .b16 	%rs<9>;
+	.reg .b32 	%r<65>;
+	.reg .b64 	%rd<17>;
+	.loc	1 18 0                          // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd10, [triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_0];
+	ld.param.b64 	%rd11, [triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_1];
+$L__tmp0:
+	.loc	1 20 28                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:20:28
+	mov.u32 	%r5, %ctaid.x;
+	.loc	1 20 33                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:20:33
+	shl.b32 	%r6, %r5, 10;
+	ld.param.b64 	%rd12, [triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1_param_2];
+	.loc	1 21 36                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:21:36
+	mov.u32 	%r7, %tid.x;
+	shl.b32 	%r8, %r7, 3;
+	and.b32 	%r9, %r8, 1016;
+	.loc	1 21 23                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:21:23
+	or.b32 	%r10, %r9, %r6;
+	.loc	1 21 36                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:21:36
+	or.b32 	%r11, %r6, %r8;
+	.loc	1 21 23                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:21:23
+	or.b32 	%r12, %r11, 1;
+	or.b32 	%r13, %r11, 2;
+	or.b32 	%r14, %r11, 3;
+	or.b32 	%r15, %r11, 4;
+	or.b32 	%r16, %r11, 5;
+	or.b32 	%r17, %r11, 6;
+	or.b32 	%r18, %r11, 7;
+	.loc	1 24 21                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:24:21
+	bfe.s32 	%r19, %r5, 21, 1;
+	.loc	1 23 19                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:23:19
+	shr.u32 	%r20, %r19, 25;
+	.loc	1 24 21                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:24:21
+	add.s32 	%r21, %r10, %r20;
+	shr.s32 	%r22, %r21, 7;
+	.loc	1 23 19                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:23:19
+	and.b32 	%r23, %r21, -128;
+	sub.s32 	%r24, %r10, %r23;
+	add.s32 	%r25, %r12, %r20;
+	and.b32 	%r26, %r25, -128;
+	sub.s32 	%r27, %r12, %r26;
+	add.s32 	%r28, %r13, %r20;
+	and.b32 	%r29, %r28, -128;
+	sub.s32 	%r30, %r13, %r29;
+	add.s32 	%r31, %r14, %r20;
+	and.b32 	%r32, %r31, -128;
+	sub.s32 	%r33, %r14, %r32;
+	add.s32 	%r34, %r15, %r20;
+	and.b32 	%r35, %r34, -128;
+	sub.s32 	%r36, %r15, %r35;
+	add.s32 	%r37, %r16, %r20;
+	and.b32 	%r38, %r37, -128;
+	sub.s32 	%r39, %r16, %r38;
+	add.s32 	%r40, %r17, %r20;
+	and.b32 	%r41, %r40, -128;
+	sub.s32 	%r42, %r17, %r41;
+	add.s32 	%r43, %r18, %r20;
+	and.b32 	%r44, %r43, -128;
+	sub.s32 	%r45, %r18, %r44;
+	.loc	1 24 28                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:24:28
+	mul.hi.s32 	%r46, %r22, 954437177;
+	shr.u32 	%r47, %r46, 31;
+	shr.s32 	%r48, %r46, 9;
+	add.s32 	%r49, %r48, %r47;
+	mul.lo.s32 	%r50, %r49, 2304;
+	sub.s32 	%r51, %r22, %r50;
+	.loc	1 25 19                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:25:19
+	mul.hi.s32 	%r52, %r10, 954437177;
+	shr.u32 	%r53, %r52, 31;
+	shr.s32 	%r54, %r52, 16;
+	add.s32 	%r55, %r54, %r53;
+	.loc	1 27 39                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:27:39
+	shl.b32 	%r56, %r55, 7;
+	.loc	1 27 35                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:27:35
+	add.s32 	%r57, %r56, %r24;
+	add.s32 	%r58, %r56, %r27;
+	add.s32 	%r59, %r56, %r30;
+	add.s32 	%r60, %r56, %r33;
+	add.s32 	%r61, %r56, %r36;
+	add.s32 	%r62, %r56, %r39;
+	add.s32 	%r63, %r56, %r42;
+	add.s32 	%r64, %r56, %r45;
+	.loc	1 27 48                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:27:48
+	cvt.s64.s32 	%rd13, %r51;
+	mul.lo.s64 	%rd14, %rd12, %rd13;
+	.loc	1 27 30                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:27:30
+	shl.b64 	%rd15, %rd14, 1;
+	add.s64 	%rd16, %rd10, %rd15;
+	mad.wide.s32 	%rd1, %r57, 2, %rd16;
+	mad.wide.s32 	%rd2, %r58, 2, %rd16;
+	mad.wide.s32 	%rd3, %r59, 2, %rd16;
+	mad.wide.s32 	%rd4, %r60, 2, %rd16;
+	mad.wide.s32 	%rd5, %r61, 2, %rd16;
+	mad.wide.s32 	%rd6, %r62, 2, %rd16;
+	mad.wide.s32 	%rd7, %r63, 2, %rd16;
+	mad.wide.s32 	%rd8, %r64, 2, %rd16;
+	.loc	1 27 53                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:27:53
+	// begin inline asm
+	mov.u16 %rs1, 0x0;
+	ld.global.b16 { %rs1 }, [ %rd1 + 0 ];
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs2, 0x0;
+	ld.global.b16 { %rs2 }, [ %rd2 + 0 ];
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs3, 0x0;
+	ld.global.b16 { %rs3 }, [ %rd3 + 0 ];
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs4, 0x0;
+	ld.global.b16 { %rs4 }, [ %rd4 + 0 ];
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs5, 0x0;
+	ld.global.b16 { %rs5 }, [ %rd5 + 0 ];
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs6, 0x0;
+	ld.global.b16 { %rs6 }, [ %rd6 + 0 ];
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs7, 0x0;
+	ld.global.b16 { %rs7 }, [ %rd7 + 0 ];
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs8, 0x0;
+	ld.global.b16 { %rs8 }, [ %rd8 + 0 ];
+	// end inline asm
+	.loc	1 28 25                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:28:25
+	mad.wide.s32 	%rd9, %r10, 2, %rd11;
+	.loc	1 28 36                         // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:28:36
+	mov.b32 	%r1, {%rs1, %rs2};
+	mov.b32 	%r2, {%rs3, %rs4};
+	mov.b32 	%r3, {%rs5, %rs6};
+	mov.b32 	%r4, {%rs7, %rs8};
+	// begin inline asm
+	st.global.v4.b32 [ %rd9 + 0 ], { %r1, %r2, %r3, %r4 };
+	// end inline asm
+	.loc	1 28 4                          // c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py:28:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 51
+.b8 118
+.b8 106
+.b8 105
+.b8 108
+.b8 118
+.b8 99
+.b8 121
+.b8 55
+.b8 115
+.b8 100
+.b8 113
+.b8 99
+.b8 97
+.b8 120
+.b8 102
+.b8 115
+.b8 112
+.b8 102
+.b8 102
+.b8 97
+.b8 100
+.b8 98
+.b8 115
+.b8 114
+.b8 121
+.b8 51
+.b8 115
+.b8 113
+.b8 109
+.b8 52
+.b8 106
+.b8 55
+.b8 113
+.b8 112
+.b8 54
+.b8 117
+.b8 51
+.b8 116
+.b8 117
+.b8 115
+.b8 114
+.b8 54
+.b8 112
+.b8 51
+.b8 52
+.b8 115
+.b8 98
+.b8 105
+.b8 97
+.b8 114
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 51
+.b8 118
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.source b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.source
new file mode 100644
index 0000000000000000000000000000000000000000..199c64a2732f37f7f42b0fb3f8cc301c559bcc0a
--- /dev/null
+++ b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.source
@@ -0,0 +1,91 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":18:0)
+#loc21 = loc("in_ptr0"(#loc))
+#loc22 = loc("out_ptr0"(#loc))
+#loc23 = loc("ks0"(#loc))
+#loc24 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 9437184 : i32 loc(#loc25)
+    %xoffset = tt.get_program_id x : i32 loc(#loc26)
+    %xoffset_1 = arith.constant 1024 : i32 loc(#loc27)
+    %xoffset_2 = arith.constant 1024 : i32 loc(#loc27)
+    %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc27)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc28)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc29)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc29)
+    %xmask = arith.constant true loc(#loc30)
+    %xmask_6 = arith.constant dense<true> : tensor<1024xi1> loc(#loc30)
+    %x0 = arith.constant 128 : i32 loc(#loc31)
+    %x0_7 = arith.constant 128 : i32 loc(#loc31)
+    %x0_8 = arith.constant dense<128> : tensor<1024xi32> loc(#loc31)
+    %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<1024xi32> loc(#loc31)
+    %x1 = arith.constant 128 : i32 loc(#loc32)
+    %x1_10 = arith.constant 128 : i32 loc(#loc32)
+    %x1_11 = arith.constant dense<128> : tensor<1024xi32> loc(#loc32)
+    %x1_12 = arith.divsi %xindex_5, %x1_11 : tensor<1024xi32> loc(#loc32)
+    %x1_13 = arith.constant 2304 : i32 loc(#loc33)
+    %x1_14 = arith.constant 2304 : i32 loc(#loc33)
+    %x1_15 = arith.constant dense<2304> : tensor<1024xi32> loc(#loc33)
+    %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<1024xi32> loc(#loc33)
+    %x2 = arith.constant 294912 : i32 loc(#loc34)
+    %x2_17 = arith.constant 294912 : i32 loc(#loc34)
+    %x2_18 = arith.constant dense<294912> : tensor<1024xi32> loc(#loc34)
+    %x2_19 = arith.divsi %xindex_5, %x2_18 : tensor<1024xi32> loc(#loc34)
+    %tmp0 = arith.constant 128 : i32 loc(#loc35)
+    %tmp0_20 = arith.constant 128 : i32 loc(#loc35)
+    %tmp0_21 = arith.constant dense<128> : tensor<1024xi32> loc(#loc35)
+    %tmp0_22 = arith.muli %tmp0_21, %x2_19 : tensor<1024xi32> loc(#loc35)
+    %tmp0_23 = arith.addi %x0_9, %tmp0_22 : tensor<1024xi32> loc(#loc36)
+    %tmp0_24 = arith.extsi %x1_16 : tensor<1024xi32> to tensor<1024xi64> loc(#loc37)
+    %tmp0_25 = tt.splat %ks0 : i64 -> tensor<1024xi64> loc(#loc37)
+    %tmp0_26 = arith.muli %tmp0_25, %tmp0_24 : tensor<1024xi64> loc(#loc37)
+    %tmp0_27 = arith.extsi %tmp0_23 : tensor<1024xi32> to tensor<1024xi64> loc(#loc38)
+    %tmp0_28 = arith.addi %tmp0_27, %tmp0_26 : tensor<1024xi64> loc(#loc38)
+    %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc39)
+    %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi64> loc(#loc39)
+    %tmp0_31 = tt.load %tmp0_30 : tensor<1024x!tt.ptr<bf16>> loc(#loc40)
+    %tmp0_32 = arith.extf %tmp0_31 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc41)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc18)
+    %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc18)
+    %2 = arith.truncf %tmp0_32 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc19)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>> loc(#loc19)
+    tt.return loc(#loc20)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":22:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":23:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":24:21)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":24:28)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":25:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:39)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:35)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:48)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:44)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:30)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:53)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:62)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:25)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:36)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:4)
+#loc25 = loc("xnumel"(#loc1))
+#loc26 = loc("xoffset"(#loc2))
+#loc27 = loc("xoffset"(#loc3))
+#loc28 = loc("xindex"(#loc4))
+#loc29 = loc("xindex"(#loc5))
+#loc30 = loc("xmask"(#loc6))
+#loc31 = loc("x0"(#loc7))
+#loc32 = loc("x1"(#loc8))
+#loc33 = loc("x1"(#loc9))
+#loc34 = loc("x2"(#loc10))
+#loc35 = loc("tmp0"(#loc11))
+#loc36 = loc("tmp0"(#loc12))
+#loc37 = loc("tmp0"(#loc13))
+#loc38 = loc("tmp0"(#loc14))
+#loc39 = loc("tmp0"(#loc15))
+#loc40 = loc("tmp0"(#loc16))
+#loc41 = loc("tmp0"(#loc17))
diff --git a/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttgir b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..3af54dbba1279f97ad547ccf9297191e53fb9d47
--- /dev/null
+++ b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttgir
@@ -0,0 +1,69 @@
+#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":18:0)
+#loc19 = loc("in_ptr0"(#loc))
+#loc20 = loc("out_ptr0"(#loc))
+#loc21 = loc("ks0"(#loc))
+#loc22 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<128> : tensor<1024xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<2304> : tensor<1024xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<294912> : tensor<1024xi32, #blocked> loc(#loc1)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc23)
+    %xoffset_2 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc24)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc25)
+    %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<1024xi32, #blocked> loc(#loc26)
+    %xindex_4 = arith.addi %xindex_3, %xindex : tensor<1024xi32, #blocked> loc(#loc26)
+    %x0 = arith.remsi %xindex_4, %cst : tensor<1024xi32, #blocked> loc(#loc27)
+    %x1 = arith.divsi %xindex_4, %cst : tensor<1024xi32, #blocked> loc(#loc28)
+    %x1_5 = arith.remsi %x1, %cst_0 : tensor<1024xi32, #blocked> loc(#loc29)
+    %x2 = arith.divsi %xindex_4, %cst_1 : tensor<1024xi32, #blocked> loc(#loc30)
+    %tmp0 = arith.muli %x2, %cst : tensor<1024xi32, #blocked> loc(#loc31)
+    %tmp0_6 = arith.addi %x0, %tmp0 : tensor<1024xi32, #blocked> loc(#loc32)
+    %tmp0_7 = arith.extsi %x1_5 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked> loc(#loc33)
+    %tmp0_8 = tt.splat %ks0 : i64 -> tensor<1024xi64, #blocked> loc(#loc33)
+    %tmp0_9 = arith.muli %tmp0_8, %tmp0_7 : tensor<1024xi64, #blocked> loc(#loc33)
+    %tmp0_10 = arith.extsi %tmp0_6 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked> loc(#loc34)
+    %tmp0_11 = arith.addi %tmp0_10, %tmp0_9 : tensor<1024xi64, #blocked> loc(#loc34)
+    %tmp0_12 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc35)
+    %tmp0_13 = tt.addptr %tmp0_12, %tmp0_11 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi64, #blocked> loc(#loc35)
+    %tmp0_14 = tt.load %tmp0_13 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc36)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc16)
+    %1 = tt.addptr %0, %xindex_4 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc16)
+    tt.store %1, %tmp0_14 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc17)
+    tt.return loc(#loc18)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":23:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":24:21)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":24:28)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":25:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:39)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:48)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:44)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:53)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:25)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:36)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:4)
+#loc23 = loc("xoffset"(#loc2))
+#loc24 = loc("xoffset"(#loc3))
+#loc25 = loc("xindex"(#loc4))
+#loc26 = loc("xindex"(#loc5))
+#loc27 = loc("x0"(#loc6))
+#loc28 = loc("x1"(#loc7))
+#loc29 = loc("x1"(#loc8))
+#loc30 = loc("x2"(#loc9))
+#loc31 = loc("tmp0"(#loc10))
+#loc32 = loc("tmp0"(#loc11))
+#loc33 = loc("tmp0"(#loc12))
+#loc34 = loc("tmp0"(#loc13))
+#loc35 = loc("tmp0"(#loc14))
+#loc36 = loc("tmp0"(#loc15))
diff --git a/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttir b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..695072be3d76b7acc3a4f38b021280208523f65e
--- /dev/null
+++ b/triton/SJ2F5NEEPBSFTTPVSLW22OOIZQR5FPT5YWSURMFRPHLWAFZ5VB7A/triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1.ttir
@@ -0,0 +1,68 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":18:0)
+#loc19 = loc("in_ptr0"(#loc))
+#loc20 = loc("out_ptr0"(#loc))
+#loc21 = loc("ks0"(#loc))
+#loc22 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused__scaled_dot_product_cudnn_attention_clone_permute_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %x2 = arith.constant dense<294912> : tensor<1024xi32> loc(#loc23)
+    %x1 = arith.constant dense<2304> : tensor<1024xi32> loc(#loc24)
+    %cst = arith.constant dense<128> : tensor<1024xi32> loc(#loc3)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc3)
+    %xoffset = tt.get_program_id x : i32 loc(#loc25)
+    %xoffset_0 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc26)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc27)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<1024xi32> loc(#loc28)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<1024xi32> loc(#loc28)
+    %x0 = arith.remsi %xindex_2, %cst : tensor<1024xi32> loc(#loc29)
+    %x1_3 = arith.divsi %xindex_2, %cst : tensor<1024xi32> loc(#loc30)
+    %x1_4 = arith.remsi %x1_3, %x1 : tensor<1024xi32> loc(#loc24)
+    %x2_5 = arith.divsi %xindex_2, %x2 : tensor<1024xi32> loc(#loc23)
+    %tmp0 = arith.muli %x2_5, %cst : tensor<1024xi32> loc(#loc31)
+    %tmp0_6 = arith.addi %x0, %tmp0 : tensor<1024xi32> loc(#loc32)
+    %tmp0_7 = arith.extsi %x1_4 : tensor<1024xi32> to tensor<1024xi64> loc(#loc33)
+    %tmp0_8 = tt.splat %ks0 : i64 -> tensor<1024xi64> loc(#loc33)
+    %tmp0_9 = arith.muli %tmp0_8, %tmp0_7 : tensor<1024xi64> loc(#loc33)
+    %tmp0_10 = arith.extsi %tmp0_6 : tensor<1024xi32> to tensor<1024xi64> loc(#loc34)
+    %tmp0_11 = arith.addi %tmp0_10, %tmp0_9 : tensor<1024xi64> loc(#loc34)
+    %tmp0_12 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc35)
+    %tmp0_13 = tt.addptr %tmp0_12, %tmp0_11 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi64> loc(#loc35)
+    %tmp0_14 = tt.load %tmp0_13 : tensor<1024x!tt.ptr<bf16>> loc(#loc36)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc16)
+    %1 = tt.addptr %0, %xindex_2 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc16)
+    tt.store %1, %tmp0_14 : tensor<1024x!tt.ptr<bf16>> loc(#loc17)
+    tt.return loc(#loc18)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":25:19)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":24:28)
+#loc3 = loc(unknown)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":20:28)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":20:33)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":21:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":21:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":23:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":24:21)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:39)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:48)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:44)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":27:53)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:25)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:36)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/3v/c3vjilvcy7sdqcaxfspffadbsry3sqm4j7qp6u3tusr6p34sbiar.py":28:4)
+#loc23 = loc("x2"(#loc1))
+#loc24 = loc("x1"(#loc2))
+#loc25 = loc("xoffset"(#loc4))
+#loc26 = loc("xoffset"(#loc5))
+#loc27 = loc("xindex"(#loc6))
+#loc28 = loc("xindex"(#loc7))
+#loc29 = loc("x0"(#loc8))
+#loc30 = loc("x1"(#loc9))
+#loc31 = loc("tmp0"(#loc10))
+#loc32 = loc("tmp0"(#loc11))
+#loc33 = loc("tmp0"(#loc12))
+#loc34 = loc("tmp0"(#loc13))
+#loc35 = loc("tmp0"(#loc14))
+#loc36 = loc("tmp0"(#loc15))
diff --git a/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/__grp__triton_poi_fused_clone_0.json b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/__grp__triton_poi_fused_clone_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..5a91cb061705d4a758630b7bef07113222f472eb
--- /dev/null
+++ b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/__grp__triton_poi_fused_clone_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused_clone_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.source", "triton_poi_fused_clone_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.ttir", "triton_poi_fused_clone_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.ttgir", "triton_poi_fused_clone_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.llir", "triton_poi_fused_clone_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.ptx", "triton_poi_fused_clone_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.cubin", "triton_poi_fused_clone_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.json"}}
\ No newline at end of file
diff --git a/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.cubin b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..055052cc157b8fe4afe03346b48c610808954ce5
Binary files /dev/null and b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.cubin differ
diff --git a/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.json b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..f3dd9bf66d959d8878c3edb690f9686c2a48590b
--- /dev/null
+++ b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.json
@@ -0,0 +1 @@
+{"hash": "9590ed34b2c1f6d55fc4b2d376fecdd40fa57c936ed24d0611166eb42074b39c", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_clone_0"}
\ No newline at end of file
diff --git a/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.llir b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..4e49e98d7484e2bf4e86c108d16128fc6ef3b2ba
--- /dev/null
+++ b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.llir
@@ -0,0 +1,49 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused_clone_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 {
+  %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %7 = shl i32 %6, 9, !dbg !8
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %9 = shl nuw nsw i32 %8, 1, !dbg !9
+  %10 = and i32 %9, 510, !dbg !9
+  %11 = or disjoint i32 %10, %7, !dbg !10
+  %12 = sext i32 %11 to i64, !dbg !11
+  %13 = getelementptr bfloat, ptr addrspace(1) %0, i64 %12, !dbg !11
+  %14 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %13) #2, !dbg !12
+  %15 = getelementptr bfloat, ptr addrspace(1) %1, i64 %12, !dbg !13
+  tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %14, ptr addrspace(1) %15) #2, !dbg !14
+  ret void, !dbg !15
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+attributes #0 = { nounwind "nvvm.reqntid"="256" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused_clone_0", linkageName: "triton_poi_fused_clone_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 20, column: 28, scope: !4)
+!8 = !DILocation(line: 20, column: 33, scope: !4)
+!9 = !DILocation(line: 21, column: 36, scope: !4)
+!10 = !DILocation(line: 21, column: 23, scope: !4)
+!11 = !DILocation(line: 24, column: 30, scope: !4)
+!12 = !DILocation(line: 24, column: 35, scope: !4)
+!13 = !DILocation(line: 25, column: 25, scope: !4)
+!14 = !DILocation(line: 25, column: 36, scope: !4)
+!15 = !DILocation(line: 25, column: 4, scope: !4)
diff --git a/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.ptx b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..600b92ae91bba525917638c0ed6e7d3e0a212bd0
--- /dev/null
+++ b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.ptx
@@ -0,0 +1,302 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused_clone_0 // -- Begin function triton_poi_fused_clone_0
+                                        // @triton_poi_fused_clone_0
+.visible .entry triton_poi_fused_clone_0(
+	.param .u64 .ptr .global .align 1 triton_poi_fused_clone_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_clone_0_param_1,
+	.param .u32 triton_poi_fused_clone_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_clone_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_clone_0_param_4
+)
+.reqntid 256
+{
+	.reg .b32 	%r<8>;
+	.reg .b64 	%rd<6>;
+	.loc	1 18 0                          // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd3, [triton_poi_fused_clone_0_param_0];
+	ld.param.b64 	%rd4, [triton_poi_fused_clone_0_param_1];
+$L__tmp0:
+	.loc	1 20 28                         // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:20:28
+	mov.u32 	%r2, %ctaid.x;
+	.loc	1 20 33                         // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:20:33
+	shl.b32 	%r3, %r2, 9;
+	.loc	1 21 36                         // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:21:36
+	mov.u32 	%r4, %tid.x;
+	shl.b32 	%r5, %r4, 1;
+	and.b32 	%r6, %r5, 510;
+	.loc	1 21 23                         // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:21:23
+	or.b32 	%r7, %r6, %r3;
+	.loc	1 24 30                         // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:24:30
+	mul.wide.s32 	%rd5, %r7, 2;
+	add.s64 	%rd1, %rd3, %rd5;
+	.loc	1 24 35                         // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:24:35
+	// begin inline asm
+	mov.u32 %r1, 0x0;
+	ld.global.b32 { %r1 }, [ %rd1 + 0 ];
+	// end inline asm
+	.loc	1 25 25                         // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:25:25
+	add.s64 	%rd2, %rd4, %rd5;
+	.loc	1 25 36                         // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:25:36
+	// begin inline asm
+	st.global.b32 [ %rd2 + 0 ], { %r1 };
+	// end inline asm
+	.loc	1 25 4                          // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:25:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 99
+.b8 122
+.b8 103
+.b8 55
+.b8 116
+.b8 112
+.b8 105
+.b8 116
+.b8 117
+.b8 112
+.b8 114
+.b8 119
+.b8 103
+.b8 113
+.b8 112
+.b8 117
+.b8 97
+.b8 106
+.b8 122
+.b8 121
+.b8 50
+.b8 110
+.b8 121
+.b8 108
+.b8 102
+.b8 107
+.b8 52
+.b8 51
+.b8 109
+.b8 100
+.b8 111
+.b8 122
+.b8 100
+.b8 53
+.b8 118
+.b8 119
+.b8 111
+.b8 55
+.b8 55
+.b8 109
+.b8 117
+.b8 113
+.b8 51
+.b8 107
+.b8 111
+.b8 115
+.b8 112
+.b8 110
+.b8 102
+.b8 55
+.b8 98
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 99
+.b8 122
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.source b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..0278f8e8405a4b1bfa1acbdcd0f84a4a64473918
--- /dev/null
+++ b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.source
@@ -0,0 +1,48 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":18:0)
+#loc13 = loc("in_ptr0"(#loc))
+#loc14 = loc("out_ptr0"(#loc))
+#loc15 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_clone_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 8388608 : i32 loc(#loc16)
+    %xoffset = tt.get_program_id x : i32 loc(#loc17)
+    %xoffset_1 = arith.constant 512 : i32 loc(#loc18)
+    %xoffset_2 = arith.constant 512 : i32 loc(#loc18)
+    %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc18)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc19)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc20)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc20)
+    %xmask = arith.constant true loc(#loc21)
+    %xmask_6 = arith.constant dense<true> : tensor<512xi1> loc(#loc21)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc22)
+    %tmp0_7 = tt.addptr %tmp0, %xindex_5 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc22)
+    %tmp0_8 = tt.load %tmp0_7 : tensor<512x!tt.ptr<bf16>> loc(#loc23)
+    %tmp0_9 = arith.extf %tmp0_8 : tensor<512xbf16> to tensor<512xf32> loc(#loc24)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc10)
+    %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc10)
+    %2 = arith.truncf %tmp0_9 : tensor<512xf32> to tensor<512xbf16> loc(#loc11)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>> loc(#loc11)
+    tt.return loc(#loc12)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":22:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:30)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:35)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:44)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:25)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:36)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:4)
+#loc16 = loc("xnumel"(#loc1))
+#loc17 = loc("xoffset"(#loc2))
+#loc18 = loc("xoffset"(#loc3))
+#loc19 = loc("xindex"(#loc4))
+#loc20 = loc("xindex"(#loc5))
+#loc21 = loc("xmask"(#loc6))
+#loc22 = loc("tmp0"(#loc7))
+#loc23 = loc("tmp0"(#loc8))
+#loc24 = loc("tmp0"(#loc9))
diff --git a/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.ttgir b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..9e0bdd3bd4ab82744d77aea6cd7952c3e50475b5
--- /dev/null
+++ b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.ttgir
@@ -0,0 +1,38 @@
+#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":18:0)
+#loc11 = loc("in_ptr0"(#loc))
+#loc12 = loc("out_ptr0"(#loc))
+#loc13 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused_clone_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %c512_i32 = arith.constant 512 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc14)
+    %xoffset_0 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc15)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc16)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<512xi32, #blocked> loc(#loc17)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<512xi32, #blocked> loc(#loc17)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc18)
+    %tmp0_3 = tt.addptr %tmp0, %xindex_2 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc18)
+    %tmp0_4 = tt.load %tmp0_3 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc19)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc8)
+    %1 = tt.addptr %0, %xindex_2 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc8)
+    tt.store %1, %tmp0_4 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc9)
+    tt.return loc(#loc10)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:30)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:35)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:25)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:36)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:4)
+#loc14 = loc("xoffset"(#loc2))
+#loc15 = loc("xoffset"(#loc3))
+#loc16 = loc("xindex"(#loc4))
+#loc17 = loc("xindex"(#loc5))
+#loc18 = loc("tmp0"(#loc6))
+#loc19 = loc("tmp0"(#loc7))
diff --git a/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.ttir b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..103f351850775c2f0700eff25ca1a4616c55313d
--- /dev/null
+++ b/triton/SWIO2NFSYH3NKX6EWLJXN7WN2QH2K7ETN3JE2BQRCZXLIIDUWOOA/triton_poi_fused_clone_0.ttir
@@ -0,0 +1,37 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":18:0)
+#loc11 = loc("in_ptr0"(#loc))
+#loc12 = loc("out_ptr0"(#loc))
+#loc13 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_clone_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %c512_i32 = arith.constant 512 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc14)
+    %xoffset_0 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc15)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc16)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<512xi32> loc(#loc17)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<512xi32> loc(#loc17)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc18)
+    %tmp0_3 = tt.addptr %tmp0, %xindex_2 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc18)
+    %tmp0_4 = tt.load %tmp0_3 : tensor<512x!tt.ptr<bf16>> loc(#loc19)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc8)
+    %1 = tt.addptr %0, %xindex_2 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc8)
+    tt.store %1, %tmp0_4 : tensor<512x!tt.ptr<bf16>> loc(#loc9)
+    tt.return loc(#loc10)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:30)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:35)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:25)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:36)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:4)
+#loc14 = loc("xoffset"(#loc2))
+#loc15 = loc("xoffset"(#loc3))
+#loc16 = loc("xindex"(#loc4))
+#loc17 = loc("xindex"(#loc5))
+#loc18 = loc("tmp0"(#loc6))
+#loc19 = loc("tmp0"(#loc7))
diff --git a/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/__grp__triton_red_fused_add_mul_native_layer_norm_0.json b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/__grp__triton_red_fused_add_mul_native_layer_norm_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..083ac03b785483c0c6b04b854a1fcce889cd658c
--- /dev/null
+++ b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/__grp__triton_red_fused_add_mul_native_layer_norm_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused_add_mul_native_layer_norm_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.source", "triton_red_fused_add_mul_native_layer_norm_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.ttir", "triton_red_fused_add_mul_native_layer_norm_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.ttgir", "triton_red_fused_add_mul_native_layer_norm_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.llir", "triton_red_fused_add_mul_native_layer_norm_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.ptx", "triton_red_fused_add_mul_native_layer_norm_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.cubin", "triton_red_fused_add_mul_native_layer_norm_0.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.json"}}
\ No newline at end of file
diff --git a/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.cubin b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..b1846ac86154d050ac5ae2d56051eb2aac1822d0
Binary files /dev/null and b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.cubin differ
diff --git a/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.json b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..25a5084020b5275d21fb2a9f28e5c061831a4397
--- /dev/null
+++ b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.json
@@ -0,0 +1 @@
+{"hash": "9806a21d61634efc263f3b35c888fa94029b43a9db7412712bffad65533447c2", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 192, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_add_mul_native_layer_norm_0"}
\ No newline at end of file
diff --git a/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.llir b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..d8495bd5c7e7bb482050746dacb50e85af071fc1
--- /dev/null
+++ b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.llir
@@ -0,0 +1,601 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external addrspace(3) global [0 x i8], align 16
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused_add_mul_native_layer_norm_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 {
+__nv_rsqrtf.exit:
+  %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8
+  %12 = icmp samesign ult i32 %11, 2048, !dbg !9
+  %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %14 = and i32 %13, 511, !dbg !10
+  %15 = and i32 %13, 31, !dbg !10
+  %16 = lshr i32 %14, 5, !dbg !10
+  %17 = shl nuw nsw i32 %13, 3, !dbg !10
+  %18 = and i32 %17, 4088, !dbg !10
+  %19 = shl i32 %11, 12, !dbg !11
+  %20 = or disjoint i32 %18, %19, !dbg !12
+  %21 = sext i32 %20 to i64, !dbg !13
+  %22 = getelementptr bfloat, ptr addrspace(1) %0, i64 %21, !dbg !13
+  %23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !14
+  %24 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %22, i64 %23, i1 %12) #6, !dbg !14
+  %25 = extractvalue { i32, i32, i32, i32 } %24, 0, !dbg !14
+  %26 = bitcast i32 %25 to <2 x bfloat>, !dbg !14
+  %27 = extractvalue { i32, i32, i32, i32 } %24, 1, !dbg !14
+  %28 = bitcast i32 %27 to <2 x bfloat>, !dbg !14
+  %29 = extractvalue { i32, i32, i32, i32 } %24, 2, !dbg !14
+  %30 = bitcast i32 %29 to <2 x bfloat>, !dbg !14
+  %31 = extractvalue { i32, i32, i32, i32 } %24, 3, !dbg !14
+  %32 = bitcast i32 %31 to <2 x bfloat>, !dbg !14
+  %33 = zext nneg i32 %18 to i64, !dbg !15
+  %34 = getelementptr bfloat, ptr addrspace(1) %1, i64 %33, !dbg !15
+  %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !16
+  %36 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %34, i64 %35, i1 true) #6, !dbg !16
+  %37 = extractvalue { i32, i32, i32, i32 } %36, 0, !dbg !16
+  %38 = bitcast i32 %37 to <2 x bfloat>, !dbg !16
+  %39 = extractvalue { i32, i32, i32, i32 } %36, 1, !dbg !16
+  %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !16
+  %41 = extractvalue { i32, i32, i32, i32 } %36, 2, !dbg !16
+  %42 = bitcast i32 %41 to <2 x bfloat>, !dbg !16
+  %43 = extractvalue { i32, i32, i32, i32 } %36, 3, !dbg !16
+  %44 = bitcast i32 %43 to <2 x bfloat>, !dbg !16
+  %45 = getelementptr bfloat, ptr addrspace(1) %2, i64 %21, !dbg !17
+  %46 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !18
+  %47 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %45, i64 %46, i1 %12) #6, !dbg !18
+  %48 = extractvalue { i32, i32, i32, i32 } %47, 0, !dbg !18
+  %49 = bitcast i32 %48 to <2 x bfloat>, !dbg !18
+  %50 = extractvalue { i32, i32, i32, i32 } %47, 1, !dbg !18
+  %51 = bitcast i32 %50 to <2 x bfloat>, !dbg !18
+  %52 = extractvalue { i32, i32, i32, i32 } %47, 2, !dbg !18
+  %53 = bitcast i32 %52 to <2 x bfloat>, !dbg !18
+  %54 = extractvalue { i32, i32, i32, i32 } %47, 3, !dbg !18
+  %55 = bitcast i32 %54 to <2 x bfloat>, !dbg !18
+  %56 = select i1 %12, float 1.000000e+00, float 0.000000e+00, !dbg !19
+  %57 = getelementptr bfloat, ptr addrspace(1) %5, i64 %21, !dbg !20
+  %58 = fpext <2 x bfloat> %26 to <2 x float>, !dbg !21
+  %59 = fpext <2 x bfloat> %38 to <2 x float>, !dbg !22
+  %60 = fpext <2 x bfloat> %49 to <2 x float>, !dbg !23
+  %61 = fmul <2 x float> %59, %60, !dbg !24
+  %62 = fadd <2 x float> %61, %58, !dbg !25
+  %63 = extractelement <2 x float> %62, i64 0, !dbg !26
+  %64 = select i1 %12, float %63, float 0.000000e+00, !dbg !26
+  %65 = extractelement <2 x float> %62, i64 1, !dbg !26
+  %66 = select i1 %12, float %65, float 0.000000e+00, !dbg !26
+  %67 = fptrunc <2 x float> %62 to <2 x bfloat>, !dbg !27
+  %68 = fpext <2 x bfloat> %28 to <2 x float>, !dbg !21
+  %69 = fpext <2 x bfloat> %40 to <2 x float>, !dbg !22
+  %70 = fpext <2 x bfloat> %51 to <2 x float>, !dbg !23
+  %71 = fmul <2 x float> %69, %70, !dbg !24
+  %72 = fadd <2 x float> %71, %68, !dbg !25
+  %73 = extractelement <2 x float> %72, i64 0, !dbg !26
+  %74 = select i1 %12, float %73, float 0.000000e+00, !dbg !26
+  %75 = extractelement <2 x float> %72, i64 1, !dbg !26
+  %76 = select i1 %12, float %75, float 0.000000e+00, !dbg !26
+  %77 = fptrunc <2 x float> %72 to <2 x bfloat>, !dbg !27
+  %78 = fpext <2 x bfloat> %30 to <2 x float>, !dbg !21
+  %79 = fpext <2 x bfloat> %42 to <2 x float>, !dbg !22
+  %80 = fpext <2 x bfloat> %53 to <2 x float>, !dbg !23
+  %81 = fmul <2 x float> %79, %80, !dbg !24
+  %82 = fadd <2 x float> %81, %78, !dbg !25
+  %83 = extractelement <2 x float> %82, i64 0, !dbg !26
+  %84 = select i1 %12, float %83, float 0.000000e+00, !dbg !26
+  %85 = extractelement <2 x float> %82, i64 1, !dbg !26
+  %86 = select i1 %12, float %85, float 0.000000e+00, !dbg !26
+  %87 = fptrunc <2 x float> %82 to <2 x bfloat>, !dbg !27
+  %88 = fpext <2 x bfloat> %32 to <2 x float>, !dbg !21
+  %89 = fpext <2 x bfloat> %44 to <2 x float>, !dbg !22
+  %90 = fpext <2 x bfloat> %55 to <2 x float>, !dbg !23
+  %91 = fmul <2 x float> %89, %90, !dbg !24
+  %92 = fadd <2 x float> %91, %88, !dbg !25
+  %93 = extractelement <2 x float> %92, i64 0, !dbg !26
+  %94 = select i1 %12, float %93, float 0.000000e+00, !dbg !26
+  %95 = extractelement <2 x float> %92, i64 1, !dbg !26
+  %96 = select i1 %12, float %95, float 0.000000e+00, !dbg !26
+  %97 = fptrunc <2 x float> %92 to <2 x bfloat>, !dbg !27
+  %98 = bitcast <2 x bfloat> %67 to i32, !dbg !27
+  %99 = bitcast <2 x bfloat> %77 to i32, !dbg !27
+  %100 = bitcast <2 x bfloat> %87 to i32, !dbg !27
+  %101 = bitcast <2 x bfloat> %97 to i32, !dbg !27
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %98, i32 %99, i32 %100, i32 %101, ptr addrspace(1) %57, i1 %12) #6, !dbg !27
+  %102 = fsub float %66, %64, !dbg !28
+  %103 = select i1 %12, float 2.000000e+00, float 0.000000e+00, !dbg !34
+  %104 = fcmp oeq float %103, 0.000000e+00, !dbg !35
+  %105 = tail call float @llvm.nvvm.div.full(float %56, float %103), !dbg !36
+  %106 = select i1 %104, float 0.000000e+00, float %105, !dbg !37
+  %107 = fmul float %106, %102, !dbg !38
+  %108 = fadd float %64, %107, !dbg !39
+  %109 = fmul float %102, %102, !dbg !40
+  %110 = fmul float %56, %109, !dbg !41
+  %111 = fmul float %106, %110, !dbg !42
+  %112 = fadd float %111, 0.000000e+00, !dbg !43
+  %113 = fsub float %74, %108, !dbg !28
+  %114 = select i1 %12, float 3.000000e+00, float 0.000000e+00, !dbg !34
+  %115 = fcmp oeq float %114, 0.000000e+00, !dbg !35
+  %116 = tail call float @llvm.nvvm.div.full(float %56, float %114), !dbg !36
+  %117 = select i1 %115, float 0.000000e+00, float %116, !dbg !37
+  %118 = fmul float %117, %113, !dbg !38
+  %119 = fadd float %108, %118, !dbg !39
+  %120 = fmul float %113, %113, !dbg !40
+  %121 = fmul float %103, %120, !dbg !41
+  %122 = fmul float %117, %121, !dbg !42
+  %123 = fadd float %112, %122, !dbg !43
+  %124 = fsub float %76, %119, !dbg !28
+  %125 = select i1 %12, float 4.000000e+00, float 0.000000e+00, !dbg !34
+  %126 = fcmp oeq float %125, 0.000000e+00, !dbg !35
+  %127 = tail call float @llvm.nvvm.div.full(float %56, float %125), !dbg !36
+  %128 = select i1 %126, float 0.000000e+00, float %127, !dbg !37
+  %129 = fmul float %128, %124, !dbg !38
+  %130 = fadd float %119, %129, !dbg !39
+  %131 = fmul float %124, %124, !dbg !40
+  %132 = fmul float %114, %131, !dbg !41
+  %133 = fmul float %128, %132, !dbg !42
+  %134 = fadd float %123, %133, !dbg !43
+  %135 = fsub float %84, %130, !dbg !28
+  %136 = select i1 %12, float 5.000000e+00, float 0.000000e+00, !dbg !34
+  %137 = fcmp oeq float %136, 0.000000e+00, !dbg !35
+  %138 = tail call float @llvm.nvvm.div.full(float %56, float %136), !dbg !36
+  %139 = select i1 %137, float 0.000000e+00, float %138, !dbg !37
+  %140 = fmul float %139, %135, !dbg !38
+  %141 = fadd float %130, %140, !dbg !39
+  %142 = fmul float %135, %135, !dbg !40
+  %143 = fmul float %125, %142, !dbg !41
+  %144 = fmul float %139, %143, !dbg !42
+  %145 = fadd float %134, %144, !dbg !43
+  %146 = fsub float %86, %141, !dbg !28
+  %147 = select i1 %12, float 6.000000e+00, float 0.000000e+00, !dbg !34
+  %148 = fcmp oeq float %147, 0.000000e+00, !dbg !35
+  %149 = tail call float @llvm.nvvm.div.full(float %56, float %147), !dbg !36
+  %150 = select i1 %148, float 0.000000e+00, float %149, !dbg !37
+  %151 = fmul float %150, %146, !dbg !38
+  %152 = fadd float %141, %151, !dbg !39
+  %153 = fmul float %146, %146, !dbg !40
+  %154 = fmul float %136, %153, !dbg !41
+  %155 = fmul float %150, %154, !dbg !42
+  %156 = fadd float %145, %155, !dbg !43
+  %157 = fsub float %94, %152, !dbg !28
+  %158 = select i1 %12, float 7.000000e+00, float 0.000000e+00, !dbg !34
+  %159 = fcmp oeq float %158, 0.000000e+00, !dbg !35
+  %160 = tail call float @llvm.nvvm.div.full(float %56, float %158), !dbg !36
+  %161 = select i1 %159, float 0.000000e+00, float %160, !dbg !37
+  %162 = fmul float %161, %157, !dbg !38
+  %163 = fadd float %152, %162, !dbg !39
+  %164 = fmul float %157, %157, !dbg !40
+  %165 = fmul float %147, %164, !dbg !41
+  %166 = fmul float %161, %165, !dbg !42
+  %167 = fadd float %156, %166, !dbg !43
+  %168 = fsub float %96, %163, !dbg !28
+  %169 = select i1 %12, float 8.000000e+00, float 0.000000e+00, !dbg !34
+  %170 = fcmp oeq float %169, 0.000000e+00, !dbg !35
+  %171 = tail call float @llvm.nvvm.div.full(float %56, float %169), !dbg !36
+  %172 = select i1 %170, float 0.000000e+00, float %171, !dbg !37
+  %173 = fmul float %172, %168, !dbg !38
+  %174 = fadd float %163, %173, !dbg !39
+  %175 = fmul float %168, %168, !dbg !40
+  %176 = fmul float %158, %175, !dbg !41
+  %177 = fmul float %172, %176, !dbg !42
+  %178 = fadd float %167, %177, !dbg !43
+  %179 = bitcast float %174 to i32, !dbg !31
+  %180 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %179, i32 16, i32 31), !dbg !31
+  %181 = bitcast i32 %180 to float, !dbg !31
+  %182 = bitcast float %178 to i32, !dbg !31
+  %183 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %182, i32 16, i32 31), !dbg !31
+  %184 = bitcast i32 %183 to float, !dbg !31
+  %185 = bitcast float %169 to i32, !dbg !31
+  %186 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %185, i32 16, i32 31), !dbg !31
+  %187 = bitcast i32 %186 to float, !dbg !31
+  %188 = fsub float %181, %174, !dbg !28
+  %189 = fadd float %169, %187, !dbg !34
+  %190 = fcmp oeq float %189, 0.000000e+00, !dbg !35
+  %191 = tail call float @llvm.nvvm.div.full(float %187, float %189), !dbg !36
+  %192 = select i1 %190, float 0.000000e+00, float %191, !dbg !37
+  %193 = fmul float %192, %188, !dbg !38
+  %194 = fadd float %174, %193, !dbg !39
+  %195 = fadd float %178, %184, !dbg !44
+  %196 = fmul float %188, %188, !dbg !40
+  %197 = fmul float %169, %196, !dbg !41
+  %198 = fmul float %192, %197, !dbg !42
+  %199 = fadd float %195, %198, !dbg !43
+  %200 = bitcast float %194 to i32, !dbg !31
+  %201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 8, i32 31), !dbg !31
+  %202 = bitcast i32 %201 to float, !dbg !31
+  %203 = bitcast float %199 to i32, !dbg !31
+  %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 8, i32 31), !dbg !31
+  %205 = bitcast i32 %204 to float, !dbg !31
+  %206 = bitcast float %189 to i32, !dbg !31
+  %207 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %206, i32 8, i32 31), !dbg !31
+  %208 = bitcast i32 %207 to float, !dbg !31
+  %209 = fsub float %202, %194, !dbg !28
+  %210 = fadd float %189, %208, !dbg !34
+  %211 = fcmp oeq float %210, 0.000000e+00, !dbg !35
+  %212 = tail call float @llvm.nvvm.div.full(float %208, float %210), !dbg !36
+  %213 = select i1 %211, float 0.000000e+00, float %212, !dbg !37
+  %214 = fmul float %213, %209, !dbg !38
+  %215 = fadd float %194, %214, !dbg !39
+  %216 = fadd float %199, %205, !dbg !44
+  %217 = fmul float %209, %209, !dbg !40
+  %218 = fmul float %189, %217, !dbg !41
+  %219 = fmul float %213, %218, !dbg !42
+  %220 = fadd float %216, %219, !dbg !43
+  %221 = bitcast float %215 to i32, !dbg !31
+  %222 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %221, i32 4, i32 31), !dbg !31
+  %223 = bitcast i32 %222 to float, !dbg !31
+  %224 = bitcast float %220 to i32, !dbg !31
+  %225 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %224, i32 4, i32 31), !dbg !31
+  %226 = bitcast i32 %225 to float, !dbg !31
+  %227 = bitcast float %210 to i32, !dbg !31
+  %228 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %227, i32 4, i32 31), !dbg !31
+  %229 = bitcast i32 %228 to float, !dbg !31
+  %230 = fsub float %223, %215, !dbg !28
+  %231 = fadd float %210, %229, !dbg !34
+  %232 = fcmp oeq float %231, 0.000000e+00, !dbg !35
+  %233 = tail call float @llvm.nvvm.div.full(float %229, float %231), !dbg !36
+  %234 = select i1 %232, float 0.000000e+00, float %233, !dbg !37
+  %235 = fmul float %234, %230, !dbg !38
+  %236 = fadd float %215, %235, !dbg !39
+  %237 = fadd float %220, %226, !dbg !44
+  %238 = fmul float %230, %230, !dbg !40
+  %239 = fmul float %210, %238, !dbg !41
+  %240 = fmul float %234, %239, !dbg !42
+  %241 = fadd float %237, %240, !dbg !43
+  %242 = bitcast float %236 to i32, !dbg !31
+  %243 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %242, i32 2, i32 31), !dbg !31
+  %244 = bitcast i32 %243 to float, !dbg !31
+  %245 = bitcast float %241 to i32, !dbg !31
+  %246 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %245, i32 2, i32 31), !dbg !31
+  %247 = bitcast i32 %246 to float, !dbg !31
+  %248 = bitcast float %231 to i32, !dbg !31
+  %249 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %248, i32 2, i32 31), !dbg !31
+  %250 = bitcast i32 %249 to float, !dbg !31
+  %251 = fsub float %244, %236, !dbg !28
+  %252 = fadd float %231, %250, !dbg !34
+  %253 = fcmp oeq float %252, 0.000000e+00, !dbg !35
+  %254 = tail call float @llvm.nvvm.div.full(float %250, float %252), !dbg !36
+  %255 = select i1 %253, float 0.000000e+00, float %254, !dbg !37
+  %256 = fmul float %255, %251, !dbg !38
+  %257 = fadd float %236, %256, !dbg !39
+  %258 = fadd float %241, %247, !dbg !44
+  %259 = fmul float %251, %251, !dbg !40
+  %260 = fmul float %231, %259, !dbg !41
+  %261 = fmul float %255, %260, !dbg !42
+  %262 = fadd float %258, %261, !dbg !43
+  %263 = bitcast float %257 to i32, !dbg !31
+  %264 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %263, i32 1, i32 31), !dbg !31
+  %265 = bitcast i32 %264 to float, !dbg !31
+  %266 = bitcast float %262 to i32, !dbg !31
+  %267 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %266, i32 1, i32 31), !dbg !31
+  %268 = bitcast i32 %267 to float, !dbg !31
+  %269 = bitcast float %252 to i32, !dbg !31
+  %270 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %269, i32 1, i32 31), !dbg !31
+  %271 = bitcast i32 %270 to float, !dbg !31
+  %272 = fsub float %265, %257, !dbg !28
+  %273 = fadd float %252, %271, !dbg !34
+  %274 = fcmp oeq float %273, 0.000000e+00, !dbg !35
+  %275 = tail call float @llvm.nvvm.div.full(float %271, float %273), !dbg !36
+  %276 = select i1 %274, float 0.000000e+00, float %275, !dbg !37
+  %277 = fmul float %276, %272, !dbg !38
+  %278 = fadd float %257, %277, !dbg !39
+  %279 = fadd float %262, %268, !dbg !44
+  %280 = fmul float %272, %272, !dbg !40
+  %281 = fmul float %252, %280, !dbg !41
+  %282 = fmul float %276, %281, !dbg !42
+  %283 = fadd float %279, %282, !dbg !43
+  %284 = icmp eq i32 %15, 0, !dbg !31
+  %285 = getelementptr float, ptr addrspace(3) @global_smem, i32 %16, !dbg !31
+  %286 = bitcast float %278 to <1 x i32>, !dbg !31
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %285, <1 x i32> %286, i1 %284) #6, !dbg !31
+  %287 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %16, !dbg !31
+  %288 = bitcast float %283 to <1 x i32>, !dbg !31
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %287, <1 x i32> %288, i1 %284) #6, !dbg !31
+  %289 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %16, !dbg !31
+  %290 = bitcast float %273 to <1 x i32>, !dbg !31
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %289, <1 x i32> %290, i1 %284) #6, !dbg !31
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !31
+  %291 = icmp samesign ult i32 %14, 16, !dbg !31
+  %292 = getelementptr float, ptr addrspace(3) @global_smem, i32 %14, !dbg !31
+  %293 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %292, i1 %291) #6, !dbg !31
+  %294 = bitcast i32 %293 to float, !dbg !31
+  %295 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %14, !dbg !31
+  %296 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %295, i1 %291) #6, !dbg !31
+  %297 = bitcast i32 %296 to float, !dbg !31
+  %298 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %14, !dbg !31
+  %299 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %298, i1 %291) #6, !dbg !31
+  %300 = bitcast i32 %299 to float, !dbg !31
+  %301 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %293, i32 8, i32 31), !dbg !31
+  %302 = bitcast i32 %301 to float, !dbg !31
+  %303 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %296, i32 8, i32 31), !dbg !31
+  %304 = bitcast i32 %303 to float, !dbg !31
+  %305 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %299, i32 8, i32 31), !dbg !31
+  %306 = bitcast i32 %305 to float, !dbg !31
+  %307 = fsub float %302, %294, !dbg !28
+  %308 = fadd float %300, %306, !dbg !34
+  %309 = fcmp oeq float %308, 0.000000e+00, !dbg !35
+  %310 = tail call float @llvm.nvvm.div.full(float %306, float %308), !dbg !36
+  %311 = select i1 %309, float 0.000000e+00, float %310, !dbg !37
+  %312 = fmul float %307, %311, !dbg !38
+  %313 = fadd float %312, %294, !dbg !39
+  %314 = fadd float %297, %304, !dbg !44
+  %315 = fmul float %307, %307, !dbg !40
+  %316 = fmul float %315, %300, !dbg !41
+  %317 = fmul float %316, %311, !dbg !42
+  %318 = fadd float %314, %317, !dbg !43
+  %319 = bitcast float %313 to i32, !dbg !31
+  %320 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %319, i32 4, i32 31), !dbg !31
+  %321 = bitcast i32 %320 to float, !dbg !31
+  %322 = bitcast float %318 to i32, !dbg !31
+  %323 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %322, i32 4, i32 31), !dbg !31
+  %324 = bitcast i32 %323 to float, !dbg !31
+  %325 = bitcast float %308 to i32, !dbg !31
+  %326 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %325, i32 4, i32 31), !dbg !31
+  %327 = bitcast i32 %326 to float, !dbg !31
+  %328 = fsub float %321, %313, !dbg !28
+  %329 = fadd float %308, %327, !dbg !34
+  %330 = fcmp oeq float %329, 0.000000e+00, !dbg !35
+  %331 = tail call float @llvm.nvvm.div.full(float %327, float %329), !dbg !36
+  %332 = select i1 %330, float 0.000000e+00, float %331, !dbg !37
+  %333 = fmul float %328, %332, !dbg !38
+  %334 = fadd float %313, %333, !dbg !39
+  %335 = fadd float %318, %324, !dbg !44
+  %336 = fmul float %328, %328, !dbg !40
+  %337 = fmul float %308, %336, !dbg !41
+  %338 = fmul float %332, %337, !dbg !42
+  %339 = fadd float %335, %338, !dbg !43
+  %340 = bitcast float %334 to i32, !dbg !31
+  %341 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %340, i32 2, i32 31), !dbg !31
+  %342 = bitcast i32 %341 to float, !dbg !31
+  %343 = bitcast float %339 to i32, !dbg !31
+  %344 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %343, i32 2, i32 31), !dbg !31
+  %345 = bitcast i32 %344 to float, !dbg !31
+  %346 = bitcast float %329 to i32, !dbg !31
+  %347 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %346, i32 2, i32 31), !dbg !31
+  %348 = bitcast i32 %347 to float, !dbg !31
+  %349 = fsub float %342, %334, !dbg !28
+  %350 = fadd float %329, %348, !dbg !34
+  %351 = fcmp oeq float %350, 0.000000e+00, !dbg !35
+  %352 = tail call float @llvm.nvvm.div.full(float %348, float %350), !dbg !36
+  %353 = select i1 %351, float 0.000000e+00, float %352, !dbg !37
+  %354 = fmul float %349, %353, !dbg !38
+  %355 = fadd float %334, %354, !dbg !39
+  %356 = fadd float %339, %345, !dbg !44
+  %357 = fmul float %349, %349, !dbg !40
+  %358 = fmul float %329, %357, !dbg !41
+  %359 = fmul float %353, %358, !dbg !42
+  %360 = fadd float %356, %359, !dbg !43
+  %361 = bitcast float %355 to i32, !dbg !31
+  %362 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %361, i32 1, i32 31), !dbg !31
+  %363 = bitcast i32 %362 to float, !dbg !31
+  %364 = bitcast float %360 to i32, !dbg !31
+  %365 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %364, i32 1, i32 31), !dbg !31
+  %366 = bitcast i32 %365 to float, !dbg !31
+  %367 = bitcast float %350 to i32, !dbg !31
+  %368 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %367, i32 1, i32 31), !dbg !31
+  %369 = bitcast i32 %368 to float, !dbg !31
+  %370 = fsub float %363, %355, !dbg !28
+  %371 = fadd float %350, %369, !dbg !34
+  %372 = fcmp oeq float %371, 0.000000e+00, !dbg !35
+  %373 = tail call float @llvm.nvvm.div.full(float %369, float %371), !dbg !36
+  %374 = select i1 %372, float 0.000000e+00, float %373, !dbg !37
+  %375 = fmul float %370, %374, !dbg !38
+  %376 = fadd float %355, %375, !dbg !39
+  %377 = fadd float %360, %366, !dbg !44
+  %378 = fmul float %370, %370, !dbg !40
+  %379 = fmul float %350, %378, !dbg !41
+  %380 = fmul float %374, %379, !dbg !42
+  %381 = fadd float %377, %380, !dbg !43
+  %382 = and i32 %13, 15, !dbg !31
+  %383 = icmp eq i32 %382, 0, !dbg !31
+  %384 = and i1 %291, %383, !dbg !31
+  %385 = bitcast float %376 to <1 x i32>, !dbg !31
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %292, <1 x i32> %385, i1 %384) #6, !dbg !31
+  %386 = bitcast float %381 to <1 x i32>, !dbg !31
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %295, <1 x i32> %386, i1 %384) #6, !dbg !31
+  %387 = bitcast float %371 to <1 x i32>, !dbg !31
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %298, <1 x i32> %387, i1 %384) #6, !dbg !31
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !31
+  %388 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !31
+  %389 = load float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !31
+  %390 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !45
+  %391 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %57, i64 %390, i1 %12) #6, !dbg !45
+  %392 = getelementptr bfloat, ptr addrspace(1) %3, i64 %33, !dbg !46
+  %393 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !47
+  %394 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %392, i64 %393, i1 true) #6, !dbg !47
+  %395 = getelementptr bfloat, ptr addrspace(1) %4, i64 %33, !dbg !48
+  %396 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !49
+  %397 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %395, i64 %396, i1 true) #6, !dbg !49
+  %398 = tail call float @llvm.nvvm.div.full(float %389, float 4.096000e+03), !dbg !50
+  %399 = fadd float %398, 0x3EB0C6F7A0000000, !dbg !51
+  %400 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %401 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %402 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %403 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %404 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %405 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %406 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %407 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i19 = icmp eq i32 %407, 0, !dbg !52
+  br i1 %.not.i19, label %410, label %408, !dbg !52
+
+408:                                              ; preds = %__nv_rsqrtf.exit
+  %409 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %399), !dbg !52
+  br label %__nv_rsqrtf.exit21, !dbg !52
+
+410:                                              ; preds = %__nv_rsqrtf.exit
+  %411 = tail call float @llvm.nvvm.rsqrt.approx.f(float %399), !dbg !52
+  br label %__nv_rsqrtf.exit21, !dbg !52
+
+__nv_rsqrtf.exit21:                               ; preds = %408, %410
+  %.0.i20 = phi float [ %409, %408 ], [ %411, %410 ], !dbg !52
+  %412 = extractvalue { i32, i32, i32, i32 } %391, 3, !dbg !45
+  %413 = bitcast i32 %412 to <2 x bfloat>, !dbg !45
+  %414 = extractvalue { i32, i32, i32, i32 } %391, 2, !dbg !45
+  %415 = bitcast i32 %414 to <2 x bfloat>, !dbg !45
+  %416 = extractvalue { i32, i32, i32, i32 } %391, 1, !dbg !45
+  %417 = bitcast i32 %416 to <2 x bfloat>, !dbg !45
+  %418 = extractvalue { i32, i32, i32, i32 } %391, 0, !dbg !45
+  %419 = bitcast i32 %418 to <2 x bfloat>, !dbg !45
+  %420 = extractvalue { i32, i32, i32, i32 } %397, 3, !dbg !49
+  %421 = bitcast i32 %420 to <2 x bfloat>, !dbg !49
+  %422 = extractvalue { i32, i32, i32, i32 } %397, 2, !dbg !49
+  %423 = bitcast i32 %422 to <2 x bfloat>, !dbg !49
+  %424 = extractvalue { i32, i32, i32, i32 } %397, 1, !dbg !49
+  %425 = bitcast i32 %424 to <2 x bfloat>, !dbg !49
+  %426 = extractvalue { i32, i32, i32, i32 } %397, 0, !dbg !49
+  %427 = bitcast i32 %426 to <2 x bfloat>, !dbg !49
+  %428 = extractvalue { i32, i32, i32, i32 } %394, 3, !dbg !47
+  %429 = bitcast i32 %428 to <2 x bfloat>, !dbg !47
+  %430 = extractvalue { i32, i32, i32, i32 } %394, 2, !dbg !47
+  %431 = bitcast i32 %430 to <2 x bfloat>, !dbg !47
+  %432 = extractvalue { i32, i32, i32, i32 } %394, 1, !dbg !47
+  %433 = bitcast i32 %432 to <2 x bfloat>, !dbg !47
+  %434 = extractvalue { i32, i32, i32, i32 } %394, 0, !dbg !47
+  %435 = bitcast i32 %434 to <2 x bfloat>, !dbg !47
+  %436 = getelementptr bfloat, ptr addrspace(1) %6, i64 %21, !dbg !53
+  %437 = fpext <2 x bfloat> %419 to <2 x float>, !dbg !54
+  %438 = insertelement <2 x float> poison, float %388, i64 0, !dbg !55
+  %439 = shufflevector <2 x float> %438, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !55
+  %440 = fsub <2 x float> %437, %439, !dbg !55
+  %441 = fpext <2 x bfloat> %427 to <2 x float>, !dbg !56
+  %442 = fpext <2 x bfloat> %435 to <2 x float>, !dbg !57
+  %443 = insertelement <2 x float> poison, float %.0.i20, i64 0, !dbg !58
+  %444 = shufflevector <2 x float> %443, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !58
+  %445 = fmul <2 x float> %440, %444, !dbg !58
+  %446 = fadd <2 x float> %442, splat (float 1.000000e+00), !dbg !59
+  %447 = fmul <2 x float> %446, %445, !dbg !60
+  %448 = fadd <2 x float> %447, %441, !dbg !61
+  %449 = fptrunc <2 x float> %448 to <2 x bfloat>, !dbg !62
+  %450 = fpext <2 x bfloat> %417 to <2 x float>, !dbg !54
+  %451 = fsub <2 x float> %450, %439, !dbg !55
+  %452 = fpext <2 x bfloat> %425 to <2 x float>, !dbg !56
+  %453 = fpext <2 x bfloat> %433 to <2 x float>, !dbg !57
+  %454 = fmul <2 x float> %451, %444, !dbg !58
+  %455 = fadd <2 x float> %453, splat (float 1.000000e+00), !dbg !59
+  %456 = fmul <2 x float> %455, %454, !dbg !60
+  %457 = fadd <2 x float> %456, %452, !dbg !61
+  %458 = fptrunc <2 x float> %457 to <2 x bfloat>, !dbg !62
+  %459 = fpext <2 x bfloat> %415 to <2 x float>, !dbg !54
+  %460 = fsub <2 x float> %459, %439, !dbg !55
+  %461 = fpext <2 x bfloat> %423 to <2 x float>, !dbg !56
+  %462 = fpext <2 x bfloat> %431 to <2 x float>, !dbg !57
+  %463 = fmul <2 x float> %460, %444, !dbg !58
+  %464 = fadd <2 x float> %462, splat (float 1.000000e+00), !dbg !59
+  %465 = fmul <2 x float> %464, %463, !dbg !60
+  %466 = fadd <2 x float> %465, %461, !dbg !61
+  %467 = fptrunc <2 x float> %466 to <2 x bfloat>, !dbg !62
+  %468 = fpext <2 x bfloat> %413 to <2 x float>, !dbg !54
+  %469 = fsub <2 x float> %468, %439, !dbg !55
+  %470 = fpext <2 x bfloat> %421 to <2 x float>, !dbg !56
+  %471 = fpext <2 x bfloat> %429 to <2 x float>, !dbg !57
+  %472 = fmul <2 x float> %469, %444, !dbg !58
+  %473 = fadd <2 x float> %471, splat (float 1.000000e+00), !dbg !59
+  %474 = fmul <2 x float> %473, %472, !dbg !60
+  %475 = fadd <2 x float> %474, %470, !dbg !61
+  %476 = fptrunc <2 x float> %475 to <2 x bfloat>, !dbg !62
+  %477 = bitcast <2 x bfloat> %449 to i32, !dbg !62
+  %478 = bitcast <2 x bfloat> %458 to i32, !dbg !62
+  %479 = bitcast <2 x bfloat> %467 to i32, !dbg !62
+  %480 = bitcast <2 x bfloat> %476 to i32, !dbg !62
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %477, i32 %478, i32 %479, i32 %480, ptr addrspace(1) %436, i1 %12) #6, !dbg !62
+  ret void, !dbg !63
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #2
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #2
+
+attributes #0 = { nounwind "nvvm.reqntid"="512" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #4 = { convergent nocallback nounwind }
+attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!5 = distinct !DISubprogram(name: "triton_red_fused_add_mul_native_layer_norm_0", linkageName: "triton_red_fused_add_mul_native_layer_norm_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 23, column: 28, scope: !5)
+!9 = !DILocation(line: 25, column: 21, scope: !5)
+!10 = !DILocation(line: 26, column: 37, scope: !5)
+!11 = !DILocation(line: 38, column: 46, scope: !5)
+!12 = !DILocation(line: 38, column: 41, scope: !5)
+!13 = !DILocation(line: 38, column: 34, scope: !5)
+!14 = !DILocation(line: 38, column: 51, scope: !5)
+!15 = !DILocation(line: 39, column: 34, scope: !5)
+!16 = !DILocation(line: 39, column: 41, scope: !5)
+!17 = !DILocation(line: 40, column: 34, scope: !5)
+!18 = !DILocation(line: 40, column: 51, scope: !5)
+!19 = !DILocation(line: 50, column: 66, scope: !5)
+!20 = !DILocation(line: 51, column: 29, scope: !5)
+!21 = !DILocation(line: 38, column: 113, scope: !5)
+!22 = !DILocation(line: 39, column: 94, scope: !5)
+!23 = !DILocation(line: 40, column: 113, scope: !5)
+!24 = !DILocation(line: 41, column: 22, scope: !5)
+!25 = !DILocation(line: 42, column: 22, scope: !5)
+!26 = !DILocation(line: 48, column: 62, scope: !5)
+!27 = !DILocation(line: 51, column: 52, scope: !5)
+!28 = !DILocation(line: 231, column: 21, scope: !29, inlinedAt: !31)
+!29 = distinct !DILexicalBlockFile(scope: !5, file: !30, discriminator: 0)
+!30 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime")
+!31 = !DILocation(line: 243, column: 46, scope: !29, inlinedAt: !32)
+!32 = !DILocation(line: 52, column: 80, scope: !33)
+!33 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0)
+!34 = !DILocation(line: 232, column: 28, scope: !29, inlinedAt: !31)
+!35 = !DILocation(line: 233, column: 39, scope: !29, inlinedAt: !31)
+!36 = !DILocation(line: 233, column: 60, scope: !29, inlinedAt: !31)
+!37 = !DILocation(line: 233, column: 49, scope: !29, inlinedAt: !31)
+!38 = !DILocation(line: 235, column: 25, scope: !29, inlinedAt: !31)
+!39 = !DILocation(line: 235, column: 17, scope: !29, inlinedAt: !31)
+!40 = !DILocation(line: 236, column: 30, scope: !29, inlinedAt: !31)
+!41 = !DILocation(line: 236, column: 38, scope: !29, inlinedAt: !31)
+!42 = !DILocation(line: 236, column: 49, scope: !29, inlinedAt: !31)
+!43 = !DILocation(line: 236, column: 22, scope: !29, inlinedAt: !31)
+!44 = !DILocation(line: 236, column: 15, scope: !29, inlinedAt: !31)
+!45 = !DILocation(line: 62, column: 53, scope: !5)
+!46 = !DILocation(line: 63, column: 35, scope: !5)
+!47 = !DILocation(line: 63, column: 42, scope: !5)
+!48 = !DILocation(line: 64, column: 35, scope: !5)
+!49 = !DILocation(line: 64, column: 42, scope: !5)
+!50 = !DILocation(line: 68, column: 25, scope: !5)
+!51 = !DILocation(line: 70, column: 24, scope: !5)
+!52 = !DILocation(line: 71, column: 32, scope: !5)
+!53 = !DILocation(line: 78, column: 29, scope: !5)
+!54 = !DILocation(line: 62, column: 115, scope: !5)
+!55 = !DILocation(line: 66, column: 24, scope: !5)
+!56 = !DILocation(line: 64, column: 95, scope: !5)
+!57 = !DILocation(line: 63, column: 95, scope: !5)
+!58 = !DILocation(line: 72, column: 24, scope: !5)
+!59 = !DILocation(line: 75, column: 24, scope: !5)
+!60 = !DILocation(line: 76, column: 24, scope: !5)
+!61 = !DILocation(line: 77, column: 24, scope: !5)
+!62 = !DILocation(line: 78, column: 53, scope: !5)
+!63 = !DILocation(line: 56, column: 4, scope: !5)
diff --git a/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.ptx b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..80f0c611b683fbf5bbc6bba4582956e310c5c76c
--- /dev/null
+++ b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.ptx
@@ -0,0 +1,1129 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused_add_mul_native_layer_norm_0 // -- Begin function triton_red_fused_add_mul_native_layer_norm_0
+.extern .shared .align 16 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
+                                        // @triton_red_fused_add_mul_native_layer_norm_0
+.visible .entry triton_red_fused_add_mul_native_layer_norm_0(
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_1,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_4,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_5,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_6,
+	.param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_7,
+	.param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_8,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_9,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_10
+)
+.reqntid 512
+{
+	.reg .pred 	%p<23>;
+	.reg .b16 	%rs<49>;
+	.reg .b32 	%r<323>;
+	.reg .b64 	%rd<23>;
+	.loc	1 18 0                          // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:18:0
+
+// %bb.0:                               // %__nv_rsqrtf.exit
+	ld.param.b64 	%rd14, [triton_red_fused_add_mul_native_layer_norm_0_param_0];
+	ld.param.b64 	%rd15, [triton_red_fused_add_mul_native_layer_norm_0_param_1];
+$L__tmp0:
+	.loc	1 23 28                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:23:28
+	mov.u32 	%r49, %ctaid.x;
+	.loc	1 25 21                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:25:21
+	setp.lt.u32 	%p1, %r49, 2048;
+	ld.param.b64 	%rd16, [triton_red_fused_add_mul_native_layer_norm_0_param_2];
+	ld.param.b64 	%rd17, [triton_red_fused_add_mul_native_layer_norm_0_param_3];
+	.loc	1 26 37                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:26:37
+	mov.u32 	%r50, %tid.x;
+	and.b32 	%r51, %r50, 511;
+	ld.param.b64 	%rd18, [triton_red_fused_add_mul_native_layer_norm_0_param_4];
+	and.b32 	%r52, %r50, 31;
+	ld.param.b64 	%rd19, [triton_red_fused_add_mul_native_layer_norm_0_param_5];
+	ld.param.b64 	%rd20, [triton_red_fused_add_mul_native_layer_norm_0_param_6];
+	shl.b32 	%r53, %r50, 3;
+	and.b32 	%r54, %r53, 4088;
+	.loc	1 38 46                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:46
+	shl.b32 	%r55, %r49, 12;
+	.loc	1 38 41                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:41
+	or.b32 	%r56, %r54, %r55;
+	.loc	1 38 34                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:34
+	mul.wide.s32 	%rd21, %r56, 2;
+	add.s64 	%rd1, %rd14, %rd21;
+	.loc	1 38 51                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:51
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r5, 0;
+	// begin inline asm
+	mov.u32 %r1, %r5;
+	mov.u32 %r2, %r5;
+	mov.u32 %r3, %r5;
+	mov.u32 %r4, %r5;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	.loc	1 39 34                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:34
+	mul.wide.u32 	%rd22, %r54, 2;
+	add.s64 	%rd3, %rd15, %rd22;
+	.loc	1 39 41                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:41
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0;
+	// end inline asm
+	mov.pred 	%p2, -1;
+	// begin inline asm
+	mov.u32 %r6, %r5;
+	mov.u32 %r7, %r5;
+	mov.u32 %r8, %r5;
+	mov.u32 %r9, %r5;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd3 + 0 ], %rd4;
+	// end inline asm
+	.loc	1 40 34                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:34
+	add.s64 	%rd5, %rd16, %rd21;
+	.loc	1 40 51                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:51
+	// begin inline asm
+	mov.u64 %rd6, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd6, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r10, %r5;
+	mov.u32 %r11, %r5;
+	mov.u32 %r12, %r5;
+	mov.u32 %r13, %r5;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd5 + 0 ], %rd6;
+	// end inline asm
+	.loc	1 50 66                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:50:66
+	selp.f32 	%r57, 0f3F800000, 0f00000000, %p1;
+	.loc	1 51 29                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:51:29
+	add.s64 	%rd7, %rd19, %rd21;
+	.loc	1 38 113                        // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:113
+	mov.b32 	{%rs1, %rs2}, %r1;
+	cvt.f32.bf16 	%r58, %rs1;
+	cvt.f32.bf16 	%r59, %rs2;
+	.loc	1 39 94                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:94
+	mov.b32 	{%rs3, %rs4}, %r6;
+	cvt.f32.bf16 	%r60, %rs3;
+	cvt.f32.bf16 	%r61, %rs4;
+	.loc	1 40 113                        // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:113
+	mov.b32 	{%rs5, %rs6}, %r10;
+	cvt.f32.bf16 	%r62, %rs5;
+	cvt.f32.bf16 	%r63, %rs6;
+	.loc	1 42 22                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:42:22
+	fma.rn.f32 	%r64, %r61, %r63, %r59;
+	fma.rn.f32 	%r65, %r60, %r62, %r58;
+	.loc	1 48 62                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:48:62
+	selp.f32 	%r66, %r65, 0f00000000, %p1;
+	selp.f32 	%r67, %r64, 0f00000000, %p1;
+	.loc	1 51 52                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:51:52
+	cvt.rn.bf16x2.f32 	%r14, %r64, %r65;
+	.loc	1 38 113                        // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:113
+	mov.b32 	{%rs7, %rs8}, %r2;
+	cvt.f32.bf16 	%r68, %rs7;
+	cvt.f32.bf16 	%r69, %rs8;
+	.loc	1 39 94                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:94
+	mov.b32 	{%rs9, %rs10}, %r7;
+	cvt.f32.bf16 	%r70, %rs9;
+	cvt.f32.bf16 	%r71, %rs10;
+	.loc	1 40 113                        // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:113
+	mov.b32 	{%rs11, %rs12}, %r11;
+	cvt.f32.bf16 	%r72, %rs11;
+	cvt.f32.bf16 	%r73, %rs12;
+	.loc	1 42 22                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:42:22
+	fma.rn.f32 	%r74, %r71, %r73, %r69;
+	fma.rn.f32 	%r75, %r70, %r72, %r68;
+	.loc	1 48 62                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:48:62
+	selp.f32 	%r76, %r75, 0f00000000, %p1;
+	selp.f32 	%r77, %r74, 0f00000000, %p1;
+	.loc	1 51 52                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:51:52
+	cvt.rn.bf16x2.f32 	%r15, %r74, %r75;
+	.loc	1 38 113                        // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:113
+	mov.b32 	{%rs13, %rs14}, %r3;
+	cvt.f32.bf16 	%r78, %rs13;
+	cvt.f32.bf16 	%r79, %rs14;
+	.loc	1 39 94                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:94
+	mov.b32 	{%rs15, %rs16}, %r8;
+	cvt.f32.bf16 	%r80, %rs15;
+	cvt.f32.bf16 	%r81, %rs16;
+	.loc	1 40 113                        // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:113
+	mov.b32 	{%rs17, %rs18}, %r12;
+	cvt.f32.bf16 	%r82, %rs17;
+	cvt.f32.bf16 	%r83, %rs18;
+	.loc	1 42 22                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:42:22
+	fma.rn.f32 	%r84, %r81, %r83, %r79;
+	fma.rn.f32 	%r85, %r80, %r82, %r78;
+	.loc	1 48 62                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:48:62
+	selp.f32 	%r86, %r85, 0f00000000, %p1;
+	selp.f32 	%r87, %r84, 0f00000000, %p1;
+	.loc	1 51 52                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:51:52
+	cvt.rn.bf16x2.f32 	%r16, %r84, %r85;
+	.loc	1 38 113                        // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:38:113
+	mov.b32 	{%rs19, %rs20}, %r4;
+	cvt.f32.bf16 	%r88, %rs19;
+	cvt.f32.bf16 	%r89, %rs20;
+	.loc	1 39 94                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:39:94
+	mov.b32 	{%rs21, %rs22}, %r9;
+	cvt.f32.bf16 	%r90, %rs21;
+	cvt.f32.bf16 	%r91, %rs22;
+	.loc	1 40 113                        // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:40:113
+	mov.b32 	{%rs23, %rs24}, %r13;
+	cvt.f32.bf16 	%r92, %rs23;
+	cvt.f32.bf16 	%r93, %rs24;
+	.loc	1 42 22                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:42:22
+	fma.rn.f32 	%r94, %r91, %r93, %r89;
+	fma.rn.f32 	%r95, %r90, %r92, %r88;
+	.loc	1 48 62                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:48:62
+	selp.f32 	%r96, %r95, 0f00000000, %p1;
+	selp.f32 	%r97, %r94, 0f00000000, %p1;
+	.loc	1 51 52                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:51:52
+	cvt.rn.bf16x2.f32 	%r17, %r94, %r95;
+	// begin inline asm
+	@%p1 st.global.v4.b32 [ %rd7 + 0 ], { %r14, %r15, %r16, %r17 };
+	// end inline asm
+$L__tmp1:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r98, %r67, %r66;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r99, 0f40000000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p6, %r99, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r100, %r57, %r99;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r101, 0f00000000, %r100, %p6;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r102, %r101, %r98, %r66;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r103, %r98, %r98;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r104, %r57, %r103;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r105, %r101, %r104, 0f00000000;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r106, %r76, %r102;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r107, 0f40400000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p7, %r107, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r108, %r57, %r107;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r109, 0f00000000, %r108, %p7;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r110, %r109, %r106, %r102;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r111, %r106, %r106;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r112, %r99, %r111;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r113, %r109, %r112, %r105;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r114, %r77, %r110;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r115, 0f40800000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p8, %r115, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r116, %r57, %r115;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r117, 0f00000000, %r116, %p8;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r118, %r117, %r114, %r110;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r119, %r114, %r114;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r120, %r107, %r119;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r121, %r117, %r120, %r113;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r122, %r86, %r118;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r123, 0f40A00000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p9, %r123, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r124, %r57, %r123;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r125, 0f00000000, %r124, %p9;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r126, %r125, %r122, %r118;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r127, %r122, %r122;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r128, %r115, %r127;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r129, %r125, %r128, %r121;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r130, %r87, %r126;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r131, 0f40C00000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p10, %r131, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r132, %r57, %r131;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r133, 0f00000000, %r132, %p10;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r134, %r133, %r130, %r126;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r135, %r130, %r130;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r136, %r123, %r135;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r137, %r133, %r136, %r129;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r138, %r96, %r134;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r139, 0f40E00000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p11, %r139, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r140, %r57, %r139;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r141, 0f00000000, %r140, %p11;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r142, %r141, %r138, %r134;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r143, %r138, %r138;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r144, %r131, %r143;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r145, %r141, %r144, %r137;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r146, %r97, %r142;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r147, 0f41000000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p12, %r147, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r148, %r57, %r147;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r149, 0f00000000, %r148, %p12;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r150, %r149, %r146, %r142;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r151, %r146, %r146;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r152, %r139, %r151;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r153, %r149, %r152, %r145;
+$L__tmp2:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ]
+	shfl.sync.bfly.b32 	%r154, %r150, 16, 31, -1;
+	shfl.sync.bfly.b32 	%r155, %r153, 16, 31, -1;
+	shfl.sync.bfly.b32 	%r156, %r147, 16, 31, -1;
+$L__tmp3:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r157, %r154, %r150;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r158, %r147, %r156;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p13, %r158, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r159, %r156, %r158;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r160, 0f00000000, %r159, %p13;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r161, %r160, %r157, %r150;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r162, %r153, %r155;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r163, %r157, %r157;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r164, %r147, %r163;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r165, %r160, %r164, %r162;
+$L__tmp4:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ]
+	shfl.sync.bfly.b32 	%r166, %r161, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r167, %r165, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r168, %r158, 8, 31, -1;
+$L__tmp5:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r169, %r166, %r161;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r170, %r158, %r168;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p14, %r170, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r171, %r168, %r170;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r172, 0f00000000, %r171, %p14;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r173, %r172, %r169, %r161;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r174, %r165, %r167;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r175, %r169, %r169;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r176, %r158, %r175;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r177, %r172, %r176, %r174;
+$L__tmp6:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ]
+	shfl.sync.bfly.b32 	%r178, %r173, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r179, %r177, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r180, %r170, 4, 31, -1;
+$L__tmp7:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r181, %r178, %r173;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r182, %r170, %r180;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p15, %r182, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r183, %r180, %r182;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r184, 0f00000000, %r183, %p15;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r185, %r184, %r181, %r173;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r186, %r177, %r179;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r187, %r181, %r181;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r188, %r170, %r187;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r189, %r184, %r188, %r186;
+$L__tmp8:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ]
+	shfl.sync.bfly.b32 	%r190, %r185, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r191, %r189, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r192, %r182, 2, 31, -1;
+$L__tmp9:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r193, %r190, %r185;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r194, %r182, %r192;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p16, %r194, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r195, %r192, %r194;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r196, 0f00000000, %r195, %p16;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r197, %r196, %r193, %r185;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r198, %r189, %r191;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r199, %r193, %r193;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r200, %r182, %r199;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r201, %r196, %r200, %r198;
+$L__tmp10:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ]
+	shfl.sync.bfly.b32 	%r202, %r197, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r203, %r201, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r204, %r194, 1, 31, -1;
+$L__tmp11:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r205, %r202, %r197;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r23, %r194, %r204;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p17, %r23, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r206, %r204, %r23;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r207, 0f00000000, %r206, %p17;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r19, %r207, %r205, %r197;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r208, %r201, %r203;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r209, %r205, %r205;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r210, %r194, %r209;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r21, %r207, %r210, %r208;
+$L__tmp12:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ]
+	setp.eq.b32 	%p3, %r52, 0;
+	shr.u32 	%r211, %r50, 3;
+	and.b32 	%r212, %r211, 60;
+	mov.b32 	%r213, global_smem;
+	add.s32 	%r18, %r213, %r212;
+	// begin inline asm
+	@%p3 st.shared.b32 [ %r18 + 0 ], %r19;
+	// end inline asm
+	add.s32 	%r20, %r18, 64;
+	// begin inline asm
+	@%p3 st.shared.b32 [ %r20 + 0 ], %r21;
+	// end inline asm
+	add.s32 	%r22, %r18, 128;
+	// begin inline asm
+	@%p3 st.shared.b32 [ %r22 + 0 ], %r23;
+	// end inline asm
+	bar.sync 	0;
+	setp.lt.u32 	%p4, %r51, 16;
+	shl.b32 	%r214, %r51, 2;
+	add.s32 	%r25, %r213, %r214;
+	// begin inline asm
+	@%p4 ld.shared.b32 %r24, [ %r25 + 0 ];
+	// end inline asm
+	add.s32 	%r27, %r25, 64;
+	// begin inline asm
+	@%p4 ld.shared.b32 %r26, [ %r27 + 0 ];
+	// end inline asm
+	add.s32 	%r29, %r25, 128;
+	// begin inline asm
+	@%p4 ld.shared.b32 %r28, [ %r29 + 0 ];
+	// end inline asm
+	shfl.sync.bfly.b32 	%r215, %r24, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r216, %r26, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r217, %r28, 8, 31, -1;
+$L__tmp13:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r218, %r215, %r24;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r219, %r28, %r217;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p18, %r219, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r220, %r217, %r219;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r221, 0f00000000, %r220, %p18;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r222, %r218, %r221, %r24;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r223, %r26, %r216;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r224, %r218, %r218;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r225, %r224, %r28;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r226, %r225, %r221, %r223;
+$L__tmp14:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ]
+	shfl.sync.bfly.b32 	%r227, %r222, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r228, %r226, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r229, %r219, 4, 31, -1;
+$L__tmp15:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r230, %r227, %r222;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r231, %r219, %r229;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p19, %r231, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r232, %r229, %r231;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r233, 0f00000000, %r232, %p19;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r234, %r230, %r233, %r222;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r235, %r226, %r228;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r236, %r230, %r230;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r237, %r219, %r236;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r238, %r233, %r237, %r235;
+$L__tmp16:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ]
+	shfl.sync.bfly.b32 	%r239, %r234, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r240, %r238, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r241, %r231, 2, 31, -1;
+$L__tmp17:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r242, %r239, %r234;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r243, %r231, %r241;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p20, %r243, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r244, %r241, %r243;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r245, 0f00000000, %r244, %p20;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r246, %r242, %r245, %r234;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r247, %r238, %r240;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r248, %r242, %r242;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r249, %r231, %r248;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r250, %r245, %r249, %r247;
+$L__tmp18:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ]
+	shfl.sync.bfly.b32 	%r251, %r246, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r252, %r250, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r253, %r243, 1, 31, -1;
+$L__tmp19:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	sub.f32 	%r254, %r251, %r246;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r32, %r243, %r253;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	setp.eq.f32 	%p21, %r32, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	div.full.f32 	%r255, %r253, %r32;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	selp.f32 	%r256, 0f00000000, %r255, %p21;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r30, %r254, %r256, %r246;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	add.f32 	%r257, %r250, %r252;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r258, %r254, %r254;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	mul.f32 	%r259, %r243, %r258;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ] ]
+	fma.rn.f32 	%r31, %r256, %r259, %r257;
+$L__tmp20:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:52:80 ]
+	and.b32 	%r260, %r50, 15;
+	setp.eq.b32 	%p22, %r260, 0;
+	and.pred 	%p5, %p4, %p22;
+	// begin inline asm
+	@%p5 st.shared.b32 [ %r25 + 0 ], %r30;
+	// end inline asm
+	// begin inline asm
+	@%p5 st.shared.b32 [ %r27 + 0 ], %r31;
+	// end inline asm
+	// begin inline asm
+	@%p5 st.shared.b32 [ %r29 + 0 ], %r32;
+	// end inline asm
+	bar.sync 	0;
+	ld.shared.b32 	%r261, [global_smem];
+	ld.shared.b32 	%r262, [global_smem+64];
+$L__tmp21:
+	.loc	1 62 53                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:62:53
+	// begin inline asm
+	mov.u64 %rd8, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd8, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r33, %r5;
+	mov.u32 %r34, %r5;
+	mov.u32 %r35, %r5;
+	mov.u32 %r36, %r5;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd7 + 0 ], %rd8;
+	// end inline asm
+	.loc	1 63 35                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:35
+	add.s64 	%rd9, %rd17, %rd22;
+	.loc	1 63 42                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:42
+	// begin inline asm
+	mov.u64 %rd10, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd10, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r37, %r5;
+	mov.u32 %r38, %r5;
+	mov.u32 %r39, %r5;
+	mov.u32 %r40, %r5;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r37, %r38, %r39, %r40 }, [ %rd9 + 0 ], %rd10;
+	// end inline asm
+	.loc	1 64 35                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:35
+	add.s64 	%rd11, %rd18, %rd22;
+	.loc	1 64 42                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:42
+	// begin inline asm
+	mov.u64 %rd12, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd12, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r41, %r5;
+	mov.u32 %r42, %r5;
+	mov.u32 %r43, %r5;
+	mov.u32 %r44, %r5;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r41, %r42, %r43, %r44 }, [ %rd11 + 0 ], %rd12;
+	// end inline asm
+	mov.b32 	%r263, 0f45800000;
+	.loc	1 68 25                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:68:25
+	div.full.f32 	%r264, %r262, %r263;
+	.loc	1 70 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:70:24
+	add.f32 	%r265, %r264, 0f358637BD;
+	.loc	1 71 32                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:71:32
+	rsqrt.approx.ftz.f32 	%r266, %r265;
+	.loc	1 78 29                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:78:29
+	add.s64 	%rd13, %rd20, %rd21;
+	.loc	1 62 115                        // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:62:115
+	mov.b32 	{%rs25, %rs26}, %r33;
+	cvt.f32.bf16 	%r267, %rs26;
+	cvt.f32.bf16 	%r268, %rs25;
+	.loc	1 66 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:66:24
+	sub.f32 	%r269, %r268, %r261;
+	sub.f32 	%r270, %r267, %r261;
+	.loc	1 64 95                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:95
+	mov.b32 	{%rs27, %rs28}, %r41;
+	cvt.f32.bf16 	%r271, %rs28;
+	cvt.f32.bf16 	%r272, %rs27;
+	.loc	1 63 95                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:95
+	mov.b32 	{%rs29, %rs30}, %r37;
+	cvt.f32.bf16 	%r273, %rs29;
+	cvt.f32.bf16 	%r274, %rs30;
+	.loc	1 72 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:72:24
+	mul.f32 	%r275, %r270, %r266;
+	mul.f32 	%r276, %r269, %r266;
+	.loc	1 75 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:75:24
+	add.f32 	%r277, %r274, 0f3F800000;
+	add.f32 	%r278, %r273, 0f3F800000;
+	.loc	1 77 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:77:24
+	fma.rn.f32 	%r279, %r278, %r276, %r272;
+	fma.rn.f32 	%r280, %r277, %r275, %r271;
+	.loc	1 78 53                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:78:53
+	cvt.rn.bf16x2.f32 	%r45, %r280, %r279;
+	.loc	1 62 115                        // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:62:115
+	mov.b32 	{%rs31, %rs32}, %r34;
+	cvt.f32.bf16 	%r281, %rs32;
+	cvt.f32.bf16 	%r282, %rs31;
+	.loc	1 66 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:66:24
+	sub.f32 	%r283, %r282, %r261;
+	sub.f32 	%r284, %r281, %r261;
+	.loc	1 64 95                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:95
+	mov.b32 	{%rs33, %rs34}, %r42;
+	cvt.f32.bf16 	%r285, %rs34;
+	cvt.f32.bf16 	%r286, %rs33;
+	.loc	1 63 95                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:95
+	mov.b32 	{%rs35, %rs36}, %r38;
+	cvt.f32.bf16 	%r287, %rs35;
+	cvt.f32.bf16 	%r288, %rs36;
+	.loc	1 72 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:72:24
+	mul.f32 	%r289, %r284, %r266;
+	mul.f32 	%r290, %r283, %r266;
+	.loc	1 75 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:75:24
+	add.f32 	%r291, %r288, 0f3F800000;
+	add.f32 	%r292, %r287, 0f3F800000;
+	.loc	1 77 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:77:24
+	fma.rn.f32 	%r293, %r292, %r290, %r286;
+	fma.rn.f32 	%r294, %r291, %r289, %r285;
+	.loc	1 78 53                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:78:53
+	cvt.rn.bf16x2.f32 	%r46, %r294, %r293;
+	.loc	1 62 115                        // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:62:115
+	mov.b32 	{%rs37, %rs38}, %r35;
+	cvt.f32.bf16 	%r295, %rs38;
+	cvt.f32.bf16 	%r296, %rs37;
+	.loc	1 66 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:66:24
+	sub.f32 	%r297, %r296, %r261;
+	sub.f32 	%r298, %r295, %r261;
+	.loc	1 64 95                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:95
+	mov.b32 	{%rs39, %rs40}, %r43;
+	cvt.f32.bf16 	%r299, %rs40;
+	cvt.f32.bf16 	%r300, %rs39;
+	.loc	1 63 95                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:95
+	mov.b32 	{%rs41, %rs42}, %r39;
+	cvt.f32.bf16 	%r301, %rs41;
+	cvt.f32.bf16 	%r302, %rs42;
+	.loc	1 72 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:72:24
+	mul.f32 	%r303, %r298, %r266;
+	mul.f32 	%r304, %r297, %r266;
+	.loc	1 75 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:75:24
+	add.f32 	%r305, %r302, 0f3F800000;
+	add.f32 	%r306, %r301, 0f3F800000;
+	.loc	1 77 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:77:24
+	fma.rn.f32 	%r307, %r306, %r304, %r300;
+	fma.rn.f32 	%r308, %r305, %r303, %r299;
+	.loc	1 78 53                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:78:53
+	cvt.rn.bf16x2.f32 	%r47, %r308, %r307;
+	.loc	1 62 115                        // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:62:115
+	mov.b32 	{%rs43, %rs44}, %r36;
+	cvt.f32.bf16 	%r309, %rs44;
+	cvt.f32.bf16 	%r310, %rs43;
+	.loc	1 66 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:66:24
+	sub.f32 	%r311, %r310, %r261;
+	sub.f32 	%r312, %r309, %r261;
+	.loc	1 64 95                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:64:95
+	mov.b32 	{%rs45, %rs46}, %r44;
+	cvt.f32.bf16 	%r313, %rs46;
+	cvt.f32.bf16 	%r314, %rs45;
+	.loc	1 63 95                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:63:95
+	mov.b32 	{%rs47, %rs48}, %r40;
+	cvt.f32.bf16 	%r315, %rs47;
+	cvt.f32.bf16 	%r316, %rs48;
+	.loc	1 72 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:72:24
+	mul.f32 	%r317, %r312, %r266;
+	mul.f32 	%r318, %r311, %r266;
+	.loc	1 75 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:75:24
+	add.f32 	%r319, %r316, 0f3F800000;
+	add.f32 	%r320, %r315, 0f3F800000;
+	.loc	1 77 24                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:77:24
+	fma.rn.f32 	%r321, %r320, %r318, %r314;
+	fma.rn.f32 	%r322, %r319, %r317, %r313;
+	.loc	1 78 53                         // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:78:53
+	cvt.rn.bf16x2.f32 	%r48, %r322, %r321;
+	// begin inline asm
+	@%p1 st.global.v4.b32 [ %rd13 + 0 ], { %r45, %r46, %r47, %r48 };
+	// end inline asm
+	.loc	1 56 4                          // cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py:56:4
+	ret;
+$L__tmp22:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 343                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x150 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 119
+.b8 51
+.b8 106
+.b8 98
+.b8 105
+.b8 121
+.b8 53
+.b8 122
+.b8 114
+.b8 107
+.b8 121
+.b8 109
+.b8 55
+.b8 118
+.b8 107
+.b8 110
+.b8 110
+.b8 51
+.b8 122
+.b8 105
+.b8 117
+.b8 107
+.b8 51
+.b8 113
+.b8 105
+.b8 109
+.b8 108
+.b8 98
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 50
+.b8 98
+.b8 98
+.b8 122
+.b8 51
+.b8 115
+.b8 117
+.b8 102
+.b8 54
+.b8 113
+.b8 120
+.b8 105
+.b8 106
+.b8 110
+.b8 98
+.b8 102
+.b8 99
+.b8 51
+.b8 121
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 119
+.b8 51
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x2f DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 97
+.b8 100
+.b8 100
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 110
+.b8 97
+.b8 116
+.b8 105
+.b8 118
+.b8 101
+.b8 95
+.b8 108
+.b8 97
+.b8 121
+.b8 101
+.b8 114
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x113:0x47 DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x128:0x31 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp21                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 52                                  // DW_AT_call_line
+.b8 80                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x140:0x18 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp20                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 243                                 // DW_AT_call_line
+.b8 46                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.source b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..003e79d3962a0d73b491298810f1598181bcc406
--- /dev/null
+++ b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.source
@@ -0,0 +1,486 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":18:0)
+#loc88 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":216:0)
+#loc101 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":133:0)
+#loc105 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":242:0)
+#loc107 = loc(unknown)
+#loc110 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":230:0)
+#loc125 = loc("in_ptr0"(#loc))
+#loc126 = loc("in_ptr1"(#loc))
+#loc127 = loc("in_ptr2"(#loc))
+#loc128 = loc("in_ptr3"(#loc))
+#loc129 = loc("in_ptr4"(#loc))
+#loc130 = loc("out_ptr0"(#loc))
+#loc131 = loc("out_ptr3"(#loc))
+#loc132 = loc("xnumel"(#loc))
+#loc133 = loc("r0_numel"(#loc))
+#loc201 = loc("value"(#loc88))
+#loc202 = loc("mean"(#loc88))
+#loc203 = loc("m2"(#loc88))
+#loc204 = loc("weight"(#loc88))
+#loc205 = loc("first_iteration"(#loc88))
+#loc215 = loc("input"(#loc101))
+#loc216 = loc("mean"(#loc105))
+#loc217 = loc("m2"(#loc105))
+#loc218 = loc("weight"(#loc105))
+#loc219 = loc("mean_1"(#loc110))
+#loc220 = loc("m2_1"(#loc110))
+#loc221 = loc("weight_1"(#loc110))
+#loc222 = loc("mean_2"(#loc110))
+#loc223 = loc("m2_2"(#loc110))
+#loc224 = loc("weight_2"(#loc110))
+#loc231 = loc("new_mean"(#loc201))
+module {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 2048 : i32 loc(#loc134)
+    %r0_numel_1 = arith.constant 4096 : i32 loc(#loc135)
+    %xoffset = tt.get_program_id x : i32 loc(#loc136)
+    %xoffset_2 = arith.constant 1 : i32 loc(#loc137)
+    %xoffset_3 = arith.constant 1 : i32 loc(#loc137)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc137)
+    %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc138)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc139)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc140)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc140)
+    %xmask = arith.constant dense<2048> : tensor<1x1xi32> loc(#loc141)
+    %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc141)
+    %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32> loc(#loc142)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32> -> tensor<1x4096xi32> loc(#loc143)
+    %tmp7_mean = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc144)
+    %tmp7_m2 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc145)
+    %tmp7_weight = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc146)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc14)
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc14)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14)
+    %2 = arith.bitcast %c4096_i32 : i32 to i32 loc(#loc14)
+    %3 = ub.poison : i32 loc(#loc14)
+    %tmp7_weight_10:3 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%tmp7_mean_13 = %tmp7_mean, %tmp7_m2_14 = %tmp7_m2, %tmp7_weight_15 = %tmp7_weight) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4096xi32> loc(#loc148)
+      %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x4096xi32> loc(#loc148)
+      %r0_mask = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc149)
+      %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x4096xi32> loc(#loc149)
+      %tmp0 = arith.constant 4096 : i32 loc(#loc150)
+      %tmp0_18 = arith.constant 4096 : i32 loc(#loc150)
+      %tmp0_19 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc150)
+      %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc150)
+      %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc151)
+      %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x4096xi32> loc(#loc151)
+      %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc152)
+      %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc152)
+      %tmp0_25 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc153)
+      %tmp0_26 = arith.andi %r0_mask_17, %tmp0_25 : tensor<1x4096xi1> loc(#loc153)
+      %tmp0_27 = arith.constant 0.000000e+00 : f32 loc(#loc154)
+      %tmp0_28 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc154)
+      %tmp0_29 = arith.truncf %tmp0_28 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc154)
+      %tmp0_30 = tt.load %tmp0_24, %tmp0_26, %tmp0_29 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>> loc(#loc154)
+      %tmp0_31 = arith.extf %tmp0_30 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc155)
+      %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc156)
+      %tmp1_32 = tt.addptr %tmp1, %r0_index_16 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc156)
+      %tmp1_33 = arith.constant 0.000000e+00 : f32 loc(#loc157)
+      %tmp1_34 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc157)
+      %tmp1_35 = arith.truncf %tmp1_34 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc157)
+      %tmp1_36 = tt.load %tmp1_32, %r0_mask_17, %tmp1_35 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc157)
+      %tmp1_37 = arith.extf %tmp1_36 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc158)
+      %tmp2 = arith.constant 4096 : i32 loc(#loc159)
+      %tmp2_38 = arith.constant 4096 : i32 loc(#loc159)
+      %tmp2_39 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc159)
+      %tmp2_40 = arith.muli %tmp2_39, %xindex_7 : tensor<1x1xi32> loc(#loc159)
+      %tmp2_41 = tt.broadcast %tmp2_40 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc160)
+      %tmp2_42 = arith.addi %r0_index_16, %tmp2_41 : tensor<1x4096xi32> loc(#loc160)
+      %tmp2_43 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc161)
+      %tmp2_44 = tt.addptr %tmp2_43, %tmp2_42 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc161)
+      %tmp2_45 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc162)
+      %tmp2_46 = arith.andi %r0_mask_17, %tmp2_45 : tensor<1x4096xi1> loc(#loc162)
+      %tmp2_47 = arith.constant 0.000000e+00 : f32 loc(#loc163)
+      %tmp2_48 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc163)
+      %tmp2_49 = arith.truncf %tmp2_48 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc163)
+      %tmp2_50 = tt.load %tmp2_44, %tmp2_46, %tmp2_49 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>> loc(#loc163)
+      %tmp2_51 = arith.extf %tmp2_50 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc164)
+      %tmp3 = arith.mulf %tmp1_37, %tmp2_51 : tensor<1x4096xf32> loc(#loc165)
+      %tmp4 = arith.addf %tmp0_31, %tmp3 : tensor<1x4096xf32> loc(#loc166)
+      %c0_i32_52 = arith.constant 0 : i32 loc(#loc34)
+      %9 = arith.cmpi eq, %r0_offset, %c0_i32_52 : i32 loc(#loc34)
+      %10:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_u1__(%tmp4, %tmp7_mean_13, %tmp7_m2_14, %tmp7_weight_15, %9) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>, i1) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) loc(#loc35)
+      %tmp7_mean_53 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc167)
+      %tmp7_mean_54 = arith.andi %r0_mask_17, %tmp7_mean_53 : tensor<1x4096xi1> loc(#loc167)
+      %tmp7_mean_55 = arith.select %tmp7_mean_54, %10#0, %tmp7_mean_13 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc168)
+      %tmp7_m2_56 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc169)
+      %tmp7_m2_57 = arith.andi %r0_mask_17, %tmp7_m2_56 : tensor<1x4096xi1> loc(#loc169)
+      %tmp7_m2_58 = arith.select %tmp7_m2_57, %10#1, %tmp7_m2_14 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc170)
+      %tmp7_weight_59 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc171)
+      %tmp7_weight_60 = arith.andi %r0_mask_17, %tmp7_weight_59 : tensor<1x4096xi1> loc(#loc171)
+      %tmp7_weight_61 = arith.select %tmp7_weight_60, %10#2, %tmp7_weight_15 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc172)
+      %c4096_i32_62 = arith.constant 4096 : i32 loc(#loc42)
+      %c4096_i32_63 = arith.constant 4096 : i32 loc(#loc42)
+      %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc42)
+      %11 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc42)
+      %12 = tt.broadcast %11 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc43)
+      %13 = arith.addi %r0_index_16, %12 : tensor<1x4096xi32> loc(#loc43)
+      %14 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc44)
+      %15 = tt.addptr %14, %13 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc44)
+      %16 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc45)
+      %17 = arith.andi %r0_mask_17, %16 : tensor<1x4096xi1> loc(#loc45)
+      %18 = arith.truncf %tmp4 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc46)
+      tt.store %15, %18, %17 : tensor<1x4096x!tt.ptr<bf16>> loc(#loc46)
+      scf.yield %tmp7_mean_55, %tmp7_m2_58, %tmp7_weight_61 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc47)
+    } loc(#loc237)
+    %4:3 = tt.call @"torch._inductor.runtime.triton_helpers.welford__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S__(3,)cconstexpr_1_"(%tmp7_weight_10#0, %tmp7_weight_10#1, %tmp7_weight_10#2) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc48)
+    %tmp7 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc173)
+    %tmp11 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc174)
+    %tmp12 = tt.expand_dims %4#2 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc175)
+    %c0_i32_11 = arith.constant 0 : i32 loc(#loc52)
+    %c4096_i32_12 = arith.constant 4096 : i32 loc(#loc52)
+    %5 = arith.bitcast %c0_i32_11 : i32 to i32 loc(#loc52)
+    %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc52)
+    %7 = arith.bitcast %c4096_i32_12 : i32 to i32 loc(#loc52)
+    %8 = ub.poison : i32 loc(#loc52)
+    scf.for %r0_offset = %5 to %6 step %7  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4096xi32> loc(#loc176)
+      %r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x4096xi32> loc(#loc176)
+      %r0_mask = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc177)
+      %r0_mask_14 = arith.cmpi slt, %r0_index_13, %r0_mask : tensor<1x4096xi32> loc(#loc177)
+      %tmp13 = arith.constant 4096 : i32 loc(#loc178)
+      %tmp13_15 = arith.constant 4096 : i32 loc(#loc178)
+      %tmp13_16 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc178)
+      %tmp13_17 = arith.muli %tmp13_16, %xindex_7 : tensor<1x1xi32> loc(#loc178)
+      %tmp13_18 = tt.broadcast %tmp13_17 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc179)
+      %tmp13_19 = arith.addi %r0_index_13, %tmp13_18 : tensor<1x4096xi32> loc(#loc179)
+      %tmp13_20 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc180)
+      %tmp13_21 = tt.addptr %tmp13_20, %tmp13_19 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc180)
+      %tmp13_22 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc181)
+      %tmp13_23 = arith.andi %r0_mask_14, %tmp13_22 : tensor<1x4096xi1> loc(#loc181)
+      %tmp13_24 = arith.constant 0.000000e+00 : f32 loc(#loc182)
+      %tmp13_25 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc182)
+      %tmp13_26 = arith.truncf %tmp13_25 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc182)
+      %tmp13_27 = tt.load %tmp13_21, %tmp13_23, %tmp13_26 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>> loc(#loc182)
+      %tmp13_28 = arith.extf %tmp13_27 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc183)
+      %tmp23 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc184)
+      %tmp23_29 = tt.addptr %tmp23, %r0_index_13 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc184)
+      %tmp23_30 = arith.constant 0.000000e+00 : f32 loc(#loc185)
+      %tmp23_31 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc185)
+      %tmp23_32 = arith.truncf %tmp23_31 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc185)
+      %tmp23_33 = tt.load %tmp23_29, %r0_mask_14, %tmp23_32 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc185)
+      %tmp23_34 = arith.extf %tmp23_33 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc186)
+      %tmp27 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc187)
+      %tmp27_35 = tt.addptr %tmp27, %r0_index_13 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc187)
+      %tmp27_36 = arith.constant 0.000000e+00 : f32 loc(#loc188)
+      %tmp27_37 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc188)
+      %tmp27_38 = arith.truncf %tmp27_37 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc188)
+      %tmp27_39 = tt.load %tmp27_35, %r0_mask_14, %tmp27_38 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc188)
+      %tmp27_40 = arith.extf %tmp27_39 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc189)
+      %tmp15 = tt.broadcast %tmp7 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc190)
+      %tmp15_41 = arith.subf %tmp13_28, %tmp15 : tensor<1x4096xf32> loc(#loc190)
+      %tmp16 = arith.constant 4.096000e+03 : f32 loc(#loc191)
+      %tmp17 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc192)
+      %tmp17_42 = arith.divf %tmp11, %tmp17 : tensor<1x1xf32> loc(#loc192)
+      %tmp18 = arith.constant 9.99999997E-7 : f32 loc(#loc193)
+      %tmp19 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc194)
+      %tmp19_43 = arith.addf %tmp17_42, %tmp19 : tensor<1x1xf32> loc(#loc194)
+      %tmp20 = tt.extern_elementwise %tmp19_43 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc195)
+      %tmp21 = tt.broadcast %tmp20 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc196)
+      %tmp21_44 = arith.mulf %tmp15_41, %tmp21 : tensor<1x4096xf32> loc(#loc196)
+      %tmp24 = arith.constant 1.000000e+00 : f32 loc(#loc197)
+      %tmp25 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc198)
+      %tmp25_45 = arith.addf %tmp23_34, %tmp25 : tensor<1x4096xf32> loc(#loc198)
+      %tmp26 = arith.mulf %tmp21_44, %tmp25_45 : tensor<1x4096xf32> loc(#loc199)
+      %tmp28 = arith.addf %tmp26, %tmp27_40 : tensor<1x4096xf32> loc(#loc200)
+      %c4096_i32_46 = arith.constant 4096 : i32 loc(#loc78)
+      %c4096_i32_47 = arith.constant 4096 : i32 loc(#loc78)
+      %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc78)
+      %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc78)
+      %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc79)
+      %11 = arith.addi %r0_index_13, %10 : tensor<1x4096xi32> loc(#loc79)
+      %12 = tt.splat %out_ptr3 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc80)
+      %13 = tt.addptr %12, %11 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc80)
+      %14 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc81)
+      %15 = arith.andi %r0_mask_14, %14 : tensor<1x4096xi1> loc(#loc81)
+      %16 = arith.truncf %tmp28 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc82)
+      tt.store %13, %16, %15 : tensor<1x4096x!tt.ptr<bf16>> loc(#loc82)
+    } loc(#loc52)
+    tt.return loc(#loc83)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() -> tensor<1x4096xf32> attributes {noinline = false} {
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc85)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc85)
+    tt.return %cst_0 : tensor<1x4096xf32> loc(#loc86)
+  ^bb1:  // no predecessors
+    %0 = ub.poison : tensor<1x4096xf32> loc(#loc87)
+    tt.return %0 : tensor<1x4096xf32> loc(#loc87)
+  } loc(#loc84)
+  tt.func private @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_u1__(%new_mean: tensor<1x4096xf32> loc("new_mean"(#loc201)), %mean: tensor<1x4096xf32> loc("mean"(#loc88)), %m2: tensor<1x4096xf32> loc("m2"(#loc88)), %weight: tensor<1x4096xf32> loc("weight"(#loc88)), %first_iteration: i1 loc("first_iteration"(#loc88))) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) attributes {noinline = false} {
+    %0:3 = scf.if %first_iteration -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) {
+      %new_weight = arith.constant 1.000000e+00 : f32 loc(#loc206)
+      %new_weight_0 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc232)
+      %new_m2 = tt.call @triton.language.standard.zeros_like__fp32S1_4096S__(%m2) : (tensor<1x4096xf32>) -> tensor<1x4096xf32> loc(#loc233)
+      scf.yield %new_m2, %new_mean, %new_weight_0 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc233)
+    } else {
+      %delta = arith.subf %new_mean, %mean : tensor<1x4096xf32> loc(#loc208)
+      %new_weight = arith.constant 1 : i32 loc(#loc209)
+      %new_weight_0 = arith.constant 1.000000e+00 : f32 loc(#loc209)
+      %new_weight_1 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc209)
+      %new_weight_2 = arith.addf %weight, %new_weight_1 : tensor<1x4096xf32> loc(#loc234)
+      %new_mean_3 = arith.divf %delta, %new_weight_2 : tensor<1x4096xf32> loc(#loc210)
+      %new_mean_4 = arith.addf %mean, %new_mean_3 : tensor<1x4096xf32> loc(#loc235)
+      %new_m2 = arith.subf %new_mean, %new_mean_4 : tensor<1x4096xf32> loc(#loc212)
+      %new_m2_5 = arith.mulf %delta, %new_m2 : tensor<1x4096xf32> loc(#loc213)
+      %new_m2_6 = arith.addf %m2, %new_m2_5 : tensor<1x4096xf32> loc(#loc236)
+      scf.yield %new_m2_6, %new_mean_4, %new_weight_2 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc214)
+    } loc(#loc89)
+    tt.return %0#1, %0#0, %0#2 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc99)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1x4096xf32> loc(#loc100)
+    %2 = ub.poison : tensor<1x4096xf32> loc(#loc100)
+    %3 = ub.poison : tensor<1x4096xf32> loc(#loc100)
+    tt.return %1, %2, %3 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc100)
+  } loc(#loc88)
+  tt.func private @triton.language.standard.zeros_like__fp32S1_4096S__(%input: tensor<1x4096xf32> loc("input"(#loc101))) -> tensor<1x4096xf32> attributes {noinline = false} {
+    %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc102)
+    tt.return %0 : tensor<1x4096xf32> loc(#loc103)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1x4096xf32> loc(#loc104)
+    tt.return %1 : tensor<1x4096xf32> loc(#loc104)
+  } loc(#loc101)
+  tt.func private @"torch._inductor.runtime.triton_helpers.welford__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S__(3,)cconstexpr_1_"(%mean: tensor<1x4096xf32> loc("mean"(#loc105)), %m2: tensor<1x4096xf32> loc("m2"(#loc105)), %weight: tensor<1x4096xf32> loc("weight"(#loc105))) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} {
+    %0:3 = "tt.reduce"(%mean, %m2, %weight) <{axis = 1 : i32}> ({
+    ^bb0(%arg3: f32 loc(unknown), %arg4: f32 loc(unknown), %arg5: f32 loc(unknown), %arg6: f32 loc(unknown), %arg7: f32 loc(unknown), %arg8: f32 loc(unknown)):
+      %4:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (f32, f32, f32, f32, f32, f32) -> (f32, f32, f32) loc(#loc106)
+      tt.reduce.return %4#0, %4#1, %4#2 : f32, f32, f32 loc(#loc106)
+    }) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc106)
+    tt.return %0#0, %0#1, %0#2 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc108)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1xf32> loc(#loc109)
+    %2 = ub.poison : tensor<1xf32> loc(#loc109)
+    %3 = ub.poison : tensor<1xf32> loc(#loc109)
+    tt.return %1, %2, %3 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc109)
+  } loc(#loc105)
+  tt.func private @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%mean_1: f32 loc("mean_1"(#loc110)), %m2_1: f32 loc("m2_1"(#loc110)), %weight_1: f32 loc("weight_1"(#loc110)), %mean_2: f32 loc("mean_2"(#loc110)), %m2_2: f32 loc("m2_2"(#loc110)), %weight_2: f32 loc("weight_2"(#loc110))) -> (f32, f32, f32) attributes {noinline = false} {
+    %delta = arith.subf %mean_2, %mean_1 : f32 loc(#loc225)
+    %new_weight = arith.addf %weight_1, %weight_2 : f32 loc(#loc226)
+    %w2_over_w = arith.constant 0.000000e+00 : f32 loc(#loc227)
+    %w2_over_w_0 = arith.cmpf oeq, %new_weight, %w2_over_w : f32 loc(#loc227)
+    %w2_over_w_1 = arith.divf %weight_2, %new_weight : f32 loc(#loc228)
+    %w2_over_w_2 = arith.constant 0.000000e+00 : f32 loc(#loc229)
+    %w2_over_w_3 = arith.constant 0.000000e+00 : f32 loc(#loc229)
+    %w2_over_w_4 = arith.select %w2_over_w_0, %w2_over_w_3, %w2_over_w_1 : f32 loc(#loc229)
+    %0 = arith.mulf %delta, %w2_over_w_4 : f32 loc(#loc116)
+    %1 = arith.addf %mean_1, %0 : f32 loc(#loc117)
+    %2 = arith.addf %m2_1, %m2_2 : f32 loc(#loc118)
+    %3 = arith.mulf %delta, %delta : f32 loc(#loc119)
+    %4 = arith.mulf %3, %weight_1 : f32 loc(#loc120)
+    %5 = arith.mulf %4, %w2_over_w_4 : f32 loc(#loc121)
+    %6 = arith.addf %2, %5 : f32 loc(#loc122)
+    tt.return %1, %6, %new_weight : f32, f32, f32 loc(#loc123)
+  ^bb1:  // no predecessors
+    %7 = ub.poison : f32 loc(#loc124)
+    %8 = ub.poison : f32 loc(#loc124)
+    %9 = ub.poison : f32 loc(#loc124)
+    tt.return %7, %8, %9 : f32, f32, f32 loc(#loc124)
+  } loc(#loc110)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":25:21)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":29:45)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":30:43)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":31:47)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:46)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:34)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:61)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:51)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:113)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:34)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:41)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:94)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:46)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:41)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:34)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:61)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:51)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:113)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":41:22)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":42:22)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":46:62)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":46:51)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":48:39)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":48:62)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":49:37)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":49:58)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":50:41)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":50:66)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:41)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:36)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:29)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:62)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:52)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:8)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":52:80)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":53:16)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":54:17)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":55:18)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":56:43)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":57:31)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":58:29)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:48)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:43)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:36)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:63)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:53)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:115)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:35)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:42)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:95)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:35)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:42)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:95)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":66:24)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":67:16)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":68:25)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":69:16)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":70:24)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":71:32)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":72:24)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":74:16)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":75:24)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":76:24)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":77:24)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:41)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:36)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:29)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:63)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:53)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":56:4)
+#loc84 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":120:0)
+#loc85 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:31)
+#loc86 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:11)
+#loc87 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:4)
+#loc89 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7)
+#loc90 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":218:46)
+#loc91 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31)
+#loc92 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24)
+#loc93 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30)
+#loc94 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34)
+#loc95 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26)
+#loc96 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39)
+#loc97 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31)
+#loc98 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22)
+#loc99 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:11)
+#loc100 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:4)
+#loc102 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:30)
+#loc103 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:11)
+#loc104 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:4)
+#loc106 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc108 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:11)
+#loc109 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:4)
+#loc111 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc112 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc113 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc114 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc115 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc116 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc117 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc118 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc119 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc120 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc121 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc122 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc123 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:11)
+#loc124 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:4)
+#loc134 = loc("xnumel"(#loc1))
+#loc135 = loc("r0_numel"(#loc2))
+#loc136 = loc("xoffset"(#loc3))
+#loc137 = loc("xoffset"(#loc4))
+#loc138 = loc("xindex"(#loc5))
+#loc139 = loc("xindex"(#loc6))
+#loc140 = loc("xindex"(#loc7))
+#loc141 = loc("xmask"(#loc8))
+#loc142 = loc("r0_base"(#loc9))
+#loc143 = loc("r0_base"(#loc10))
+#loc144 = loc("tmp7_mean"(#loc11))
+#loc145 = loc("tmp7_m2"(#loc12))
+#loc146 = loc("tmp7_weight"(#loc13))
+#loc147 = loc("tmp7_mean"(#loc14))
+#loc148 = loc("r0_index"(#loc15))
+#loc149 = loc("r0_mask"(#loc16))
+#loc150 = loc("tmp0"(#loc17))
+#loc151 = loc("tmp0"(#loc18))
+#loc152 = loc("tmp0"(#loc19))
+#loc153 = loc("tmp0"(#loc20))
+#loc154 = loc("tmp0"(#loc21))
+#loc155 = loc("tmp0"(#loc22))
+#loc156 = loc("tmp1"(#loc23))
+#loc157 = loc("tmp1"(#loc24))
+#loc158 = loc("tmp1"(#loc25))
+#loc159 = loc("tmp2"(#loc26))
+#loc160 = loc("tmp2"(#loc27))
+#loc161 = loc("tmp2"(#loc28))
+#loc162 = loc("tmp2"(#loc29))
+#loc163 = loc("tmp2"(#loc30))
+#loc164 = loc("tmp2"(#loc31))
+#loc165 = loc("tmp3"(#loc32))
+#loc166 = loc("tmp4"(#loc33))
+#loc167 = loc("tmp7_mean"(#loc36))
+#loc168 = loc("tmp7_mean"(#loc37))
+#loc169 = loc("tmp7_m2"(#loc38))
+#loc170 = loc("tmp7_m2"(#loc39))
+#loc171 = loc("tmp7_weight"(#loc40))
+#loc172 = loc("tmp7_weight"(#loc41))
+#loc173 = loc("tmp7"(#loc49))
+#loc174 = loc("tmp11"(#loc50))
+#loc175 = loc("tmp12"(#loc51))
+#loc176 = loc("r0_index"(#loc53))
+#loc177 = loc("r0_mask"(#loc54))
+#loc178 = loc("tmp13"(#loc55))
+#loc179 = loc("tmp13"(#loc56))
+#loc180 = loc("tmp13"(#loc57))
+#loc181 = loc("tmp13"(#loc58))
+#loc182 = loc("tmp13"(#loc59))
+#loc183 = loc("tmp13"(#loc60))
+#loc184 = loc("tmp23"(#loc61))
+#loc185 = loc("tmp23"(#loc62))
+#loc186 = loc("tmp23"(#loc63))
+#loc187 = loc("tmp27"(#loc64))
+#loc188 = loc("tmp27"(#loc65))
+#loc189 = loc("tmp27"(#loc66))
+#loc190 = loc("tmp15"(#loc67))
+#loc191 = loc("tmp16"(#loc68))
+#loc192 = loc("tmp17"(#loc69))
+#loc193 = loc("tmp18"(#loc70))
+#loc194 = loc("tmp19"(#loc71))
+#loc195 = loc("tmp20"(#loc72))
+#loc196 = loc("tmp21"(#loc73))
+#loc197 = loc("tmp24"(#loc74))
+#loc198 = loc("tmp25"(#loc75))
+#loc199 = loc("tmp26"(#loc76))
+#loc200 = loc("tmp28"(#loc77))
+#loc206 = loc("new_weight"(#loc90))
+#loc207 = loc("new_m2"(#loc91))
+#loc208 = loc("delta"(#loc92))
+#loc209 = loc("new_weight"(#loc93))
+#loc210 = loc("new_mean"(#loc94))
+#loc211 = loc("new_mean"(#loc95))
+#loc212 = loc("new_m2"(#loc96))
+#loc213 = loc("new_m2"(#loc97))
+#loc214 = loc("new_m2"(#loc98))
+#loc225 = loc("delta"(#loc111))
+#loc226 = loc("new_weight"(#loc112))
+#loc227 = loc("w2_over_w"(#loc113))
+#loc228 = loc("w2_over_w"(#loc114))
+#loc229 = loc("w2_over_w"(#loc115))
+#loc230 = loc("tmp7_m2"(#loc147))
+#loc232 = loc("new_weight"(#loc206))
+#loc233 = loc("new_m2"(#loc207))
+#loc234 = loc("new_weight"(#loc209))
+#loc235 = loc("new_mean"(#loc211))
+#loc236 = loc("new_m2"(#loc214))
+#loc237 = loc("tmp7_weight"(#loc230))
diff --git a/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.ttgir b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..c5c42efca1ff3bc8bd7058afe91a8602915ef9c3
--- /dev/null
+++ b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.ttgir
@@ -0,0 +1,214 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":18:0)
+#loc1 = loc(unknown)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":52:80)
+#loc59 = loc("in_ptr0"(#loc))
+#loc60 = loc("in_ptr1"(#loc))
+#loc61 = loc("in_ptr2"(#loc))
+#loc62 = loc("in_ptr3"(#loc))
+#loc63 = loc("in_ptr4"(#loc))
+#loc64 = loc("out_ptr0"(#loc))
+#loc65 = loc("out_ptr3"(#loc))
+#loc66 = loc("xnumel"(#loc))
+#loc67 = loc("r0_numel"(#loc))
+#loc89 = loc(callsite(#loc1 at #loc25))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4096> : tensor<1x4096xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xbf16, #blocked> loc(#loc1)
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
+    %c2048_i32 = arith.constant 2048 : i32 loc(#loc1)
+    %cst_1 = arith.constant 0.000000e+00 : f32 loc(#loc1)
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32, #blocked> loc(#loc1)
+    %cst_5 = arith.constant dense<4.096000e+03> : tensor<1x1xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc68)
+    %xmask = arith.cmpi slt, %xoffset, %c2048_i32 : i32 loc(#loc69)
+    %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc70)
+    %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4096xi32, #blocked> loc(#loc70)
+    %r0_mask = arith.cmpi slt, %r0_base_6, %cst : tensor<1x4096xi32, #blocked> loc(#loc71)
+    %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc72)
+    %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x4096xi32, #blocked> loc(#loc113)
+    %tmp0_8 = arith.addi %r0_base_6, %tmp0_7 : tensor<1x4096xi32, #blocked> loc(#loc73)
+    %tmp0_9 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc74)
+    %tmp0_10 = tt.addptr %tmp0_9, %tmp0_8 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc74)
+    %tmp0_11 = tt.splat %xmask : i1 -> tensor<1x4096xi1, #blocked> loc(#loc114)
+    %tmp0_12 = arith.andi %r0_mask, %tmp0_11 : tensor<1x4096xi1, #blocked> loc(#loc75)
+    %tmp0_13 = tt.load %tmp0_10, %tmp0_12, %cst_0 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc76)
+    %tmp0_14 = arith.extf %tmp0_13 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc77)
+    %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc78)
+    %tmp1_15 = tt.addptr %tmp1, %r0_base_6 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc78)
+    %tmp1_16 = tt.load %tmp1_15, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc79)
+    %tmp1_17 = arith.extf %tmp1_16 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc80)
+    %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc81)
+    %tmp2_18 = tt.addptr %tmp2, %tmp0_8 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc81)
+    %tmp2_19 = tt.load %tmp2_18, %tmp0_12, %cst_0 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc82)
+    %tmp2_20 = arith.extf %tmp2_19 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc83)
+    %tmp3 = arith.mulf %tmp1_17, %tmp2_20 : tensor<1x4096xf32, #blocked> loc(#loc84)
+    %tmp4 = arith.addf %tmp0_14, %tmp3 : tensor<1x4096xf32, #blocked> loc(#loc85)
+    %tmp7_mean = arith.select %tmp0_12, %tmp4, %cst_2 : tensor<1x4096xi1, #blocked>, tensor<1x4096xf32, #blocked> loc(#loc86)
+    %tmp7_weight = arith.select %tmp0_12, %cst_3, %cst_2 : tensor<1x4096xi1, #blocked>, tensor<1x4096xf32, #blocked> loc(#loc87)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc22)
+    %1 = tt.addptr %0, %tmp0_8 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc22)
+    %2 = arith.truncf %tmp4 : tensor<1x4096xf32, #blocked> to tensor<1x4096xbf16, #blocked> loc(#loc23)
+    tt.store %1, %2, %tmp0_12 : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc23)
+    %3:3 = "tt.reduce"(%tmp7_mean, %cst_2, %tmp7_weight) <{axis = 1 : i32}> ({
+    ^bb0(%arg9: f32 loc(callsite(#loc1 at #loc25)), %arg10: f32 loc(callsite(#loc1 at #loc25)), %arg11: f32 loc(callsite(#loc1 at #loc25)), %arg12: f32 loc(callsite(#loc1 at #loc25)), %arg13: f32 loc(callsite(#loc1 at #loc25)), %arg14: f32 loc(callsite(#loc1 at #loc25))):
+      %delta = arith.subf %arg12, %arg9 : f32 loc(#loc115)
+      %new_weight = arith.addf %arg11, %arg14 : f32 loc(#loc116)
+      %w2_over_w = arith.cmpf oeq, %new_weight, %cst_1 : f32 loc(#loc117)
+      %w2_over_w_30 = arith.divf %arg14, %new_weight : f32 loc(#loc118)
+      %w2_over_w_31 = arith.select %w2_over_w, %cst_1, %w2_over_w_30 : f32 loc(#loc119)
+      %7 = arith.mulf %delta, %w2_over_w_31 : f32 loc(#loc120)
+      %8 = arith.addf %arg9, %7 : f32 loc(#loc121)
+      %9 = arith.addf %arg10, %arg13 : f32 loc(#loc122)
+      %10 = arith.mulf %delta, %delta : f32 loc(#loc123)
+      %11 = arith.mulf %10, %arg11 : f32 loc(#loc124)
+      %12 = arith.mulf %11, %w2_over_w_31 : f32 loc(#loc125)
+      %13 = arith.addf %9, %12 : f32 loc(#loc126)
+      tt.reduce.return %8, %13, %new_weight : f32, f32, f32 loc(#loc88)
+    }) : (tensor<1x4096xf32, #blocked>, tensor<1x4096xf32, #blocked>, tensor<1x4096xf32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc88)
+    %tmp7 = tt.expand_dims %3#0 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc95)
+    %tmp11 = tt.expand_dims %3#1 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc96)
+    %tmp13 = tt.load %1, %tmp0_12, %cst_0 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc97)
+    %tmp13_21 = arith.extf %tmp13 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc98)
+    %tmp23 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc99)
+    %tmp23_22 = tt.addptr %tmp23, %r0_base_6 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc99)
+    %tmp23_23 = tt.load %tmp23_22, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc100)
+    %tmp23_24 = arith.extf %tmp23_23 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc101)
+    %tmp27 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc102)
+    %tmp27_25 = tt.addptr %tmp27, %r0_base_6 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc102)
+    %tmp27_26 = tt.load %tmp27_25, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc103)
+    %tmp27_27 = arith.extf %tmp27_26 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc104)
+    %tmp15 = tt.broadcast %tmp7 : tensor<1x1xf32, #blocked> -> tensor<1x4096xf32, #blocked> loc(#loc105)
+    %tmp15_28 = arith.subf %tmp13_21, %tmp15 : tensor<1x4096xf32, #blocked> loc(#loc105)
+    %tmp17 = arith.divf %tmp11, %cst_5 : tensor<1x1xf32, #blocked> loc(#loc106)
+    %tmp19 = arith.addf %tmp17, %cst_4 : tensor<1x1xf32, #blocked> loc(#loc107)
+    %tmp20 = tt.extern_elementwise %tmp19 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked>) -> tensor<1x1xf32, #blocked> loc(#loc108)
+    %tmp21 = tt.broadcast %tmp20 : tensor<1x1xf32, #blocked> -> tensor<1x4096xf32, #blocked> loc(#loc109)
+    %tmp21_29 = arith.mulf %tmp15_28, %tmp21 : tensor<1x4096xf32, #blocked> loc(#loc109)
+    %tmp25 = arith.addf %tmp23_24, %cst_3 : tensor<1x4096xf32, #blocked> loc(#loc110)
+    %tmp26 = arith.mulf %tmp21_29, %tmp25 : tensor<1x4096xf32, #blocked> loc(#loc111)
+    %tmp28 = arith.addf %tmp26, %tmp27_27 : tensor<1x4096xf32, #blocked> loc(#loc112)
+    %4 = tt.splat %out_ptr3 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc56)
+    %5 = tt.addptr %4, %tmp0_8 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc56)
+    %6 = arith.truncf %tmp28 : tensor<1x4096xf32, #blocked> to tensor<1x4096xbf16, #blocked> loc(#loc57)
+    tt.store %5, %6, %tmp0_12 : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc57)
+    tt.return loc(#loc58)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":25:21)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":26:37)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":34:29)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:46)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:41)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:34)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:61)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:51)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:113)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:34)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:41)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:94)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:34)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:51)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:113)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":41:22)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":42:22)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":48:62)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":50:66)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:29)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:52)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc28 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc29 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc30 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc35 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc37 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":53:16)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":54:17)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:53)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:115)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:35)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:42)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:95)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:35)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:42)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:95)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":66:24)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":68:25)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":70:24)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":71:32)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":72:24)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":75:24)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":76:24)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":77:24)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:29)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:53)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":56:4)
+#loc68 = loc("xoffset"(#loc2))
+#loc69 = loc("xmask"(#loc3))
+#loc70 = loc("r0_base"(#loc4))
+#loc71 = loc("r0_mask"(#loc5))
+#loc72 = loc("tmp0"(#loc6))
+#loc73 = loc("tmp0"(#loc7))
+#loc74 = loc("tmp0"(#loc8))
+#loc75 = loc("tmp0"(#loc9))
+#loc76 = loc("tmp0"(#loc10))
+#loc77 = loc("tmp0"(#loc11))
+#loc78 = loc("tmp1"(#loc12))
+#loc79 = loc("tmp1"(#loc13))
+#loc80 = loc("tmp1"(#loc14))
+#loc81 = loc("tmp2"(#loc15))
+#loc82 = loc("tmp2"(#loc16))
+#loc83 = loc("tmp2"(#loc17))
+#loc84 = loc("tmp3"(#loc18))
+#loc85 = loc("tmp4"(#loc19))
+#loc86 = loc("tmp7_mean"(#loc20))
+#loc87 = loc("tmp7_weight"(#loc21))
+#loc88 = loc(callsite(#loc24 at #loc25))
+#loc90 = loc("delta"(#loc26))
+#loc91 = loc("new_weight"(#loc27))
+#loc92 = loc("w2_over_w"(#loc28))
+#loc93 = loc("w2_over_w"(#loc29))
+#loc94 = loc("w2_over_w"(#loc30))
+#loc95 = loc("tmp7"(#loc38))
+#loc96 = loc("tmp11"(#loc39))
+#loc97 = loc("tmp13"(#loc40))
+#loc98 = loc("tmp13"(#loc41))
+#loc99 = loc("tmp23"(#loc42))
+#loc100 = loc("tmp23"(#loc43))
+#loc101 = loc("tmp23"(#loc44))
+#loc102 = loc("tmp27"(#loc45))
+#loc103 = loc("tmp27"(#loc46))
+#loc104 = loc("tmp27"(#loc47))
+#loc105 = loc("tmp15"(#loc48))
+#loc106 = loc("tmp17"(#loc49))
+#loc107 = loc("tmp19"(#loc50))
+#loc108 = loc("tmp20"(#loc51))
+#loc109 = loc("tmp21"(#loc52))
+#loc110 = loc("tmp25"(#loc53))
+#loc111 = loc("tmp26"(#loc54))
+#loc112 = loc("tmp28"(#loc55))
+#loc113 = loc(fused[#loc73, #loc72])
+#loc114 = loc(fused[#loc75, #loc69])
+#loc115 = loc(callsite(#loc90 at #loc88))
+#loc116 = loc(callsite(#loc91 at #loc88))
+#loc117 = loc(callsite(#loc92 at #loc88))
+#loc118 = loc(callsite(#loc93 at #loc88))
+#loc119 = loc(callsite(#loc94 at #loc88))
+#loc120 = loc(callsite(#loc31 at #loc88))
+#loc121 = loc(callsite(#loc32 at #loc88))
+#loc122 = loc(callsite(#loc33 at #loc88))
+#loc123 = loc(callsite(#loc34 at #loc88))
+#loc124 = loc(callsite(#loc35 at #loc88))
+#loc125 = loc(callsite(#loc36 at #loc88))
+#loc126 = loc(callsite(#loc37 at #loc88))
diff --git a/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.ttir b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..a893ef79d8c04071ca6424f9df1c39a003b77ac0
--- /dev/null
+++ b/triton/TADKEHLBMNHPYJR7HM24RCH2SQBJWQ5J3N2BE4JL76WWKUZUI7BA/triton_red_fused_add_mul_native_layer_norm_0.ttir
@@ -0,0 +1,215 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":18:0)
+#loc1 = loc(unknown)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":52:80)
+#loc60 = loc("in_ptr0"(#loc))
+#loc61 = loc("in_ptr1"(#loc))
+#loc62 = loc("in_ptr2"(#loc))
+#loc63 = loc("in_ptr3"(#loc))
+#loc64 = loc("in_ptr4"(#loc))
+#loc65 = loc("out_ptr0"(#loc))
+#loc66 = loc("out_ptr3"(#loc))
+#loc67 = loc("xnumel"(#loc))
+#loc68 = loc("r0_numel"(#loc))
+#loc70 = loc(callsite(#loc1 at #loc3))
+module {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
+    %xmask = arith.constant 2048 : i32 loc(#loc69)
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc70)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc1)
+    %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x4096xbf16> loc(#loc1)
+    %cst_2 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc1)
+    %cst_3 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc1)
+    %cst_5 = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc71)
+    %xmask_6 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc69)
+    %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32> loc(#loc72)
+    %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32> -> tensor<1x4096xi32> loc(#loc73)
+    %r0_mask = arith.cmpi slt, %r0_base_7, %cst_5 : tensor<1x4096xi32> loc(#loc74)
+    %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc75)
+    %tmp0_8 = tt.splat %tmp0 : i32 -> tensor<1x4096xi32> loc(#loc115)
+    %tmp0_9 = arith.addi %r0_base_7, %tmp0_8 : tensor<1x4096xi32> loc(#loc76)
+    %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc77)
+    %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc77)
+    %tmp0_12 = tt.splat %xmask_6 : i1 -> tensor<1x4096xi1> loc(#loc116)
+    %tmp0_13 = arith.andi %r0_mask, %tmp0_12 : tensor<1x4096xi1> loc(#loc78)
+    %tmp0_14 = tt.load %tmp0_11, %tmp0_13, %cst_1 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>> loc(#loc79)
+    %tmp0_15 = arith.extf %tmp0_14 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc80)
+    %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc81)
+    %tmp1_16 = tt.addptr %tmp1, %r0_base_7 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc81)
+    %tmp1_17 = tt.load %tmp1_16, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc82)
+    %tmp1_18 = arith.extf %tmp1_17 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc83)
+    %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc84)
+    %tmp2_19 = tt.addptr %tmp2, %tmp0_9 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc84)
+    %tmp2_20 = tt.load %tmp2_19, %tmp0_13, %cst_1 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>> loc(#loc85)
+    %tmp2_21 = arith.extf %tmp2_20 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc86)
+    %tmp3 = arith.mulf %tmp1_18, %tmp2_21 : tensor<1x4096xf32> loc(#loc87)
+    %tmp4 = arith.addf %tmp0_15, %tmp3 : tensor<1x4096xf32> loc(#loc88)
+    %tmp7_mean = arith.select %tmp0_13, %tmp4, %cst_0 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc89)
+    %tmp7_weight = arith.select %tmp0_13, %cst_2, %cst_0 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc90)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc24)
+    %1 = tt.addptr %0, %tmp0_9 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc24)
+    %2 = arith.truncf %tmp4 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc25)
+    tt.store %1, %2, %tmp0_13 : tensor<1x4096x!tt.ptr<bf16>> loc(#loc25)
+    %3:3 = "tt.reduce"(%tmp7_mean, %cst_0, %tmp7_weight) <{axis = 1 : i32}> ({
+    ^bb0(%arg9: f32 loc(callsite(#loc1 at #loc3)), %arg10: f32 loc(callsite(#loc1 at #loc3)), %arg11: f32 loc(callsite(#loc1 at #loc3)), %arg12: f32 loc(callsite(#loc1 at #loc3)), %arg13: f32 loc(callsite(#loc1 at #loc3)), %arg14: f32 loc(callsite(#loc1 at #loc3))):
+      %delta = arith.subf %arg12, %arg9 : f32 loc(#loc117)
+      %new_weight = arith.addf %arg11, %arg14 : f32 loc(#loc118)
+      %w2_over_w = arith.cmpf oeq, %new_weight, %cst : f32 loc(#loc119)
+      %w2_over_w_31 = arith.divf %arg14, %new_weight : f32 loc(#loc120)
+      %w2_over_w_32 = arith.select %w2_over_w, %cst, %w2_over_w_31 : f32 loc(#loc121)
+      %7 = arith.mulf %delta, %w2_over_w_32 : f32 loc(#loc122)
+      %8 = arith.addf %arg9, %7 : f32 loc(#loc123)
+      %9 = arith.addf %arg10, %arg13 : f32 loc(#loc124)
+      %10 = arith.mulf %delta, %delta : f32 loc(#loc125)
+      %11 = arith.mulf %10, %arg11 : f32 loc(#loc126)
+      %12 = arith.mulf %11, %w2_over_w_32 : f32 loc(#loc127)
+      %13 = arith.addf %9, %12 : f32 loc(#loc128)
+      tt.reduce.return %8, %13, %new_weight : f32, f32, f32 loc(#loc91)
+    }) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc91)
+    %tmp7 = tt.expand_dims %3#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc97)
+    %tmp11 = tt.expand_dims %3#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc98)
+    %tmp13 = tt.load %1, %tmp0_13, %cst_1 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>> loc(#loc99)
+    %tmp13_22 = arith.extf %tmp13 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc100)
+    %tmp23 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc101)
+    %tmp23_23 = tt.addptr %tmp23, %r0_base_7 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc101)
+    %tmp23_24 = tt.load %tmp23_23, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc102)
+    %tmp23_25 = arith.extf %tmp23_24 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc103)
+    %tmp27 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc104)
+    %tmp27_26 = tt.addptr %tmp27, %r0_base_7 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc104)
+    %tmp27_27 = tt.load %tmp27_26, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc105)
+    %tmp27_28 = arith.extf %tmp27_27 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc106)
+    %tmp15 = tt.broadcast %tmp7 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc107)
+    %tmp15_29 = arith.subf %tmp13_22, %tmp15 : tensor<1x4096xf32> loc(#loc107)
+    %tmp17 = arith.divf %tmp11, %cst_4 : tensor<1x1xf32> loc(#loc108)
+    %tmp19 = arith.addf %tmp17, %cst_3 : tensor<1x1xf32> loc(#loc109)
+    %tmp20 = tt.extern_elementwise %tmp19 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc110)
+    %tmp21 = tt.broadcast %tmp20 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc111)
+    %tmp21_30 = arith.mulf %tmp15_29, %tmp21 : tensor<1x4096xf32> loc(#loc111)
+    %tmp25 = arith.addf %tmp23_25, %cst_2 : tensor<1x4096xf32> loc(#loc112)
+    %tmp26 = arith.mulf %tmp21_30, %tmp25 : tensor<1x4096xf32> loc(#loc113)
+    %tmp28 = arith.addf %tmp26, %tmp27_28 : tensor<1x4096xf32> loc(#loc114)
+    %4 = tt.splat %out_ptr3 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc57)
+    %5 = tt.addptr %4, %tmp0_9 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc57)
+    %6 = arith.truncf %tmp28 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc58)
+    tt.store %5, %6, %tmp0_13 : tensor<1x4096x!tt.ptr<bf16>> loc(#loc58)
+    tt.return loc(#loc59)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":25:21)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":23:28)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":26:27)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":34:29)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:41)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:34)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:61)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:51)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":38:113)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:34)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:41)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":39:94)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:34)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:51)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":40:113)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":41:22)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":42:22)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":48:62)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":50:66)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:29)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":51:52)
+#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc28 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc29 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc30 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc31 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc32 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc35 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc37 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc38 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":53:16)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":54:17)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:53)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":62:115)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:35)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:42)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":63:95)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:35)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:42)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":64:95)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":66:24)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":68:25)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":70:24)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":71:32)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":72:24)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":75:24)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":76:24)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":77:24)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:29)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":78:53)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/w3/cw3jbiy5zrkym7vknn3ziuk3qimlbchin2bbz3suf6qxijnbfc3y.py":56:4)
+#loc69 = loc("xmask"(#loc2))
+#loc71 = loc("xoffset"(#loc4))
+#loc72 = loc("r0_base"(#loc5))
+#loc73 = loc("r0_base"(#loc6))
+#loc74 = loc("r0_mask"(#loc7))
+#loc75 = loc("tmp0"(#loc8))
+#loc76 = loc("tmp0"(#loc9))
+#loc77 = loc("tmp0"(#loc10))
+#loc78 = loc("tmp0"(#loc11))
+#loc79 = loc("tmp0"(#loc12))
+#loc80 = loc("tmp0"(#loc13))
+#loc81 = loc("tmp1"(#loc14))
+#loc82 = loc("tmp1"(#loc15))
+#loc83 = loc("tmp1"(#loc16))
+#loc84 = loc("tmp2"(#loc17))
+#loc85 = loc("tmp2"(#loc18))
+#loc86 = loc("tmp2"(#loc19))
+#loc87 = loc("tmp3"(#loc20))
+#loc88 = loc("tmp4"(#loc21))
+#loc89 = loc("tmp7_mean"(#loc22))
+#loc90 = loc("tmp7_weight"(#loc23))
+#loc91 = loc(callsite(#loc26 at #loc3))
+#loc92 = loc("delta"(#loc27))
+#loc93 = loc("new_weight"(#loc28))
+#loc94 = loc("w2_over_w"(#loc29))
+#loc95 = loc("w2_over_w"(#loc30))
+#loc96 = loc("w2_over_w"(#loc31))
+#loc97 = loc("tmp7"(#loc39))
+#loc98 = loc("tmp11"(#loc40))
+#loc99 = loc("tmp13"(#loc41))
+#loc100 = loc("tmp13"(#loc42))
+#loc101 = loc("tmp23"(#loc43))
+#loc102 = loc("tmp23"(#loc44))
+#loc103 = loc("tmp23"(#loc45))
+#loc104 = loc("tmp27"(#loc46))
+#loc105 = loc("tmp27"(#loc47))
+#loc106 = loc("tmp27"(#loc48))
+#loc107 = loc("tmp15"(#loc49))
+#loc108 = loc("tmp17"(#loc50))
+#loc109 = loc("tmp19"(#loc51))
+#loc110 = loc("tmp20"(#loc52))
+#loc111 = loc("tmp21"(#loc53))
+#loc112 = loc("tmp25"(#loc54))
+#loc113 = loc("tmp26"(#loc55))
+#loc114 = loc("tmp28"(#loc56))
+#loc115 = loc(fused[#loc76, #loc75])
+#loc116 = loc(fused[#loc78, #loc69])
+#loc117 = loc(callsite(#loc92 at #loc91))
+#loc118 = loc(callsite(#loc93 at #loc91))
+#loc119 = loc(callsite(#loc94 at #loc91))
+#loc120 = loc(callsite(#loc95 at #loc91))
+#loc121 = loc(callsite(#loc96 at #loc91))
+#loc122 = loc(callsite(#loc32 at #loc91))
+#loc123 = loc(callsite(#loc33 at #loc91))
+#loc124 = loc(callsite(#loc34 at #loc91))
+#loc125 = loc(callsite(#loc35 at #loc91))
+#loc126 = loc(callsite(#loc36 at #loc91))
+#loc127 = loc(callsite(#loc37 at #loc91))
+#loc128 = loc(callsite(#loc38 at #loc91))
diff --git a/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/__grp__triton_poi_fused_mul_silu_split_0.json b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/__grp__triton_poi_fused_mul_silu_split_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..7267cdd791bac8f1790e5b0f1b90435656064c76
--- /dev/null
+++ b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/__grp__triton_poi_fused_mul_silu_split_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused_mul_silu_split_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.source", "triton_poi_fused_mul_silu_split_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.ttir", "triton_poi_fused_mul_silu_split_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.ttgir", "triton_poi_fused_mul_silu_split_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.llir", "triton_poi_fused_mul_silu_split_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.ptx", "triton_poi_fused_mul_silu_split_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.cubin", "triton_poi_fused_mul_silu_split_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.json"}}
\ No newline at end of file
diff --git a/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.cubin b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..fad16e1c39326b809a398992751ab5f0a0b2c6ba
Binary files /dev/null and b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.cubin differ
diff --git a/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.json b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..a181dfc740948536c72c45914f1f2c2984a88b7c
--- /dev/null
+++ b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.json
@@ -0,0 +1 @@
+{"hash": "99eec3c84593e6b9bfc9daf2e57266e2a78847f098b6fd12ee0e3ec198f2ad8a", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_mul_silu_split_0"}
\ No newline at end of file
diff --git a/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.llir b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..8f13a69bdc963657c9c20ac9a05660641d7244cf
--- /dev/null
+++ b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.llir
@@ -0,0 +1,102 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused_mul_silu_split_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 {
+  %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %7 = shl i32 %6, 9, !dbg !8
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %9 = shl nuw nsw i32 %8, 1, !dbg !9
+  %10 = and i32 %9, 510, !dbg !9
+  %11 = or disjoint i32 %10, %7, !dbg !10
+  %12 = srem i32 %11, 12288, !dbg !11
+  %13 = sub nsw i32 %11, %12, !dbg !11
+  %14 = add i32 %13, %11, !dbg !11
+  %15 = sext i32 %14 to i64, !dbg !12
+  %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !12
+  %17 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %16) #3, !dbg !13
+  %18 = bitcast i32 %17 to <2 x bfloat>, !dbg !13
+  %19 = add i32 %14, 12288, !dbg !14
+  %20 = sext i32 %19 to i64, !dbg !15
+  %21 = getelementptr bfloat, ptr addrspace(1) %0, i64 %20, !dbg !15
+  %22 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %21) #3, !dbg !16
+  %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !16
+  %24 = sext i32 %11 to i64, !dbg !17
+  %25 = getelementptr bfloat, ptr addrspace(1) %1, i64 %24, !dbg !17
+  %26 = fpext <2 x bfloat> %18 to <2 x float>, !dbg !18
+  %27 = fpext <2 x bfloat> %23 to <2 x float>, !dbg !19
+  %28 = extractelement <2 x float> %26, i64 0, !dbg !20
+  %29 = fsub float 0.000000e+00, %28, !dbg !20
+  %30 = extractelement <2 x float> %26, i64 1, !dbg !20
+  %31 = fsub float 0.000000e+00, %30, !dbg !20
+  %32 = fmul float %29, 0x3FF7154760000000, !dbg !25
+  %33 = tail call float @llvm.nvvm.ex2.approx.f(float %32), !dbg !25
+  %34 = fmul float %31, 0x3FF7154760000000, !dbg !25
+  %35 = tail call float @llvm.nvvm.ex2.approx.f(float %34), !dbg !25
+  %36 = fadd float %33, 1.000000e+00, !dbg !26
+  %37 = fadd float %35, 1.000000e+00, !dbg !26
+  %38 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %36), !dbg !27
+  %39 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %37), !dbg !27
+  %40 = insertelement <2 x float> poison, float %38, i64 0, !dbg !28
+  %41 = insertelement <2 x float> %40, float %39, i64 1, !dbg !28
+  %42 = fmul <2 x float> %41, %26, !dbg !28
+  %43 = fmul <2 x float> %42, %27, !dbg !29
+  %44 = fptrunc <2 x float> %43 to <2 x bfloat>, !dbg !30
+  %45 = bitcast <2 x bfloat> %44 to i32, !dbg !30
+  tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %45, ptr addrspace(1) %25) #3, !dbg !30
+  ret void, !dbg !31
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.ex2.approx.f(float) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #2
+
+attributes #0 = { nounwind "nvvm.reqntid"="256" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #3 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused_mul_silu_split_0", linkageName: "triton_poi_fused_mul_silu_split_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 19, column: 28, scope: !4)
+!8 = !DILocation(line: 19, column: 33, scope: !4)
+!9 = !DILocation(line: 20, column: 36, scope: !4)
+!10 = !DILocation(line: 20, column: 23, scope: !4)
+!11 = !DILocation(line: 25, column: 35, scope: !4)
+!12 = !DILocation(line: 25, column: 30, scope: !4)
+!13 = !DILocation(line: 25, column: 46, scope: !4)
+!14 = !DILocation(line: 26, column: 43, scope: !4)
+!15 = !DILocation(line: 26, column: 30, scope: !4)
+!16 = !DILocation(line: 26, column: 54, scope: !4)
+!17 = !DILocation(line: 32, column: 25, scope: !4)
+!18 = !DILocation(line: 25, column: 55, scope: !4)
+!19 = !DILocation(line: 26, column: 63, scope: !4)
+!20 = !DILocation(line: 50, column: 30, scope: !21, inlinedAt: !23)
+!21 = distinct !DILexicalBlockFile(scope: !4, file: !22, discriminator: 0)
+!22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!23 = !DILocation(line: 28, column: 22, scope: !24)
+!24 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0)
+!25 = !DILocation(line: 50, column: 29, scope: !21, inlinedAt: !23)
+!26 = !DILocation(line: 50, column: 20, scope: !21, inlinedAt: !23)
+!27 = !DILocation(line: 50, column: 16, scope: !21, inlinedAt: !23)
+!28 = !DILocation(line: 29, column: 18, scope: !4)
+!29 = !DILocation(line: 31, column: 18, scope: !4)
+!30 = !DILocation(line: 32, column: 36, scope: !4)
+!31 = !DILocation(line: 32, column: 4, scope: !4)
diff --git a/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.ptx b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..dd0ad117afc743c8bd09e3d2c446131c72bb23ab
--- /dev/null
+++ b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.ptx
@@ -0,0 +1,437 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused_mul_silu_split_0 // -- Begin function triton_poi_fused_mul_silu_split_0
+                                        // @triton_poi_fused_mul_silu_split_0
+.visible .entry triton_poi_fused_mul_silu_split_0(
+	.param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_1,
+	.param .u32 triton_poi_fused_mul_silu_split_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_4
+)
+.reqntid 256
+{
+	.reg .b16 	%rs<5>;
+	.reg .b32 	%r<36>;
+	.reg .b64 	%rd<6>;
+	.loc	1 18 0                          // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd4, [triton_poi_fused_mul_silu_split_0_param_0];
+	ld.param.b64 	%rd5, [triton_poi_fused_mul_silu_split_0_param_1];
+$L__tmp0:
+	.loc	1 19 28                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:19:28
+	mov.u32 	%r4, %ctaid.x;
+	.loc	1 19 33                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:19:33
+	shl.b32 	%r5, %r4, 9;
+	.loc	1 20 36                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:20:36
+	mov.u32 	%r6, %tid.x;
+	shl.b32 	%r7, %r6, 1;
+	and.b32 	%r8, %r7, 510;
+	.loc	1 20 23                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:20:23
+	or.b32 	%r9, %r8, %r5;
+	.loc	1 25 35                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:25:35
+	mul.hi.s32 	%r10, %r9, 715827883;
+	shr.u32 	%r11, %r10, 31;
+	shr.u32 	%r12, %r10, 11;
+	add.s32 	%r13, %r12, %r11;
+	mad.lo.s32 	%r14, %r13, 12288, %r9;
+	.loc	1 25 30                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:25:30
+	mad.wide.s32 	%rd1, %r14, 2, %rd4;
+	.loc	1 25 46                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:25:46
+	// begin inline asm
+	mov.u32 %r1, 0x0;
+	ld.global.b32 { %r1 }, [ %rd1 + 0 ];
+	// end inline asm
+	.loc	1 26 43                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:26:43
+	add.s32 	%r15, %r14, 12288;
+	.loc	1 26 30                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:26:30
+	mad.wide.s32 	%rd2, %r15, 2, %rd4;
+	.loc	1 26 54                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:26:54
+	// begin inline asm
+	mov.u32 %r2, 0x0;
+	ld.global.b32 { %r2 }, [ %rd2 + 0 ];
+	// end inline asm
+	.loc	1 32 25                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:32:25
+	mad.wide.s32 	%rd3, %r9, 2, %rd5;
+	.loc	1 25 55                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:25:55
+	mov.b32 	{%rs1, %rs2}, %r1;
+	cvt.f32.bf16 	%r16, %rs2;
+	cvt.f32.bf16 	%r17, %rs1;
+	.loc	1 26 63                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:26:63
+	mov.b32 	{%rs3, %rs4}, %r2;
+	cvt.f32.bf16 	%r18, %rs4;
+	cvt.f32.bf16 	%r19, %rs3;
+	mov.b32 	%r20, 0f00000000;
+$L__tmp1:
+	.loc	2 50 30                         // standard.py:50:30 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ]
+	sub.f32 	%r21, %r20, %r17;
+	sub.f32 	%r22, %r20, %r16;
+	.loc	2 50 29                         // standard.py:50:29 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ]
+	mul.f32 	%r23, %r21, 0f3FB8AA3B;
+	ex2.approx.f32 	%r24, %r23;
+	mul.f32 	%r25, %r22, 0f3FB8AA3B;
+	ex2.approx.f32 	%r26, %r25;
+	.loc	2 50 20                         // standard.py:50:20 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ]
+	add.f32 	%r27, %r24, 0f3F800000;
+	add.f32 	%r28, %r26, 0f3F800000;
+	mov.b32 	%r29, 0f3F800000;
+	.loc	2 50 16                         // standard.py:50:16 @[ csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:28:22 ]
+	div.full.f32 	%r30, %r29, %r27;
+	div.full.f32 	%r31, %r29, %r28;
+$L__tmp2:
+	.loc	1 29 18                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:29:18
+	mul.f32 	%r32, %r31, %r16;
+	mul.f32 	%r33, %r30, %r17;
+	.loc	1 31 18                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:31:18
+	mul.f32 	%r34, %r33, %r19;
+	mul.f32 	%r35, %r32, %r18;
+	.loc	1 32 36                         // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:32:36
+	cvt.rn.bf16x2.f32 	%r3, %r35, %r34;
+	// begin inline asm
+	st.global.b32 [ %rd3 + 0 ], { %r3 };
+	// end inline asm
+	.loc	1 32 4                          // csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py:32:4
+	ret;
+$L__tmp3:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 307                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x12c DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 115
+.b8 121
+.b8 97
+.b8 101
+.b8 51
+.b8 111
+.b8 107
+.b8 50
+.b8 120
+.b8 110
+.b8 122
+.b8 117
+.b8 120
+.b8 104
+.b8 106
+.b8 107
+.b8 120
+.b8 122
+.b8 104
+.b8 100
+.b8 99
+.b8 112
+.b8 99
+.b8 122
+.b8 54
+.b8 106
+.b8 99
+.b8 107
+.b8 99
+.b8 117
+.b8 51
+.b8 118
+.b8 118
+.b8 55
+.b8 101
+.b8 113
+.b8 98
+.b8 51
+.b8 112
+.b8 101
+.b8 119
+.b8 104
+.b8 114
+.b8 118
+.b8 113
+.b8 109
+.b8 105
+.b8 101
+.b8 114
+.b8 103
+.b8 102
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 115
+.b8 121
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x24 DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 112
+.b8 111
+.b8 105
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 115
+.b8 105
+.b8 108
+.b8 117
+.b8 95
+.b8 115
+.b8 112
+.b8 108
+.b8 105
+.b8 116
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x108:0x2e DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x11d:0x18 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp2                           // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 28                                  // DW_AT_call_line
+.b8 22                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.source b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..7038e3a31164d9b44cbd605968faab87006025a0
--- /dev/null
+++ b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.source
@@ -0,0 +1,126 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":18:0)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":49:0)
+#loc32 = loc("in_ptr0"(#loc))
+#loc33 = loc("out_ptr0"(#loc))
+#loc34 = loc("xnumel"(#loc))
+#loc56 = loc("x"(#loc25))
+module {
+  tt.func public @triton_poi_fused_mul_silu_split_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xoffset = tt.get_program_id x : i32 loc(#loc35)
+    %xoffset_0 = arith.constant 512 : i32 loc(#loc36)
+    %xoffset_1 = arith.constant 512 : i32 loc(#loc36)
+    %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc36)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc37)
+    %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<512xi32> loc(#loc38)
+    %xindex_4 = arith.addi %xindex_3, %xindex : tensor<512xi32> loc(#loc38)
+    %xmask = arith.constant true loc(#loc39)
+    %xmask_5 = arith.constant dense<true> : tensor<512xi1> loc(#loc39)
+    %x0 = arith.constant 12288 : i32 loc(#loc40)
+    %x0_6 = arith.constant 12288 : i32 loc(#loc40)
+    %x0_7 = arith.constant dense<12288> : tensor<512xi32> loc(#loc40)
+    %x0_8 = arith.remsi %xindex_4, %x0_7 : tensor<512xi32> loc(#loc40)
+    %x1 = arith.constant 12288 : i32 loc(#loc41)
+    %x1_9 = arith.constant 12288 : i32 loc(#loc41)
+    %x1_10 = arith.constant dense<12288> : tensor<512xi32> loc(#loc41)
+    %x1_11 = arith.divsi %xindex_4, %x1_10 : tensor<512xi32> loc(#loc41)
+    %tmp0 = arith.constant 24576 : i32 loc(#loc42)
+    %tmp0_12 = arith.constant 24576 : i32 loc(#loc42)
+    %tmp0_13 = arith.constant dense<24576> : tensor<512xi32> loc(#loc42)
+    %tmp0_14 = arith.muli %tmp0_13, %x1_11 : tensor<512xi32> loc(#loc42)
+    %tmp0_15 = arith.addi %x0_8, %tmp0_14 : tensor<512xi32> loc(#loc43)
+    %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc44)
+    %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc44)
+    %tmp0_18 = tt.load %tmp0_17 : tensor<512x!tt.ptr<bf16>> loc(#loc45)
+    %tmp0_19 = arith.extf %tmp0_18 : tensor<512xbf16> to tensor<512xf32> loc(#loc46)
+    %tmp5 = arith.constant 12288 : i32 loc(#loc47)
+    %tmp5_20 = arith.constant 12288 : i32 loc(#loc47)
+    %tmp5_21 = arith.constant dense<12288> : tensor<512xi32> loc(#loc47)
+    %tmp5_22 = arith.addi %tmp5_21, %x0_8 : tensor<512xi32> loc(#loc47)
+    %tmp5_23 = arith.constant 24576 : i32 loc(#loc48)
+    %tmp5_24 = arith.constant 24576 : i32 loc(#loc48)
+    %tmp5_25 = arith.constant dense<24576> : tensor<512xi32> loc(#loc48)
+    %tmp5_26 = arith.muli %tmp5_25, %x1_11 : tensor<512xi32> loc(#loc48)
+    %tmp5_27 = arith.addi %tmp5_22, %tmp5_26 : tensor<512xi32> loc(#loc49)
+    %tmp5_28 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc50)
+    %tmp5_29 = tt.addptr %tmp5_28, %tmp5_27 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc50)
+    %tmp5_30 = tt.load %tmp5_29 : tensor<512x!tt.ptr<bf16>> loc(#loc51)
+    %tmp5_31 = arith.extf %tmp5_30 : tensor<512xbf16> to tensor<512xf32> loc(#loc52)
+    %tmp2 = tt.call @triton.language.standard.sigmoid__fp32S512S__(%tmp0_19) : (tensor<512xf32>) -> tensor<512xf32> loc(#loc53)
+    %tmp3 = arith.mulf %tmp0_19, %tmp2 : tensor<512xf32> loc(#loc54)
+    %tmp6 = arith.mulf %tmp3, %tmp5_31 : tensor<512xf32> loc(#loc55)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc22)
+    %1 = tt.addptr %0, %xindex_4 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc22)
+    %2 = arith.truncf %tmp6 : tensor<512xf32> to tensor<512xbf16> loc(#loc23)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>> loc(#loc23)
+    tt.return loc(#loc24)
+  } loc(#loc)
+  tt.func private @triton.language.standard.sigmoid__fp32S512S__(%x: tensor<512xf32> loc("x"(#loc25))) -> tensor<512xf32> attributes {noinline = false} {
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc26)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc26)
+    %0 = arith.subf %cst_0, %x : tensor<512xf32> loc(#loc26)
+    %1 = math.exp %0 : tensor<512xf32> loc(#loc27)
+    %c1_i32 = arith.constant 1 : i32 loc(#loc28)
+    %cst_1 = arith.constant 1.000000e+00 : f32 loc(#loc28)
+    %cst_2 = arith.constant dense<1.000000e+00> : tensor<512xf32> loc(#loc28)
+    %2 = arith.addf %cst_2, %1 : tensor<512xf32> loc(#loc28)
+    %c1_i32_3 = arith.constant 1 : i32 loc(#loc29)
+    %cst_4 = arith.constant 1.000000e+00 : f32 loc(#loc29)
+    %cst_5 = arith.constant dense<1.000000e+00> : tensor<512xf32> loc(#loc29)
+    %3 = arith.divf %cst_5, %2 : tensor<512xf32> loc(#loc29)
+    tt.return %3 : tensor<512xf32> loc(#loc30)
+  ^bb1:  // no predecessors
+    %4 = ub.poison : tensor<512xf32> loc(#loc31)
+    tt.return %4 : tensor<512xf32> loc(#loc31)
+  } loc(#loc25)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":19:28)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":19:33)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":20:36)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":20:23)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":21:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":22:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":23:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:41)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:35)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:30)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:46)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:55)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:38)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:49)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:43)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:30)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:54)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:63)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":28:22)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":29:18)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":31:18)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:25)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:36)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:4)
+#loc26 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30)
+#loc27 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29)
+#loc28 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20)
+#loc29 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16)
+#loc30 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:11)
+#loc31 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:4)
+#loc35 = loc("xoffset"(#loc1))
+#loc36 = loc("xoffset"(#loc2))
+#loc37 = loc("xindex"(#loc3))
+#loc38 = loc("xindex"(#loc4))
+#loc39 = loc("xmask"(#loc5))
+#loc40 = loc("x0"(#loc6))
+#loc41 = loc("x1"(#loc7))
+#loc42 = loc("tmp0"(#loc8))
+#loc43 = loc("tmp0"(#loc9))
+#loc44 = loc("tmp0"(#loc10))
+#loc45 = loc("tmp0"(#loc11))
+#loc46 = loc("tmp0"(#loc12))
+#loc47 = loc("tmp5"(#loc13))
+#loc48 = loc("tmp5"(#loc14))
+#loc49 = loc("tmp5"(#loc15))
+#loc50 = loc("tmp5"(#loc16))
+#loc51 = loc("tmp5"(#loc17))
+#loc52 = loc("tmp5"(#loc18))
+#loc53 = loc("tmp2"(#loc19))
+#loc54 = loc("tmp3"(#loc20))
+#loc55 = loc("tmp6"(#loc21))
diff --git a/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.ttgir b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..a4aae66c87b9169498979273e3a61914413dbf3a
--- /dev/null
+++ b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.ttgir
@@ -0,0 +1,93 @@
+#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":18:0)
+#loc28 = loc("in_ptr0"(#loc))
+#loc29 = loc("out_ptr0"(#loc))
+#loc30 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused_mul_silu_split_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<24576> : tensor<512xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<12288> : tensor<512xi32, #blocked> loc(#loc1)
+    %c512_i32 = arith.constant 512 : i32 loc(#loc1)
+    %cst_1 = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<1.000000e+00> : tensor<512xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc31)
+    %xoffset_3 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc32)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc33)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32, #blocked> loc(#loc34)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32, #blocked> loc(#loc34)
+    %x0 = arith.remsi %xindex_5, %cst_0 : tensor<512xi32, #blocked> loc(#loc35)
+    %x1 = arith.divsi %xindex_5, %cst_0 : tensor<512xi32, #blocked> loc(#loc36)
+    %tmp0 = arith.muli %x1, %cst : tensor<512xi32, #blocked> loc(#loc37)
+    %tmp0_6 = arith.addi %x0, %tmp0 : tensor<512xi32, #blocked> loc(#loc38)
+    %tmp0_7 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc39)
+    %tmp0_8 = tt.addptr %tmp0_7, %tmp0_6 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc39)
+    %tmp0_9 = tt.load %tmp0_8 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc40)
+    %tmp0_10 = arith.extf %tmp0_9 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc41)
+    %tmp5 = arith.addi %x0, %cst_0 : tensor<512xi32, #blocked> loc(#loc42)
+    %tmp5_11 = arith.addi %tmp5, %tmp0 : tensor<512xi32, #blocked> loc(#loc43)
+    %tmp5_12 = tt.addptr %tmp0_7, %tmp5_11 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc44)
+    %tmp5_13 = tt.load %tmp5_12 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc45)
+    %tmp5_14 = arith.extf %tmp5_13 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc46)
+    %tmp2 = arith.subf %cst_1, %tmp0_10 : tensor<512xf32, #blocked> loc(#loc50)
+    %tmp2_15 = math.exp %tmp2 : tensor<512xf32, #blocked> loc(#loc51)
+    %tmp2_16 = arith.addf %tmp2_15, %cst_2 : tensor<512xf32, #blocked> loc(#loc52)
+    %tmp2_17 = arith.divf %cst_2, %tmp2_16 : tensor<512xf32, #blocked> loc(#loc53)
+    %tmp3 = arith.mulf %tmp0_10, %tmp2_17 : tensor<512xf32, #blocked> loc(#loc48)
+    %tmp6 = arith.mulf %tmp3, %tmp5_14 : tensor<512xf32, #blocked> loc(#loc49)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc25)
+    %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc25)
+    %2 = arith.truncf %tmp6 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> loc(#loc26)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc26)
+    tt.return loc(#loc27)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":19:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":19:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":20:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":20:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":22:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":23:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:41)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:35)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:30)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:46)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:55)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:38)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:30)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:54)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:63)
+#loc18 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":28:22)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29)
+#loc21 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":29:18)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":31:18)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:25)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:36)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:4)
+#loc31 = loc("xoffset"(#loc2))
+#loc32 = loc("xoffset"(#loc3))
+#loc33 = loc("xindex"(#loc4))
+#loc34 = loc("xindex"(#loc5))
+#loc35 = loc("x0"(#loc6))
+#loc36 = loc("x1"(#loc7))
+#loc37 = loc("tmp0"(#loc8))
+#loc38 = loc("tmp0"(#loc9))
+#loc39 = loc("tmp0"(#loc10))
+#loc40 = loc("tmp0"(#loc11))
+#loc41 = loc("tmp0"(#loc12))
+#loc42 = loc("tmp5"(#loc13))
+#loc43 = loc("tmp5"(#loc14))
+#loc44 = loc("tmp5"(#loc15))
+#loc45 = loc("tmp5"(#loc16))
+#loc46 = loc("tmp5"(#loc17))
+#loc47 = loc("tmp2"(#loc19))
+#loc48 = loc("tmp3"(#loc23))
+#loc49 = loc("tmp6"(#loc24))
+#loc50 = loc(callsite(#loc18 at #loc47))
+#loc51 = loc(callsite(#loc20 at #loc47))
+#loc52 = loc(callsite(#loc21 at #loc47))
+#loc53 = loc(callsite(#loc22 at #loc47))
diff --git a/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.ttir b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..c4d7cd88e543358d1548392697db257fdae6edf7
--- /dev/null
+++ b/triton/THXMHSCFSPTLTP6J3LZOK4TG4KTYQR7QTC3P2EXOBY7MDGHSVWFA/triton_poi_fused_mul_silu_split_0.ttir
@@ -0,0 +1,93 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":18:0)
+#loc28 = loc("in_ptr0"(#loc))
+#loc29 = loc("out_ptr0"(#loc))
+#loc30 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_mul_silu_split_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %tmp2 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc50)
+    %tmp2_0 = arith.constant dense<1.000000e+00> : tensor<512xf32> loc(#loc51)
+    %cst = arith.constant dense<24576> : tensor<512xi32> loc(#loc3)
+    %cst_1 = arith.constant dense<12288> : tensor<512xi32> loc(#loc3)
+    %c512_i32 = arith.constant 512 : i32 loc(#loc3)
+    %xoffset = tt.get_program_id x : i32 loc(#loc32)
+    %xoffset_2 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc33)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc34)
+    %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<512xi32> loc(#loc35)
+    %xindex_4 = arith.addi %xindex_3, %xindex : tensor<512xi32> loc(#loc35)
+    %x0 = arith.remsi %xindex_4, %cst_1 : tensor<512xi32> loc(#loc36)
+    %x1 = arith.divsi %xindex_4, %cst_1 : tensor<512xi32> loc(#loc37)
+    %tmp0 = arith.muli %x1, %cst : tensor<512xi32> loc(#loc38)
+    %tmp0_5 = arith.addi %x0, %tmp0 : tensor<512xi32> loc(#loc39)
+    %tmp0_6 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc40)
+    %tmp0_7 = tt.addptr %tmp0_6, %tmp0_5 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc40)
+    %tmp0_8 = tt.load %tmp0_7 : tensor<512x!tt.ptr<bf16>> loc(#loc41)
+    %tmp0_9 = arith.extf %tmp0_8 : tensor<512xbf16> to tensor<512xf32> loc(#loc42)
+    %tmp5 = arith.addi %x0, %cst_1 : tensor<512xi32> loc(#loc43)
+    %tmp5_10 = arith.addi %tmp5, %tmp0 : tensor<512xi32> loc(#loc44)
+    %tmp5_11 = tt.addptr %tmp0_6, %tmp5_10 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc45)
+    %tmp5_12 = tt.load %tmp5_11 : tensor<512x!tt.ptr<bf16>> loc(#loc46)
+    %tmp5_13 = arith.extf %tmp5_12 : tensor<512xbf16> to tensor<512xf32> loc(#loc47)
+    %tmp2_14 = arith.subf %tmp2, %tmp0_9 : tensor<512xf32> loc(#loc50)
+    %tmp2_15 = math.exp %tmp2_14 : tensor<512xf32> loc(#loc52)
+    %tmp2_16 = arith.addf %tmp2_15, %tmp2_0 : tensor<512xf32> loc(#loc53)
+    %tmp2_17 = arith.divf %tmp2_0, %tmp2_16 : tensor<512xf32> loc(#loc54)
+    %tmp3 = arith.mulf %tmp0_9, %tmp2_17 : tensor<512xf32> loc(#loc48)
+    %tmp6 = arith.mulf %tmp3, %tmp5_13 : tensor<512xf32> loc(#loc49)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc25)
+    %1 = tt.addptr %0, %xindex_4 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc25)
+    %2 = arith.truncf %tmp6 : tensor<512xf32> to tensor<512xbf16> loc(#loc26)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>> loc(#loc26)
+    tt.return loc(#loc27)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":28:22)
+#loc3 = loc(unknown)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":19:28)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":19:33)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":20:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":20:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":22:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":23:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:41)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:30)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:46)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":25:55)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:38)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:43)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:30)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:54)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":26:63)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29)
+#loc21 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":29:18)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":31:18)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:25)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:36)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/sy/csyae3ok2xnzuxhjkxzhdcpcz6jckcu3vv7eqb3pewhrvqmiergf.py":32:4)
+#loc31 = loc("tmp2"(#loc2))
+#loc32 = loc("xoffset"(#loc4))
+#loc33 = loc("xoffset"(#loc5))
+#loc34 = loc("xindex"(#loc6))
+#loc35 = loc("xindex"(#loc7))
+#loc36 = loc("x0"(#loc8))
+#loc37 = loc("x1"(#loc9))
+#loc38 = loc("tmp0"(#loc10))
+#loc39 = loc("tmp0"(#loc11))
+#loc40 = loc("tmp0"(#loc12))
+#loc41 = loc("tmp0"(#loc13))
+#loc42 = loc("tmp0"(#loc14))
+#loc43 = loc("tmp5"(#loc15))
+#loc44 = loc("tmp5"(#loc16))
+#loc45 = loc("tmp5"(#loc17))
+#loc46 = loc("tmp5"(#loc18))
+#loc47 = loc("tmp5"(#loc19))
+#loc48 = loc("tmp3"(#loc23))
+#loc49 = loc("tmp6"(#loc24))
+#loc50 = loc(callsite(#loc1 at #loc31))
+#loc51 = loc(callsite(#loc3 at #loc31))
+#loc52 = loc(callsite(#loc20 at #loc31))
+#loc53 = loc(callsite(#loc21 at #loc31))
+#loc54 = loc(callsite(#loc22 at #loc31))
diff --git a/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/__grp__triton_red_fused_add_mul_native_layer_norm_0.json b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/__grp__triton_red_fused_add_mul_native_layer_norm_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..ec772c40de90fa1ff7b8ed1856d34ade161c06e7
--- /dev/null
+++ b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/__grp__triton_red_fused_add_mul_native_layer_norm_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused_add_mul_native_layer_norm_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.source", "triton_red_fused_add_mul_native_layer_norm_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.ttir", "triton_red_fused_add_mul_native_layer_norm_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.ttgir", "triton_red_fused_add_mul_native_layer_norm_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.llir", "triton_red_fused_add_mul_native_layer_norm_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.ptx", "triton_red_fused_add_mul_native_layer_norm_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.cubin", "triton_red_fused_add_mul_native_layer_norm_0.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.json"}}
\ No newline at end of file
diff --git a/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.cubin b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..af6326acd5c3b4f2ec17c9c4db57a063c7f53228
Binary files /dev/null and b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.cubin differ
diff --git a/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.json b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..414f10fe396652097f7085a02939c38c196c6d83
--- /dev/null
+++ b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.json
@@ -0,0 +1 @@
+{"hash": "a6b1c7d709b164ac8e9c059c5e1dd878f395ed859c1689a40ef693f2a54b22fe", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 192, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_add_mul_native_layer_norm_0"}
\ No newline at end of file
diff --git a/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.llir b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..dadb9701697cfc57034ffcadb05ab6b8cb284d32
--- /dev/null
+++ b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.llir
@@ -0,0 +1,547 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external addrspace(3) global [0 x i8], align 16
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused_add_mul_native_layer_norm_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !5 {
+__nv_rsqrtf.exit:
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8
+  %9 = icmp samesign ult i32 %8, 2304, !dbg !9
+  %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %11 = and i32 %10, 511, !dbg !10
+  %12 = and i32 %10, 31, !dbg !10
+  %13 = lshr i32 %11, 5, !dbg !10
+  %14 = shl nuw nsw i32 %10, 3, !dbg !10
+  %15 = and i32 %14, 4088, !dbg !10
+  %16 = shl i32 %8, 12, !dbg !11
+  %17 = or disjoint i32 %15, %16, !dbg !12
+  %18 = sext i32 %17 to i64, !dbg !13
+  %19 = getelementptr bfloat, ptr addrspace(1) %0, i64 %18, !dbg !13
+  %20 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14
+  %21 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %19, i64 %20, i1 %9) #6, !dbg !14
+  %22 = extractvalue { i32, i32, i32, i32 } %21, 0, !dbg !14
+  %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !14
+  %24 = extractvalue { i32, i32, i32, i32 } %21, 1, !dbg !14
+  %25 = bitcast i32 %24 to <2 x bfloat>, !dbg !14
+  %26 = extractvalue { i32, i32, i32, i32 } %21, 2, !dbg !14
+  %27 = bitcast i32 %26 to <2 x bfloat>, !dbg !14
+  %28 = extractvalue { i32, i32, i32, i32 } %21, 3, !dbg !14
+  %29 = bitcast i32 %28 to <2 x bfloat>, !dbg !14
+  %30 = extractelement <2 x bfloat> %23, i64 0, !dbg !14
+  %31 = extractelement <2 x bfloat> %23, i64 1, !dbg !14
+  %32 = extractelement <2 x bfloat> %25, i64 0, !dbg !14
+  %33 = extractelement <2 x bfloat> %25, i64 1, !dbg !14
+  %34 = extractelement <2 x bfloat> %27, i64 0, !dbg !14
+  %35 = extractelement <2 x bfloat> %27, i64 1, !dbg !14
+  %36 = extractelement <2 x bfloat> %29, i64 0, !dbg !14
+  %37 = extractelement <2 x bfloat> %29, i64 1, !dbg !14
+  %38 = fpext bfloat %30 to float, !dbg !15
+  %39 = fpext bfloat %31 to float, !dbg !15
+  %40 = fpext bfloat %32 to float, !dbg !15
+  %41 = fpext bfloat %33 to float, !dbg !15
+  %42 = fpext bfloat %34 to float, !dbg !15
+  %43 = fpext bfloat %35 to float, !dbg !15
+  %44 = fpext bfloat %36 to float, !dbg !15
+  %45 = fpext bfloat %37 to float, !dbg !15
+  %46 = select i1 %9, float %38, float 0.000000e+00, !dbg !16
+  %47 = select i1 %9, float %39, float 0.000000e+00, !dbg !16
+  %48 = select i1 %9, float %40, float 0.000000e+00, !dbg !16
+  %49 = select i1 %9, float %41, float 0.000000e+00, !dbg !16
+  %50 = select i1 %9, float %42, float 0.000000e+00, !dbg !16
+  %51 = select i1 %9, float %43, float 0.000000e+00, !dbg !16
+  %52 = select i1 %9, float %44, float 0.000000e+00, !dbg !16
+  %53 = select i1 %9, float %45, float 0.000000e+00, !dbg !16
+  %54 = select i1 %9, float 1.000000e+00, float 0.000000e+00, !dbg !17
+  %55 = fsub float %47, %46, !dbg !18
+  %56 = select i1 %9, float 2.000000e+00, float 0.000000e+00, !dbg !24
+  %57 = fcmp oeq float %56, 0.000000e+00, !dbg !25
+  %58 = tail call float @llvm.nvvm.div.full(float %54, float %56), !dbg !26
+  %59 = select i1 %57, float 0.000000e+00, float %58, !dbg !27
+  %60 = fmul float %59, %55, !dbg !28
+  %61 = fadd float %46, %60, !dbg !29
+  %62 = fmul float %55, %55, !dbg !30
+  %63 = fmul float %54, %62, !dbg !31
+  %64 = fmul float %59, %63, !dbg !32
+  %65 = fadd float %64, 0.000000e+00, !dbg !33
+  %66 = fsub float %48, %61, !dbg !18
+  %67 = select i1 %9, float 3.000000e+00, float 0.000000e+00, !dbg !24
+  %68 = fcmp oeq float %67, 0.000000e+00, !dbg !25
+  %69 = tail call float @llvm.nvvm.div.full(float %54, float %67), !dbg !26
+  %70 = select i1 %68, float 0.000000e+00, float %69, !dbg !27
+  %71 = fmul float %70, %66, !dbg !28
+  %72 = fadd float %61, %71, !dbg !29
+  %73 = fmul float %66, %66, !dbg !30
+  %74 = fmul float %56, %73, !dbg !31
+  %75 = fmul float %70, %74, !dbg !32
+  %76 = fadd float %65, %75, !dbg !33
+  %77 = fsub float %49, %72, !dbg !18
+  %78 = select i1 %9, float 4.000000e+00, float 0.000000e+00, !dbg !24
+  %79 = fcmp oeq float %78, 0.000000e+00, !dbg !25
+  %80 = tail call float @llvm.nvvm.div.full(float %54, float %78), !dbg !26
+  %81 = select i1 %79, float 0.000000e+00, float %80, !dbg !27
+  %82 = fmul float %81, %77, !dbg !28
+  %83 = fadd float %72, %82, !dbg !29
+  %84 = fmul float %77, %77, !dbg !30
+  %85 = fmul float %67, %84, !dbg !31
+  %86 = fmul float %81, %85, !dbg !32
+  %87 = fadd float %76, %86, !dbg !33
+  %88 = fsub float %50, %83, !dbg !18
+  %89 = select i1 %9, float 5.000000e+00, float 0.000000e+00, !dbg !24
+  %90 = fcmp oeq float %89, 0.000000e+00, !dbg !25
+  %91 = tail call float @llvm.nvvm.div.full(float %54, float %89), !dbg !26
+  %92 = select i1 %90, float 0.000000e+00, float %91, !dbg !27
+  %93 = fmul float %92, %88, !dbg !28
+  %94 = fadd float %83, %93, !dbg !29
+  %95 = fmul float %88, %88, !dbg !30
+  %96 = fmul float %78, %95, !dbg !31
+  %97 = fmul float %92, %96, !dbg !32
+  %98 = fadd float %87, %97, !dbg !33
+  %99 = fsub float %51, %94, !dbg !18
+  %100 = select i1 %9, float 6.000000e+00, float 0.000000e+00, !dbg !24
+  %101 = fcmp oeq float %100, 0.000000e+00, !dbg !25
+  %102 = tail call float @llvm.nvvm.div.full(float %54, float %100), !dbg !26
+  %103 = select i1 %101, float 0.000000e+00, float %102, !dbg !27
+  %104 = fmul float %103, %99, !dbg !28
+  %105 = fadd float %94, %104, !dbg !29
+  %106 = fmul float %99, %99, !dbg !30
+  %107 = fmul float %89, %106, !dbg !31
+  %108 = fmul float %103, %107, !dbg !32
+  %109 = fadd float %98, %108, !dbg !33
+  %110 = fsub float %52, %105, !dbg !18
+  %111 = select i1 %9, float 7.000000e+00, float 0.000000e+00, !dbg !24
+  %112 = fcmp oeq float %111, 0.000000e+00, !dbg !25
+  %113 = tail call float @llvm.nvvm.div.full(float %54, float %111), !dbg !26
+  %114 = select i1 %112, float 0.000000e+00, float %113, !dbg !27
+  %115 = fmul float %114, %110, !dbg !28
+  %116 = fadd float %105, %115, !dbg !29
+  %117 = fmul float %110, %110, !dbg !30
+  %118 = fmul float %100, %117, !dbg !31
+  %119 = fmul float %114, %118, !dbg !32
+  %120 = fadd float %109, %119, !dbg !33
+  %121 = fsub float %53, %116, !dbg !18
+  %122 = select i1 %9, float 8.000000e+00, float 0.000000e+00, !dbg !24
+  %123 = fcmp oeq float %122, 0.000000e+00, !dbg !25
+  %124 = tail call float @llvm.nvvm.div.full(float %54, float %122), !dbg !26
+  %125 = select i1 %123, float 0.000000e+00, float %124, !dbg !27
+  %126 = fmul float %125, %121, !dbg !28
+  %127 = fadd float %116, %126, !dbg !29
+  %128 = fmul float %121, %121, !dbg !30
+  %129 = fmul float %111, %128, !dbg !31
+  %130 = fmul float %125, %129, !dbg !32
+  %131 = fadd float %120, %130, !dbg !33
+  %132 = bitcast float %127 to i32, !dbg !21
+  %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 16, i32 31), !dbg !21
+  %134 = bitcast i32 %133 to float, !dbg !21
+  %135 = bitcast float %131 to i32, !dbg !21
+  %136 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %135, i32 16, i32 31), !dbg !21
+  %137 = bitcast i32 %136 to float, !dbg !21
+  %138 = bitcast float %122 to i32, !dbg !21
+  %139 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %138, i32 16, i32 31), !dbg !21
+  %140 = bitcast i32 %139 to float, !dbg !21
+  %141 = fsub float %134, %127, !dbg !18
+  %142 = fadd float %122, %140, !dbg !24
+  %143 = fcmp oeq float %142, 0.000000e+00, !dbg !25
+  %144 = tail call float @llvm.nvvm.div.full(float %140, float %142), !dbg !26
+  %145 = select i1 %143, float 0.000000e+00, float %144, !dbg !27
+  %146 = fmul float %145, %141, !dbg !28
+  %147 = fadd float %127, %146, !dbg !29
+  %148 = fadd float %131, %137, !dbg !34
+  %149 = fmul float %141, %141, !dbg !30
+  %150 = fmul float %122, %149, !dbg !31
+  %151 = fmul float %145, %150, !dbg !32
+  %152 = fadd float %148, %151, !dbg !33
+  %153 = bitcast float %147 to i32, !dbg !21
+  %154 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 8, i32 31), !dbg !21
+  %155 = bitcast i32 %154 to float, !dbg !21
+  %156 = bitcast float %152 to i32, !dbg !21
+  %157 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %156, i32 8, i32 31), !dbg !21
+  %158 = bitcast i32 %157 to float, !dbg !21
+  %159 = bitcast float %142 to i32, !dbg !21
+  %160 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %159, i32 8, i32 31), !dbg !21
+  %161 = bitcast i32 %160 to float, !dbg !21
+  %162 = fsub float %155, %147, !dbg !18
+  %163 = fadd float %142, %161, !dbg !24
+  %164 = fcmp oeq float %163, 0.000000e+00, !dbg !25
+  %165 = tail call float @llvm.nvvm.div.full(float %161, float %163), !dbg !26
+  %166 = select i1 %164, float 0.000000e+00, float %165, !dbg !27
+  %167 = fmul float %166, %162, !dbg !28
+  %168 = fadd float %147, %167, !dbg !29
+  %169 = fadd float %152, %158, !dbg !34
+  %170 = fmul float %162, %162, !dbg !30
+  %171 = fmul float %142, %170, !dbg !31
+  %172 = fmul float %166, %171, !dbg !32
+  %173 = fadd float %169, %172, !dbg !33
+  %174 = bitcast float %168 to i32, !dbg !21
+  %175 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %174, i32 4, i32 31), !dbg !21
+  %176 = bitcast i32 %175 to float, !dbg !21
+  %177 = bitcast float %173 to i32, !dbg !21
+  %178 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %177, i32 4, i32 31), !dbg !21
+  %179 = bitcast i32 %178 to float, !dbg !21
+  %180 = bitcast float %163 to i32, !dbg !21
+  %181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %180, i32 4, i32 31), !dbg !21
+  %182 = bitcast i32 %181 to float, !dbg !21
+  %183 = fsub float %176, %168, !dbg !18
+  %184 = fadd float %163, %182, !dbg !24
+  %185 = fcmp oeq float %184, 0.000000e+00, !dbg !25
+  %186 = tail call float @llvm.nvvm.div.full(float %182, float %184), !dbg !26
+  %187 = select i1 %185, float 0.000000e+00, float %186, !dbg !27
+  %188 = fmul float %187, %183, !dbg !28
+  %189 = fadd float %168, %188, !dbg !29
+  %190 = fadd float %173, %179, !dbg !34
+  %191 = fmul float %183, %183, !dbg !30
+  %192 = fmul float %163, %191, !dbg !31
+  %193 = fmul float %187, %192, !dbg !32
+  %194 = fadd float %190, %193, !dbg !33
+  %195 = bitcast float %189 to i32, !dbg !21
+  %196 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %195, i32 2, i32 31), !dbg !21
+  %197 = bitcast i32 %196 to float, !dbg !21
+  %198 = bitcast float %194 to i32, !dbg !21
+  %199 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %198, i32 2, i32 31), !dbg !21
+  %200 = bitcast i32 %199 to float, !dbg !21
+  %201 = bitcast float %184 to i32, !dbg !21
+  %202 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %201, i32 2, i32 31), !dbg !21
+  %203 = bitcast i32 %202 to float, !dbg !21
+  %204 = fsub float %197, %189, !dbg !18
+  %205 = fadd float %184, %203, !dbg !24
+  %206 = fcmp oeq float %205, 0.000000e+00, !dbg !25
+  %207 = tail call float @llvm.nvvm.div.full(float %203, float %205), !dbg !26
+  %208 = select i1 %206, float 0.000000e+00, float %207, !dbg !27
+  %209 = fmul float %208, %204, !dbg !28
+  %210 = fadd float %189, %209, !dbg !29
+  %211 = fadd float %194, %200, !dbg !34
+  %212 = fmul float %204, %204, !dbg !30
+  %213 = fmul float %184, %212, !dbg !31
+  %214 = fmul float %208, %213, !dbg !32
+  %215 = fadd float %211, %214, !dbg !33
+  %216 = bitcast float %210 to i32, !dbg !21
+  %217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 1, i32 31), !dbg !21
+  %218 = bitcast i32 %217 to float, !dbg !21
+  %219 = bitcast float %215 to i32, !dbg !21
+  %220 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %219, i32 1, i32 31), !dbg !21
+  %221 = bitcast i32 %220 to float, !dbg !21
+  %222 = bitcast float %205 to i32, !dbg !21
+  %223 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %222, i32 1, i32 31), !dbg !21
+  %224 = bitcast i32 %223 to float, !dbg !21
+  %225 = fsub float %218, %210, !dbg !18
+  %226 = fadd float %205, %224, !dbg !24
+  %227 = fcmp oeq float %226, 0.000000e+00, !dbg !25
+  %228 = tail call float @llvm.nvvm.div.full(float %224, float %226), !dbg !26
+  %229 = select i1 %227, float 0.000000e+00, float %228, !dbg !27
+  %230 = fmul float %229, %225, !dbg !28
+  %231 = fadd float %210, %230, !dbg !29
+  %232 = fadd float %215, %221, !dbg !34
+  %233 = fmul float %225, %225, !dbg !30
+  %234 = fmul float %205, %233, !dbg !31
+  %235 = fmul float %229, %234, !dbg !32
+  %236 = fadd float %232, %235, !dbg !33
+  %237 = icmp eq i32 %12, 0, !dbg !21
+  %238 = getelementptr float, ptr addrspace(3) @global_smem, i32 %13, !dbg !21
+  %239 = bitcast float %231 to <1 x i32>, !dbg !21
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %238, <1 x i32> %239, i1 %237) #6, !dbg !21
+  %240 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %13, !dbg !21
+  %241 = bitcast float %236 to <1 x i32>, !dbg !21
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %240, <1 x i32> %241, i1 %237) #6, !dbg !21
+  %242 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %13, !dbg !21
+  %243 = bitcast float %226 to <1 x i32>, !dbg !21
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %242, <1 x i32> %243, i1 %237) #6, !dbg !21
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21
+  %244 = icmp samesign ult i32 %11, 16, !dbg !21
+  %245 = getelementptr float, ptr addrspace(3) @global_smem, i32 %11, !dbg !21
+  %246 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %245, i1 %244) #6, !dbg !21
+  %247 = bitcast i32 %246 to float, !dbg !21
+  %248 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %11, !dbg !21
+  %249 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %248, i1 %244) #6, !dbg !21
+  %250 = bitcast i32 %249 to float, !dbg !21
+  %251 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %11, !dbg !21
+  %252 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %251, i1 %244) #6, !dbg !21
+  %253 = bitcast i32 %252 to float, !dbg !21
+  %254 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %246, i32 8, i32 31), !dbg !21
+  %255 = bitcast i32 %254 to float, !dbg !21
+  %256 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %249, i32 8, i32 31), !dbg !21
+  %257 = bitcast i32 %256 to float, !dbg !21
+  %258 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %252, i32 8, i32 31), !dbg !21
+  %259 = bitcast i32 %258 to float, !dbg !21
+  %260 = fsub float %255, %247, !dbg !18
+  %261 = fadd float %253, %259, !dbg !24
+  %262 = fcmp oeq float %261, 0.000000e+00, !dbg !25
+  %263 = tail call float @llvm.nvvm.div.full(float %259, float %261), !dbg !26
+  %264 = select i1 %262, float 0.000000e+00, float %263, !dbg !27
+  %265 = fmul float %260, %264, !dbg !28
+  %266 = fadd float %265, %247, !dbg !29
+  %267 = fadd float %250, %257, !dbg !34
+  %268 = fmul float %260, %260, !dbg !30
+  %269 = fmul float %268, %253, !dbg !31
+  %270 = fmul float %269, %264, !dbg !32
+  %271 = fadd float %267, %270, !dbg !33
+  %272 = bitcast float %266 to i32, !dbg !21
+  %273 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 4, i32 31), !dbg !21
+  %274 = bitcast i32 %273 to float, !dbg !21
+  %275 = bitcast float %271 to i32, !dbg !21
+  %276 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %275, i32 4, i32 31), !dbg !21
+  %277 = bitcast i32 %276 to float, !dbg !21
+  %278 = bitcast float %261 to i32, !dbg !21
+  %279 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %278, i32 4, i32 31), !dbg !21
+  %280 = bitcast i32 %279 to float, !dbg !21
+  %281 = fsub float %274, %266, !dbg !18
+  %282 = fadd float %261, %280, !dbg !24
+  %283 = fcmp oeq float %282, 0.000000e+00, !dbg !25
+  %284 = tail call float @llvm.nvvm.div.full(float %280, float %282), !dbg !26
+  %285 = select i1 %283, float 0.000000e+00, float %284, !dbg !27
+  %286 = fmul float %281, %285, !dbg !28
+  %287 = fadd float %266, %286, !dbg !29
+  %288 = fadd float %271, %277, !dbg !34
+  %289 = fmul float %281, %281, !dbg !30
+  %290 = fmul float %261, %289, !dbg !31
+  %291 = fmul float %285, %290, !dbg !32
+  %292 = fadd float %288, %291, !dbg !33
+  %293 = bitcast float %287 to i32, !dbg !21
+  %294 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %293, i32 2, i32 31), !dbg !21
+  %295 = bitcast i32 %294 to float, !dbg !21
+  %296 = bitcast float %292 to i32, !dbg !21
+  %297 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %296, i32 2, i32 31), !dbg !21
+  %298 = bitcast i32 %297 to float, !dbg !21
+  %299 = bitcast float %282 to i32, !dbg !21
+  %300 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %299, i32 2, i32 31), !dbg !21
+  %301 = bitcast i32 %300 to float, !dbg !21
+  %302 = fsub float %295, %287, !dbg !18
+  %303 = fadd float %282, %301, !dbg !24
+  %304 = fcmp oeq float %303, 0.000000e+00, !dbg !25
+  %305 = tail call float @llvm.nvvm.div.full(float %301, float %303), !dbg !26
+  %306 = select i1 %304, float 0.000000e+00, float %305, !dbg !27
+  %307 = fmul float %302, %306, !dbg !28
+  %308 = fadd float %287, %307, !dbg !29
+  %309 = fadd float %292, %298, !dbg !34
+  %310 = fmul float %302, %302, !dbg !30
+  %311 = fmul float %282, %310, !dbg !31
+  %312 = fmul float %306, %311, !dbg !32
+  %313 = fadd float %309, %312, !dbg !33
+  %314 = bitcast float %308 to i32, !dbg !21
+  %315 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %314, i32 1, i32 31), !dbg !21
+  %316 = bitcast i32 %315 to float, !dbg !21
+  %317 = bitcast float %313 to i32, !dbg !21
+  %318 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %317, i32 1, i32 31), !dbg !21
+  %319 = bitcast i32 %318 to float, !dbg !21
+  %320 = bitcast float %303 to i32, !dbg !21
+  %321 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %320, i32 1, i32 31), !dbg !21
+  %322 = bitcast i32 %321 to float, !dbg !21
+  %323 = fsub float %316, %308, !dbg !18
+  %324 = fadd float %303, %322, !dbg !24
+  %325 = fcmp oeq float %324, 0.000000e+00, !dbg !25
+  %326 = tail call float @llvm.nvvm.div.full(float %322, float %324), !dbg !26
+  %327 = select i1 %325, float 0.000000e+00, float %326, !dbg !27
+  %328 = fmul float %323, %327, !dbg !28
+  %329 = fadd float %308, %328, !dbg !29
+  %330 = fadd float %313, %319, !dbg !34
+  %331 = fmul float %323, %323, !dbg !30
+  %332 = fmul float %303, %331, !dbg !31
+  %333 = fmul float %327, %332, !dbg !32
+  %334 = fadd float %330, %333, !dbg !33
+  %335 = and i32 %10, 15, !dbg !21
+  %336 = icmp eq i32 %335, 0, !dbg !21
+  %337 = and i1 %244, %336, !dbg !21
+  %338 = bitcast float %329 to <1 x i32>, !dbg !21
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %245, <1 x i32> %338, i1 %337) #6, !dbg !21
+  %339 = bitcast float %334 to <1 x i32>, !dbg !21
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %248, <1 x i32> %339, i1 %337) #6, !dbg !21
+  %340 = bitcast float %324 to <1 x i32>, !dbg !21
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %251, <1 x i32> %340, i1 %337) #6, !dbg !21
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21
+  %341 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !21
+  %342 = load float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !21
+  %343 = zext nneg i32 %15 to i64, !dbg !35
+  %344 = getelementptr bfloat, ptr addrspace(1) %1, i64 %343, !dbg !35
+  %345 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !36
+  %346 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %344, i64 %345, i1 true) #6, !dbg !36
+  %347 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !37
+  %348 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %19, i64 %347, i1 %9) #6, !dbg !37
+  %349 = getelementptr bfloat, ptr addrspace(1) %2, i64 %343, !dbg !38
+  %350 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !39
+  %351 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %349, i64 %350, i1 true) #6, !dbg !39
+  %352 = tail call float @llvm.nvvm.div.full(float %342, float 4.096000e+03), !dbg !40
+  %353 = fadd float %352, 0x3EB0C6F7A0000000, !dbg !41
+  %354 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
+  %355 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
+  %356 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
+  %357 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
+  %358 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
+  %359 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
+  %360 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
+  %361 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !42
+  %.not.i19 = icmp eq i32 %361, 0, !dbg !42
+  br i1 %.not.i19, label %364, label %362, !dbg !42
+
+362:                                              ; preds = %__nv_rsqrtf.exit
+  %363 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %353), !dbg !42
+  br label %__nv_rsqrtf.exit21, !dbg !42
+
+364:                                              ; preds = %__nv_rsqrtf.exit
+  %365 = tail call float @llvm.nvvm.rsqrt.approx.f(float %353), !dbg !42
+  br label %__nv_rsqrtf.exit21, !dbg !42
+
+__nv_rsqrtf.exit21:                               ; preds = %362, %364
+  %.0.i20 = phi float [ %363, %362 ], [ %365, %364 ], !dbg !42
+  %366 = extractvalue { i32, i32, i32, i32 } %348, 3, !dbg !37
+  %367 = bitcast i32 %366 to <2 x bfloat>, !dbg !37
+  %368 = extractvalue { i32, i32, i32, i32 } %348, 2, !dbg !37
+  %369 = bitcast i32 %368 to <2 x bfloat>, !dbg !37
+  %370 = extractvalue { i32, i32, i32, i32 } %348, 1, !dbg !37
+  %371 = bitcast i32 %370 to <2 x bfloat>, !dbg !37
+  %372 = extractvalue { i32, i32, i32, i32 } %348, 0, !dbg !37
+  %373 = bitcast i32 %372 to <2 x bfloat>, !dbg !37
+  %374 = extractvalue { i32, i32, i32, i32 } %346, 3, !dbg !36
+  %375 = bitcast i32 %374 to <2 x bfloat>, !dbg !36
+  %376 = extractvalue { i32, i32, i32, i32 } %346, 2, !dbg !36
+  %377 = bitcast i32 %376 to <2 x bfloat>, !dbg !36
+  %378 = extractvalue { i32, i32, i32, i32 } %346, 1, !dbg !36
+  %379 = bitcast i32 %378 to <2 x bfloat>, !dbg !36
+  %380 = extractvalue { i32, i32, i32, i32 } %346, 0, !dbg !36
+  %381 = bitcast i32 %380 to <2 x bfloat>, !dbg !36
+  %382 = extractvalue { i32, i32, i32, i32 } %351, 3, !dbg !39
+  %383 = bitcast i32 %382 to <2 x bfloat>, !dbg !39
+  %384 = extractvalue { i32, i32, i32, i32 } %351, 2, !dbg !39
+  %385 = bitcast i32 %384 to <2 x bfloat>, !dbg !39
+  %386 = extractvalue { i32, i32, i32, i32 } %351, 1, !dbg !39
+  %387 = bitcast i32 %386 to <2 x bfloat>, !dbg !39
+  %388 = extractvalue { i32, i32, i32, i32 } %351, 0, !dbg !39
+  %389 = bitcast i32 %388 to <2 x bfloat>, !dbg !39
+  %390 = getelementptr bfloat, ptr addrspace(1) %3, i64 %18, !dbg !43
+  %391 = fpext <2 x bfloat> %373 to <2 x float>, !dbg !44
+  %392 = insertelement <2 x float> poison, float %341, i64 0, !dbg !45
+  %393 = shufflevector <2 x float> %392, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !45
+  %394 = fsub <2 x float> %391, %393, !dbg !45
+  %395 = fpext <2 x bfloat> %381 to <2 x float>, !dbg !46
+  %396 = fadd <2 x float> %395, splat (float 1.000000e+00), !dbg !47
+  %397 = fpext <2 x bfloat> %389 to <2 x float>, !dbg !48
+  %398 = insertelement <2 x float> poison, float %.0.i20, i64 0, !dbg !49
+  %399 = shufflevector <2 x float> %398, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !49
+  %400 = fmul <2 x float> %394, %399, !dbg !49
+  %401 = fmul <2 x float> %396, %400, !dbg !50
+  %402 = fadd <2 x float> %401, %397, !dbg !51
+  %403 = fptrunc <2 x float> %402 to <2 x bfloat>, !dbg !52
+  %404 = fpext <2 x bfloat> %371 to <2 x float>, !dbg !44
+  %405 = fsub <2 x float> %404, %393, !dbg !45
+  %406 = fpext <2 x bfloat> %379 to <2 x float>, !dbg !46
+  %407 = fadd <2 x float> %406, splat (float 1.000000e+00), !dbg !47
+  %408 = fpext <2 x bfloat> %387 to <2 x float>, !dbg !48
+  %409 = fmul <2 x float> %405, %399, !dbg !49
+  %410 = fmul <2 x float> %407, %409, !dbg !50
+  %411 = fadd <2 x float> %410, %408, !dbg !51
+  %412 = fptrunc <2 x float> %411 to <2 x bfloat>, !dbg !52
+  %413 = fpext <2 x bfloat> %369 to <2 x float>, !dbg !44
+  %414 = fsub <2 x float> %413, %393, !dbg !45
+  %415 = fpext <2 x bfloat> %377 to <2 x float>, !dbg !46
+  %416 = fadd <2 x float> %415, splat (float 1.000000e+00), !dbg !47
+  %417 = fpext <2 x bfloat> %385 to <2 x float>, !dbg !48
+  %418 = fmul <2 x float> %414, %399, !dbg !49
+  %419 = fmul <2 x float> %416, %418, !dbg !50
+  %420 = fadd <2 x float> %419, %417, !dbg !51
+  %421 = fptrunc <2 x float> %420 to <2 x bfloat>, !dbg !52
+  %422 = fpext <2 x bfloat> %367 to <2 x float>, !dbg !44
+  %423 = fsub <2 x float> %422, %393, !dbg !45
+  %424 = fpext <2 x bfloat> %375 to <2 x float>, !dbg !46
+  %425 = fadd <2 x float> %424, splat (float 1.000000e+00), !dbg !47
+  %426 = fpext <2 x bfloat> %383 to <2 x float>, !dbg !48
+  %427 = fmul <2 x float> %423, %399, !dbg !49
+  %428 = fmul <2 x float> %425, %427, !dbg !50
+  %429 = fadd <2 x float> %428, %426, !dbg !51
+  %430 = fptrunc <2 x float> %429 to <2 x bfloat>, !dbg !52
+  %431 = bitcast <2 x bfloat> %403 to i32, !dbg !52
+  %432 = bitcast <2 x bfloat> %412 to i32, !dbg !52
+  %433 = bitcast <2 x bfloat> %421 to i32, !dbg !52
+  %434 = bitcast <2 x bfloat> %430 to i32, !dbg !52
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %431, i32 %432, i32 %433, i32 %434, ptr addrspace(1) %390, i1 %9) #6, !dbg !52
+  ret void, !dbg !53
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #2
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #2
+
+attributes #0 = { nounwind "nvvm.reqntid"="512" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #4 = { convergent nocallback nounwind }
+attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!5 = distinct !DISubprogram(name: "triton_red_fused_add_mul_native_layer_norm_0", linkageName: "triton_red_fused_add_mul_native_layer_norm_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 23, column: 28, scope: !5)
+!9 = !DILocation(line: 25, column: 21, scope: !5)
+!10 = !DILocation(line: 26, column: 37, scope: !5)
+!11 = !DILocation(line: 38, column: 46, scope: !5)
+!12 = !DILocation(line: 38, column: 41, scope: !5)
+!13 = !DILocation(line: 38, column: 34, scope: !5)
+!14 = !DILocation(line: 38, column: 51, scope: !5)
+!15 = !DILocation(line: 38, column: 112, scope: !5)
+!16 = !DILocation(line: 44, column: 62, scope: !5)
+!17 = !DILocation(line: 46, column: 66, scope: !5)
+!18 = !DILocation(line: 231, column: 21, scope: !19, inlinedAt: !21)
+!19 = distinct !DILexicalBlockFile(scope: !5, file: !20, discriminator: 0)
+!20 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime")
+!21 = !DILocation(line: 243, column: 46, scope: !19, inlinedAt: !22)
+!22 = !DILocation(line: 47, column: 79, scope: !23)
+!23 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0)
+!24 = !DILocation(line: 232, column: 28, scope: !19, inlinedAt: !21)
+!25 = !DILocation(line: 233, column: 39, scope: !19, inlinedAt: !21)
+!26 = !DILocation(line: 233, column: 60, scope: !19, inlinedAt: !21)
+!27 = !DILocation(line: 233, column: 49, scope: !19, inlinedAt: !21)
+!28 = !DILocation(line: 235, column: 25, scope: !19, inlinedAt: !21)
+!29 = !DILocation(line: 235, column: 17, scope: !19, inlinedAt: !21)
+!30 = !DILocation(line: 236, column: 30, scope: !19, inlinedAt: !21)
+!31 = !DILocation(line: 236, column: 38, scope: !19, inlinedAt: !21)
+!32 = !DILocation(line: 236, column: 49, scope: !19, inlinedAt: !21)
+!33 = !DILocation(line: 236, column: 22, scope: !19, inlinedAt: !21)
+!34 = !DILocation(line: 236, column: 15, scope: !19, inlinedAt: !21)
+!35 = !DILocation(line: 57, column: 34, scope: !5)
+!36 = !DILocation(line: 57, column: 41, scope: !5)
+!37 = !DILocation(line: 58, column: 52, scope: !5)
+!38 = !DILocation(line: 59, column: 35, scope: !5)
+!39 = !DILocation(line: 59, column: 42, scope: !5)
+!40 = !DILocation(line: 65, column: 24, scope: !5)
+!41 = !DILocation(line: 67, column: 24, scope: !5)
+!42 = !DILocation(line: 68, column: 32, scope: !5)
+!43 = !DILocation(line: 73, column: 29, scope: !5)
+!44 = !DILocation(line: 58, column: 114, scope: !5)
+!45 = !DILocation(line: 63, column: 24, scope: !5)
+!46 = !DILocation(line: 57, column: 94, scope: !5)
+!47 = !DILocation(line: 61, column: 23, scope: !5)
+!48 = !DILocation(line: 59, column: 95, scope: !5)
+!49 = !DILocation(line: 69, column: 24, scope: !5)
+!50 = !DILocation(line: 71, column: 24, scope: !5)
+!51 = !DILocation(line: 72, column: 24, scope: !5)
+!52 = !DILocation(line: 73, column: 53, scope: !5)
+!53 = !DILocation(line: 51, column: 4, scope: !5)
diff --git a/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.ptx b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..3b88e55748c4e5cea7c33f4ae223a07f59f126e9
--- /dev/null
+++ b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.ptx
@@ -0,0 +1,1032 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused_add_mul_native_layer_norm_0 // -- Begin function triton_red_fused_add_mul_native_layer_norm_0
+.extern .shared .align 16 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
+                                        // @triton_red_fused_add_mul_native_layer_norm_0
+.visible .entry triton_red_fused_add_mul_native_layer_norm_0(
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_1,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_3,
+	.param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_4,
+	.param .u32 triton_red_fused_add_mul_native_layer_norm_0_param_5,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_6,
+	.param .u64 .ptr .global .align 1 triton_red_fused_add_mul_native_layer_norm_0_param_7
+)
+.reqntid 512
+{
+	.reg .pred 	%p<23>;
+	.reg .b16 	%rs<33>;
+	.reg .b32 	%r<287>;
+	.reg .b64 	%rd<15>;
+	.loc	1 18 0                          // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:18:0
+
+// %bb.0:                               // %__nv_rsqrtf.exit
+	ld.param.b64 	%rd9, [triton_red_fused_add_mul_native_layer_norm_0_param_0];
+	ld.param.b64 	%rd10, [triton_red_fused_add_mul_native_layer_norm_0_param_1];
+$L__tmp0:
+	.loc	1 23 28                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:23:28
+	mov.u32 	%r37, %ctaid.x;
+	.loc	1 25 21                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:25:21
+	setp.lt.u32 	%p1, %r37, 2304;
+	ld.param.b64 	%rd11, [triton_red_fused_add_mul_native_layer_norm_0_param_2];
+	ld.param.b64 	%rd12, [triton_red_fused_add_mul_native_layer_norm_0_param_3];
+	.loc	1 26 37                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:26:37
+	mov.u32 	%r38, %tid.x;
+	and.b32 	%r39, %r38, 511;
+	and.b32 	%r40, %r38, 31;
+	shl.b32 	%r41, %r38, 3;
+	and.b32 	%r42, %r41, 4088;
+	.loc	1 38 46                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:46
+	shl.b32 	%r43, %r37, 12;
+	.loc	1 38 41                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:41
+	or.b32 	%r44, %r42, %r43;
+	.loc	1 38 34                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:34
+	mul.wide.s32 	%rd13, %r44, 2;
+	add.s64 	%rd1, %rd9, %rd13;
+	.loc	1 38 51                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:51
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r5, 0;
+	// begin inline asm
+	mov.u32 %r1, %r5;
+	mov.u32 %r2, %r5;
+	mov.u32 %r3, %r5;
+	mov.u32 %r4, %r5;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	mov.b32 	{%rs1, %rs2}, %r1;
+	mov.b32 	{%rs3, %rs4}, %r2;
+	mov.b32 	{%rs5, %rs6}, %r3;
+	mov.b32 	{%rs7, %rs8}, %r4;
+	.loc	1 38 112                        // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:38:112
+	cvt.f32.bf16 	%r45, %rs1;
+	cvt.f32.bf16 	%r46, %rs2;
+	cvt.f32.bf16 	%r47, %rs3;
+	cvt.f32.bf16 	%r48, %rs4;
+	cvt.f32.bf16 	%r49, %rs5;
+	cvt.f32.bf16 	%r50, %rs6;
+	cvt.f32.bf16 	%r51, %rs7;
+	cvt.f32.bf16 	%r52, %rs8;
+	.loc	1 44 62                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:44:62
+	selp.f32 	%r53, %r45, 0f00000000, %p1;
+	selp.f32 	%r54, %r46, 0f00000000, %p1;
+	selp.f32 	%r55, %r47, 0f00000000, %p1;
+	selp.f32 	%r56, %r48, 0f00000000, %p1;
+	selp.f32 	%r57, %r49, 0f00000000, %p1;
+	selp.f32 	%r58, %r50, 0f00000000, %p1;
+	selp.f32 	%r59, %r51, 0f00000000, %p1;
+	selp.f32 	%r60, %r52, 0f00000000, %p1;
+	.loc	1 46 66                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:46:66
+	selp.f32 	%r61, 0f3F800000, 0f00000000, %p1;
+$L__tmp1:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r62, %r54, %r53;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r63, 0f40000000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p6, %r63, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r64, %r61, %r63;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r65, 0f00000000, %r64, %p6;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r66, %r65, %r62, %r53;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r67, %r62, %r62;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r68, %r61, %r67;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r69, %r65, %r68, 0f00000000;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r70, %r55, %r66;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r71, 0f40400000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p7, %r71, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r72, %r61, %r71;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r73, 0f00000000, %r72, %p7;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r74, %r73, %r70, %r66;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r75, %r70, %r70;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r76, %r63, %r75;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r77, %r73, %r76, %r69;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r78, %r56, %r74;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r79, 0f40800000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p8, %r79, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r80, %r61, %r79;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r81, 0f00000000, %r80, %p8;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r82, %r81, %r78, %r74;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r83, %r78, %r78;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r84, %r71, %r83;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r85, %r81, %r84, %r77;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r86, %r57, %r82;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r87, 0f40A00000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p9, %r87, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r88, %r61, %r87;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r89, 0f00000000, %r88, %p9;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r90, %r89, %r86, %r82;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r91, %r86, %r86;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r92, %r79, %r91;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r93, %r89, %r92, %r85;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r94, %r58, %r90;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r95, 0f40C00000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p10, %r95, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r96, %r61, %r95;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r97, 0f00000000, %r96, %p10;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r98, %r97, %r94, %r90;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r99, %r94, %r94;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r100, %r87, %r99;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r101, %r97, %r100, %r93;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r102, %r59, %r98;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r103, 0f40E00000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p11, %r103, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r104, %r61, %r103;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r105, 0f00000000, %r104, %p11;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r106, %r105, %r102, %r98;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r107, %r102, %r102;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r108, %r95, %r107;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r109, %r105, %r108, %r101;
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r110, %r60, %r106;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r111, 0f41000000, 0f00000000, %p1;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p12, %r111, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r112, %r61, %r111;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r113, 0f00000000, %r112, %p12;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r114, %r113, %r110, %r106;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r115, %r110, %r110;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r116, %r103, %r115;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r117, %r113, %r116, %r109;
+$L__tmp2:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ]
+	shfl.sync.bfly.b32 	%r118, %r114, 16, 31, -1;
+	shfl.sync.bfly.b32 	%r119, %r117, 16, 31, -1;
+	shfl.sync.bfly.b32 	%r120, %r111, 16, 31, -1;
+$L__tmp3:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r121, %r118, %r114;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r122, %r111, %r120;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p13, %r122, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r123, %r120, %r122;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r124, 0f00000000, %r123, %p13;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r125, %r124, %r121, %r114;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r126, %r117, %r119;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r127, %r121, %r121;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r128, %r111, %r127;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r129, %r124, %r128, %r126;
+$L__tmp4:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ]
+	shfl.sync.bfly.b32 	%r130, %r125, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r131, %r129, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r132, %r122, 8, 31, -1;
+$L__tmp5:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r133, %r130, %r125;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r134, %r122, %r132;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p14, %r134, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r135, %r132, %r134;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r136, 0f00000000, %r135, %p14;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r137, %r136, %r133, %r125;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r138, %r129, %r131;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r139, %r133, %r133;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r140, %r122, %r139;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r141, %r136, %r140, %r138;
+$L__tmp6:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ]
+	shfl.sync.bfly.b32 	%r142, %r137, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r143, %r141, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r144, %r134, 4, 31, -1;
+$L__tmp7:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r145, %r142, %r137;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r146, %r134, %r144;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p15, %r146, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r147, %r144, %r146;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r148, 0f00000000, %r147, %p15;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r149, %r148, %r145, %r137;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r150, %r141, %r143;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r151, %r145, %r145;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r152, %r134, %r151;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r153, %r148, %r152, %r150;
+$L__tmp8:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ]
+	shfl.sync.bfly.b32 	%r154, %r149, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r155, %r153, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r156, %r146, 2, 31, -1;
+$L__tmp9:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r157, %r154, %r149;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r158, %r146, %r156;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p16, %r158, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r159, %r156, %r158;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r160, 0f00000000, %r159, %p16;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r161, %r160, %r157, %r149;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r162, %r153, %r155;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r163, %r157, %r157;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r164, %r146, %r163;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r165, %r160, %r164, %r162;
+$L__tmp10:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ]
+	shfl.sync.bfly.b32 	%r166, %r161, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r167, %r165, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r168, %r158, 1, 31, -1;
+$L__tmp11:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r169, %r166, %r161;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r11, %r158, %r168;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p17, %r11, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r170, %r168, %r11;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r171, 0f00000000, %r170, %p17;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r7, %r171, %r169, %r161;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r172, %r165, %r167;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r173, %r169, %r169;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r174, %r158, %r173;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r9, %r171, %r174, %r172;
+$L__tmp12:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ]
+	setp.eq.b32 	%p2, %r40, 0;
+	shr.u32 	%r175, %r38, 3;
+	and.b32 	%r176, %r175, 60;
+	mov.b32 	%r177, global_smem;
+	add.s32 	%r6, %r177, %r176;
+	// begin inline asm
+	@%p2 st.shared.b32 [ %r6 + 0 ], %r7;
+	// end inline asm
+	add.s32 	%r8, %r6, 64;
+	// begin inline asm
+	@%p2 st.shared.b32 [ %r8 + 0 ], %r9;
+	// end inline asm
+	add.s32 	%r10, %r6, 128;
+	// begin inline asm
+	@%p2 st.shared.b32 [ %r10 + 0 ], %r11;
+	// end inline asm
+	bar.sync 	0;
+	setp.lt.u32 	%p3, %r39, 16;
+	shl.b32 	%r178, %r39, 2;
+	add.s32 	%r13, %r177, %r178;
+	// begin inline asm
+	@%p3 ld.shared.b32 %r12, [ %r13 + 0 ];
+	// end inline asm
+	add.s32 	%r15, %r13, 64;
+	// begin inline asm
+	@%p3 ld.shared.b32 %r14, [ %r15 + 0 ];
+	// end inline asm
+	add.s32 	%r17, %r13, 128;
+	// begin inline asm
+	@%p3 ld.shared.b32 %r16, [ %r17 + 0 ];
+	// end inline asm
+	shfl.sync.bfly.b32 	%r179, %r12, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r180, %r14, 8, 31, -1;
+	shfl.sync.bfly.b32 	%r181, %r16, 8, 31, -1;
+$L__tmp13:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r182, %r179, %r12;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r183, %r16, %r181;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p18, %r183, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r184, %r181, %r183;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r185, 0f00000000, %r184, %p18;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r186, %r182, %r185, %r12;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r187, %r14, %r180;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r188, %r182, %r182;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r189, %r188, %r16;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r190, %r189, %r185, %r187;
+$L__tmp14:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ]
+	shfl.sync.bfly.b32 	%r191, %r186, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r192, %r190, 4, 31, -1;
+	shfl.sync.bfly.b32 	%r193, %r183, 4, 31, -1;
+$L__tmp15:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r194, %r191, %r186;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r195, %r183, %r193;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p19, %r195, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r196, %r193, %r195;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r197, 0f00000000, %r196, %p19;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r198, %r194, %r197, %r186;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r199, %r190, %r192;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r200, %r194, %r194;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r201, %r183, %r200;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r202, %r197, %r201, %r199;
+$L__tmp16:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ]
+	shfl.sync.bfly.b32 	%r203, %r198, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r204, %r202, 2, 31, -1;
+	shfl.sync.bfly.b32 	%r205, %r195, 2, 31, -1;
+$L__tmp17:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r206, %r203, %r198;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r207, %r195, %r205;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p20, %r207, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r208, %r205, %r207;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r209, 0f00000000, %r208, %p20;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r210, %r206, %r209, %r198;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r211, %r202, %r204;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r212, %r206, %r206;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r213, %r195, %r212;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r214, %r209, %r213, %r211;
+$L__tmp18:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ]
+	shfl.sync.bfly.b32 	%r215, %r210, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r216, %r214, 1, 31, -1;
+	shfl.sync.bfly.b32 	%r217, %r207, 1, 31, -1;
+$L__tmp19:
+	.loc	2 231 21                        // triton_helpers.py:231:21 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	sub.f32 	%r218, %r215, %r210;
+	.loc	2 232 28                        // triton_helpers.py:232:28 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r20, %r207, %r217;
+	.loc	2 233 39                        // triton_helpers.py:233:39 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	setp.eq.f32 	%p21, %r20, 0f00000000;
+	.loc	2 233 60                        // triton_helpers.py:233:60 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	div.full.f32 	%r219, %r217, %r20;
+	.loc	2 233 49                        // triton_helpers.py:233:49 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	selp.f32 	%r220, 0f00000000, %r219, %p21;
+	.loc	2 235 17                        // triton_helpers.py:235:17 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r18, %r218, %r220, %r210;
+	.loc	2 236 15                        // triton_helpers.py:236:15 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	add.f32 	%r221, %r214, %r216;
+	.loc	2 236 30                        // triton_helpers.py:236:30 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r222, %r218, %r218;
+	.loc	2 236 38                        // triton_helpers.py:236:38 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	mul.f32 	%r223, %r207, %r222;
+	.loc	2 236 22                        // triton_helpers.py:236:22 @[ triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ] ]
+	fma.rn.f32 	%r19, %r220, %r223, %r221;
+$L__tmp20:
+	.loc	2 243 46                        // triton_helpers.py:243:46 @[ cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:47:79 ]
+	and.b32 	%r224, %r38, 15;
+	setp.eq.b32 	%p22, %r224, 0;
+	and.pred 	%p4, %p3, %p22;
+	// begin inline asm
+	@%p4 st.shared.b32 [ %r13 + 0 ], %r18;
+	// end inline asm
+	// begin inline asm
+	@%p4 st.shared.b32 [ %r15 + 0 ], %r19;
+	// end inline asm
+	// begin inline asm
+	@%p4 st.shared.b32 [ %r17 + 0 ], %r20;
+	// end inline asm
+	bar.sync 	0;
+	ld.shared.b32 	%r225, [global_smem];
+	ld.shared.b32 	%r226, [global_smem+64];
+$L__tmp21:
+	.loc	1 57 34                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:34
+	mul.wide.u32 	%rd14, %r42, 2;
+	add.s64 	%rd3, %rd10, %rd14;
+	.loc	1 57 41                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:41
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0;
+	// end inline asm
+	mov.pred 	%p5, -1;
+	// begin inline asm
+	mov.u32 %r21, %r5;
+	mov.u32 %r22, %r5;
+	mov.u32 %r23, %r5;
+	mov.u32 %r24, %r5;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r21, %r22, %r23, %r24 }, [ %rd3 + 0 ], %rd4;
+	// end inline asm
+	.loc	1 58 52                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:52
+	// begin inline asm
+	mov.u64 %rd5, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd5, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r25, %r5;
+	mov.u32 %r26, %r5;
+	mov.u32 %r27, %r5;
+	mov.u32 %r28, %r5;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r25, %r26, %r27, %r28 }, [ %rd1 + 0 ], %rd5;
+	// end inline asm
+	.loc	1 59 35                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:35
+	add.s64 	%rd6, %rd11, %rd14;
+	.loc	1 59 42                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:42
+	// begin inline asm
+	mov.u64 %rd7, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd7, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r29, %r5;
+	mov.u32 %r30, %r5;
+	mov.u32 %r31, %r5;
+	mov.u32 %r32, %r5;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r29, %r30, %r31, %r32 }, [ %rd6 + 0 ], %rd7;
+	// end inline asm
+	mov.b32 	%r227, 0f45800000;
+	.loc	1 65 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:65:24
+	div.full.f32 	%r228, %r226, %r227;
+	.loc	1 67 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:67:24
+	add.f32 	%r229, %r228, 0f358637BD;
+	.loc	1 68 32                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:68:32
+	rsqrt.approx.ftz.f32 	%r230, %r229;
+	.loc	1 73 29                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:73:29
+	add.s64 	%rd8, %rd12, %rd13;
+	.loc	1 58 114                        // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:114
+	mov.b32 	{%rs9, %rs10}, %r25;
+	cvt.f32.bf16 	%r231, %rs10;
+	cvt.f32.bf16 	%r232, %rs9;
+	.loc	1 63 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:63:24
+	sub.f32 	%r233, %r232, %r225;
+	sub.f32 	%r234, %r231, %r225;
+	.loc	1 57 94                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:94
+	mov.b32 	{%rs11, %rs12}, %r21;
+	cvt.f32.bf16 	%r235, %rs11;
+	cvt.f32.bf16 	%r236, %rs12;
+	.loc	1 61 23                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:61:23
+	add.f32 	%r237, %r236, 0f3F800000;
+	add.f32 	%r238, %r235, 0f3F800000;
+	.loc	1 59 95                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:95
+	mov.b32 	{%rs13, %rs14}, %r29;
+	cvt.f32.bf16 	%r239, %rs14;
+	cvt.f32.bf16 	%r240, %rs13;
+	.loc	1 69 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:69:24
+	mul.f32 	%r241, %r234, %r230;
+	mul.f32 	%r242, %r233, %r230;
+	.loc	1 72 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:72:24
+	fma.rn.f32 	%r243, %r238, %r242, %r240;
+	fma.rn.f32 	%r244, %r237, %r241, %r239;
+	.loc	1 73 53                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:73:53
+	cvt.rn.bf16x2.f32 	%r33, %r244, %r243;
+	.loc	1 58 114                        // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:114
+	mov.b32 	{%rs15, %rs16}, %r26;
+	cvt.f32.bf16 	%r245, %rs16;
+	cvt.f32.bf16 	%r246, %rs15;
+	.loc	1 63 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:63:24
+	sub.f32 	%r247, %r246, %r225;
+	sub.f32 	%r248, %r245, %r225;
+	.loc	1 57 94                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:94
+	mov.b32 	{%rs17, %rs18}, %r22;
+	cvt.f32.bf16 	%r249, %rs17;
+	cvt.f32.bf16 	%r250, %rs18;
+	.loc	1 61 23                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:61:23
+	add.f32 	%r251, %r250, 0f3F800000;
+	add.f32 	%r252, %r249, 0f3F800000;
+	.loc	1 59 95                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:95
+	mov.b32 	{%rs19, %rs20}, %r30;
+	cvt.f32.bf16 	%r253, %rs20;
+	cvt.f32.bf16 	%r254, %rs19;
+	.loc	1 69 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:69:24
+	mul.f32 	%r255, %r248, %r230;
+	mul.f32 	%r256, %r247, %r230;
+	.loc	1 72 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:72:24
+	fma.rn.f32 	%r257, %r252, %r256, %r254;
+	fma.rn.f32 	%r258, %r251, %r255, %r253;
+	.loc	1 73 53                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:73:53
+	cvt.rn.bf16x2.f32 	%r34, %r258, %r257;
+	.loc	1 58 114                        // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:114
+	mov.b32 	{%rs21, %rs22}, %r27;
+	cvt.f32.bf16 	%r259, %rs22;
+	cvt.f32.bf16 	%r260, %rs21;
+	.loc	1 63 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:63:24
+	sub.f32 	%r261, %r260, %r225;
+	sub.f32 	%r262, %r259, %r225;
+	.loc	1 57 94                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:94
+	mov.b32 	{%rs23, %rs24}, %r23;
+	cvt.f32.bf16 	%r263, %rs23;
+	cvt.f32.bf16 	%r264, %rs24;
+	.loc	1 61 23                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:61:23
+	add.f32 	%r265, %r264, 0f3F800000;
+	add.f32 	%r266, %r263, 0f3F800000;
+	.loc	1 59 95                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:95
+	mov.b32 	{%rs25, %rs26}, %r31;
+	cvt.f32.bf16 	%r267, %rs26;
+	cvt.f32.bf16 	%r268, %rs25;
+	.loc	1 69 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:69:24
+	mul.f32 	%r269, %r262, %r230;
+	mul.f32 	%r270, %r261, %r230;
+	.loc	1 72 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:72:24
+	fma.rn.f32 	%r271, %r266, %r270, %r268;
+	fma.rn.f32 	%r272, %r265, %r269, %r267;
+	.loc	1 73 53                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:73:53
+	cvt.rn.bf16x2.f32 	%r35, %r272, %r271;
+	.loc	1 58 114                        // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:58:114
+	mov.b32 	{%rs27, %rs28}, %r28;
+	cvt.f32.bf16 	%r273, %rs28;
+	cvt.f32.bf16 	%r274, %rs27;
+	.loc	1 63 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:63:24
+	sub.f32 	%r275, %r274, %r225;
+	sub.f32 	%r276, %r273, %r225;
+	.loc	1 57 94                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:57:94
+	mov.b32 	{%rs29, %rs30}, %r24;
+	cvt.f32.bf16 	%r277, %rs29;
+	cvt.f32.bf16 	%r278, %rs30;
+	.loc	1 61 23                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:61:23
+	add.f32 	%r279, %r278, 0f3F800000;
+	add.f32 	%r280, %r277, 0f3F800000;
+	.loc	1 59 95                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:59:95
+	mov.b32 	{%rs31, %rs32}, %r32;
+	cvt.f32.bf16 	%r281, %rs32;
+	cvt.f32.bf16 	%r282, %rs31;
+	.loc	1 69 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:69:24
+	mul.f32 	%r283, %r276, %r230;
+	mul.f32 	%r284, %r275, %r230;
+	.loc	1 72 24                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:72:24
+	fma.rn.f32 	%r285, %r280, %r284, %r282;
+	fma.rn.f32 	%r286, %r279, %r283, %r281;
+	.loc	1 73 53                         // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:73:53
+	cvt.rn.bf16x2.f32 	%r36, %r286, %r285;
+	// begin inline asm
+	@%p1 st.global.v4.b32 [ %rd8 + 0 ], { %r33, %r34, %r35, %r36 };
+	// end inline asm
+	.loc	1 51 4                          // cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py:51:4
+	ret;
+$L__tmp22:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 343                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x150 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 112
+.b8 103
+.b8 101
+.b8 115
+.b8 122
+.b8 104
+.b8 52
+.b8 110
+.b8 112
+.b8 121
+.b8 110
+.b8 121
+.b8 55
+.b8 117
+.b8 50
+.b8 113
+.b8 120
+.b8 108
+.b8 107
+.b8 116
+.b8 112
+.b8 118
+.b8 50
+.b8 121
+.b8 50
+.b8 120
+.b8 100
+.b8 103
+.b8 103
+.b8 122
+.b8 121
+.b8 108
+.b8 53
+.b8 111
+.b8 112
+.b8 111
+.b8 121
+.b8 51
+.b8 111
+.b8 114
+.b8 117
+.b8 113
+.b8 115
+.b8 113
+.b8 101
+.b8 116
+.b8 52
+.b8 112
+.b8 53
+.b8 101
+.b8 107
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 112
+.b8 103
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x2f DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 97
+.b8 100
+.b8 100
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 110
+.b8 97
+.b8 116
+.b8 105
+.b8 118
+.b8 101
+.b8 95
+.b8 108
+.b8 97
+.b8 121
+.b8 101
+.b8 114
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x113:0x47 DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x128:0x31 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp21                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 47                                  // DW_AT_call_line
+.b8 79                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x140:0x18 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp20                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 243                                 // DW_AT_call_line
+.b8 46                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.source b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..5dace2e3fc4776bcfa7ab8e79ab933d32ddae36f
--- /dev/null
+++ b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.source
@@ -0,0 +1,420 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":18:0)
+#loc72 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":216:0)
+#loc85 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":133:0)
+#loc89 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":242:0)
+#loc91 = loc(unknown)
+#loc94 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":230:0)
+#loc109 = loc("in_ptr0"(#loc))
+#loc110 = loc("in_ptr1"(#loc))
+#loc111 = loc("in_ptr2"(#loc))
+#loc112 = loc("out_ptr2"(#loc))
+#loc113 = loc("xnumel"(#loc))
+#loc114 = loc("r0_numel"(#loc))
+#loc171 = loc("value"(#loc72))
+#loc172 = loc("mean"(#loc72))
+#loc173 = loc("m2"(#loc72))
+#loc174 = loc("weight"(#loc72))
+#loc175 = loc("first_iteration"(#loc72))
+#loc185 = loc("input"(#loc85))
+#loc186 = loc("mean"(#loc89))
+#loc187 = loc("m2"(#loc89))
+#loc188 = loc("weight"(#loc89))
+#loc189 = loc("mean_1"(#loc94))
+#loc190 = loc("m2_1"(#loc94))
+#loc191 = loc("weight_1"(#loc94))
+#loc192 = loc("mean_2"(#loc94))
+#loc193 = loc("m2_2"(#loc94))
+#loc194 = loc("weight_2"(#loc94))
+#loc201 = loc("new_mean"(#loc171))
+module {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 2304 : i32 loc(#loc115)
+    %r0_numel_1 = arith.constant 4096 : i32 loc(#loc116)
+    %xoffset = tt.get_program_id x : i32 loc(#loc117)
+    %xoffset_2 = arith.constant 1 : i32 loc(#loc118)
+    %xoffset_3 = arith.constant 1 : i32 loc(#loc118)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc118)
+    %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc119)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc120)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc121)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc121)
+    %xmask = arith.constant dense<2304> : tensor<1x1xi32> loc(#loc122)
+    %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc122)
+    %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32> loc(#loc123)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32> -> tensor<1x4096xi32> loc(#loc124)
+    %tmp3_mean = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc125)
+    %tmp3_m2 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc126)
+    %tmp3_weight = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc127)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc14)
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc14)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14)
+    %2 = arith.bitcast %c4096_i32 : i32 to i32 loc(#loc14)
+    %3 = ub.poison : i32 loc(#loc14)
+    %tmp3_weight_10:3 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%tmp3_mean_13 = %tmp3_mean, %tmp3_m2_14 = %tmp3_m2, %tmp3_weight_15 = %tmp3_weight) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4096xi32> loc(#loc129)
+      %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x4096xi32> loc(#loc129)
+      %r0_mask = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc130)
+      %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x4096xi32> loc(#loc130)
+      %tmp0 = arith.constant 4096 : i32 loc(#loc131)
+      %tmp0_18 = arith.constant 4096 : i32 loc(#loc131)
+      %tmp0_19 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc131)
+      %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc131)
+      %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc132)
+      %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x4096xi32> loc(#loc132)
+      %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc133)
+      %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc133)
+      %tmp0_25 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc134)
+      %tmp0_26 = arith.andi %r0_mask_17, %tmp0_25 : tensor<1x4096xi1> loc(#loc134)
+      %tmp0_27 = arith.constant 0.000000e+00 : f32 loc(#loc135)
+      %tmp0_28 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc135)
+      %tmp0_29 = arith.truncf %tmp0_28 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc135)
+      %tmp0_30 = tt.load %tmp0_24, %tmp0_26, %tmp0_29 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc135)
+      %tmp0_31 = arith.extf %tmp0_30 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc136)
+      %c0_i32_32 = arith.constant 0 : i32 loc(#loc23)
+      %9 = arith.cmpi eq, %r0_offset, %c0_i32_32 : i32 loc(#loc23)
+      %10:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_u1__(%tmp0_31, %tmp3_mean_13, %tmp3_m2_14, %tmp3_weight_15, %9) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>, i1) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) loc(#loc24)
+      %tmp3_mean_33 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc137)
+      %tmp3_mean_34 = arith.andi %r0_mask_17, %tmp3_mean_33 : tensor<1x4096xi1> loc(#loc137)
+      %tmp3_mean_35 = arith.select %tmp3_mean_34, %10#0, %tmp3_mean_13 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc138)
+      %tmp3_m2_36 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc139)
+      %tmp3_m2_37 = arith.andi %r0_mask_17, %tmp3_m2_36 : tensor<1x4096xi1> loc(#loc139)
+      %tmp3_m2_38 = arith.select %tmp3_m2_37, %10#1, %tmp3_m2_14 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc140)
+      %tmp3_weight_39 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc141)
+      %tmp3_weight_40 = arith.andi %r0_mask_17, %tmp3_weight_39 : tensor<1x4096xi1> loc(#loc141)
+      %tmp3_weight_41 = arith.select %tmp3_weight_40, %10#2, %tmp3_weight_15 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc142)
+      scf.yield %tmp3_mean_35, %tmp3_m2_38, %tmp3_weight_41 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc31)
+    } loc(#loc207)
+    %4:3 = tt.call @"torch._inductor.runtime.triton_helpers.welford__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S__(3,)cconstexpr_1_"(%tmp3_weight_10#0, %tmp3_weight_10#1, %tmp3_weight_10#2) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc32)
+    %tmp3 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc143)
+    %tmp7 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc144)
+    %tmp8 = tt.expand_dims %4#2 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc145)
+    %c0_i32_11 = arith.constant 0 : i32 loc(#loc36)
+    %c4096_i32_12 = arith.constant 4096 : i32 loc(#loc36)
+    %5 = arith.bitcast %c0_i32_11 : i32 to i32 loc(#loc36)
+    %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc36)
+    %7 = arith.bitcast %c4096_i32_12 : i32 to i32 loc(#loc36)
+    %8 = ub.poison : i32 loc(#loc36)
+    scf.for %r0_offset = %5 to %6 step %7  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4096xi32> loc(#loc146)
+      %r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x4096xi32> loc(#loc146)
+      %r0_mask = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc147)
+      %r0_mask_14 = arith.cmpi slt, %r0_index_13, %r0_mask : tensor<1x4096xi32> loc(#loc147)
+      %tmp9 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc148)
+      %tmp9_15 = tt.addptr %tmp9, %r0_index_13 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc148)
+      %tmp9_16 = arith.constant 0.000000e+00 : f32 loc(#loc149)
+      %tmp9_17 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc149)
+      %tmp9_18 = arith.truncf %tmp9_17 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc149)
+      %tmp9_19 = tt.load %tmp9_15, %r0_mask_14, %tmp9_18 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc149)
+      %tmp9_20 = arith.extf %tmp9_19 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc150)
+      %tmp12 = arith.constant 4096 : i32 loc(#loc151)
+      %tmp12_21 = arith.constant 4096 : i32 loc(#loc151)
+      %tmp12_22 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc151)
+      %tmp12_23 = arith.muli %tmp12_22, %xindex_7 : tensor<1x1xi32> loc(#loc151)
+      %tmp12_24 = tt.broadcast %tmp12_23 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc152)
+      %tmp12_25 = arith.addi %r0_index_13, %tmp12_24 : tensor<1x4096xi32> loc(#loc152)
+      %tmp12_26 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc153)
+      %tmp12_27 = tt.addptr %tmp12_26, %tmp12_25 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc153)
+      %tmp12_28 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc154)
+      %tmp12_29 = arith.andi %r0_mask_14, %tmp12_28 : tensor<1x4096xi1> loc(#loc154)
+      %tmp12_30 = arith.constant 0.000000e+00 : f32 loc(#loc155)
+      %tmp12_31 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc155)
+      %tmp12_32 = arith.truncf %tmp12_31 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc155)
+      %tmp12_33 = tt.load %tmp12_27, %tmp12_29, %tmp12_32 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>> loc(#loc155)
+      %tmp12_34 = arith.extf %tmp12_33 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc156)
+      %tmp23 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc157)
+      %tmp23_35 = tt.addptr %tmp23, %r0_index_13 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc157)
+      %tmp23_36 = arith.constant 0.000000e+00 : f32 loc(#loc158)
+      %tmp23_37 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc158)
+      %tmp23_38 = arith.truncf %tmp23_37 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc158)
+      %tmp23_39 = tt.load %tmp23_35, %r0_mask_14, %tmp23_38 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc158)
+      %tmp23_40 = arith.extf %tmp23_39 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc159)
+      %tmp10 = arith.constant 1.000000e+00 : f32 loc(#loc160)
+      %tmp11 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc161)
+      %tmp11_41 = arith.addf %tmp9_20, %tmp11 : tensor<1x4096xf32> loc(#loc161)
+      %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc162)
+      %tmp14_42 = arith.subf %tmp12_34, %tmp14 : tensor<1x4096xf32> loc(#loc162)
+      %tmp15 = arith.constant 4.096000e+03 : f32 loc(#loc163)
+      %tmp16 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc164)
+      %tmp16_43 = arith.divf %tmp7, %tmp16 : tensor<1x1xf32> loc(#loc164)
+      %tmp17 = arith.constant 9.99999997E-7 : f32 loc(#loc165)
+      %tmp18 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc166)
+      %tmp18_44 = arith.addf %tmp16_43, %tmp18 : tensor<1x1xf32> loc(#loc166)
+      %tmp19 = tt.extern_elementwise %tmp18_44 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc167)
+      %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc168)
+      %tmp20_45 = arith.mulf %tmp14_42, %tmp20 : tensor<1x4096xf32> loc(#loc168)
+      %tmp22 = arith.mulf %tmp11_41, %tmp20_45 : tensor<1x4096xf32> loc(#loc169)
+      %tmp24 = arith.addf %tmp22, %tmp23_40 : tensor<1x4096xf32> loc(#loc170)
+      %c4096_i32_46 = arith.constant 4096 : i32 loc(#loc62)
+      %c4096_i32_47 = arith.constant 4096 : i32 loc(#loc62)
+      %cst = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc62)
+      %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc62)
+      %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x4096xi32> loc(#loc63)
+      %11 = arith.addi %r0_index_13, %10 : tensor<1x4096xi32> loc(#loc63)
+      %12 = tt.splat %out_ptr2 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc64)
+      %13 = tt.addptr %12, %11 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc64)
+      %14 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x4096xi1> loc(#loc65)
+      %15 = arith.andi %r0_mask_14, %14 : tensor<1x4096xi1> loc(#loc65)
+      %16 = arith.truncf %tmp24 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc66)
+      tt.store %13, %16, %15 : tensor<1x4096x!tt.ptr<bf16>> loc(#loc66)
+    } loc(#loc36)
+    tt.return loc(#loc67)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() -> tensor<1x4096xf32> attributes {noinline = false} {
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc69)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc69)
+    tt.return %cst_0 : tensor<1x4096xf32> loc(#loc70)
+  ^bb1:  // no predecessors
+    %0 = ub.poison : tensor<1x4096xf32> loc(#loc71)
+    tt.return %0 : tensor<1x4096xf32> loc(#loc71)
+  } loc(#loc68)
+  tt.func private @torch._inductor.runtime.triton_helpers.welford_reduce__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_fp32S1_4096S_u1__(%new_mean: tensor<1x4096xf32> loc("new_mean"(#loc171)), %mean: tensor<1x4096xf32> loc("mean"(#loc72)), %m2: tensor<1x4096xf32> loc("m2"(#loc72)), %weight: tensor<1x4096xf32> loc("weight"(#loc72)), %first_iteration: i1 loc("first_iteration"(#loc72))) -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) attributes {noinline = false} {
+    %0:3 = scf.if %first_iteration -> (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) {
+      %new_weight = arith.constant 1.000000e+00 : f32 loc(#loc176)
+      %new_weight_0 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc202)
+      %new_m2 = tt.call @triton.language.standard.zeros_like__fp32S1_4096S__(%m2) : (tensor<1x4096xf32>) -> tensor<1x4096xf32> loc(#loc203)
+      scf.yield %new_m2, %new_mean, %new_weight_0 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc203)
+    } else {
+      %delta = arith.subf %new_mean, %mean : tensor<1x4096xf32> loc(#loc178)
+      %new_weight = arith.constant 1 : i32 loc(#loc179)
+      %new_weight_0 = arith.constant 1.000000e+00 : f32 loc(#loc179)
+      %new_weight_1 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc179)
+      %new_weight_2 = arith.addf %weight, %new_weight_1 : tensor<1x4096xf32> loc(#loc204)
+      %new_mean_3 = arith.divf %delta, %new_weight_2 : tensor<1x4096xf32> loc(#loc180)
+      %new_mean_4 = arith.addf %mean, %new_mean_3 : tensor<1x4096xf32> loc(#loc205)
+      %new_m2 = arith.subf %new_mean, %new_mean_4 : tensor<1x4096xf32> loc(#loc182)
+      %new_m2_5 = arith.mulf %delta, %new_m2 : tensor<1x4096xf32> loc(#loc183)
+      %new_m2_6 = arith.addf %m2, %new_m2_5 : tensor<1x4096xf32> loc(#loc206)
+      scf.yield %new_m2_6, %new_mean_4, %new_weight_2 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc184)
+    } loc(#loc73)
+    tt.return %0#1, %0#0, %0#2 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc83)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1x4096xf32> loc(#loc84)
+    %2 = ub.poison : tensor<1x4096xf32> loc(#loc84)
+    %3 = ub.poison : tensor<1x4096xf32> loc(#loc84)
+    tt.return %1, %2, %3 : tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32> loc(#loc84)
+  } loc(#loc72)
+  tt.func private @triton.language.standard.zeros_like__fp32S1_4096S__(%input: tensor<1x4096xf32> loc("input"(#loc85))) -> tensor<1x4096xf32> attributes {noinline = false} {
+    %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_4096__(1,)cconstexpr_fp32_"() : () -> tensor<1x4096xf32> loc(#loc86)
+    tt.return %0 : tensor<1x4096xf32> loc(#loc87)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1x4096xf32> loc(#loc88)
+    tt.return %1 : tensor<1x4096xf32> loc(#loc88)
+  } loc(#loc85)
+  tt.func private @"torch._inductor.runtime.triton_helpers.welford__fp32S1_4096S_fp32S1_4096S_fp32S1_4096S__(3,)cconstexpr_1_"(%mean: tensor<1x4096xf32> loc("mean"(#loc89)), %m2: tensor<1x4096xf32> loc("m2"(#loc89)), %weight: tensor<1x4096xf32> loc("weight"(#loc89))) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} {
+    %0:3 = "tt.reduce"(%mean, %m2, %weight) <{axis = 1 : i32}> ({
+    ^bb0(%arg3: f32 loc(unknown), %arg4: f32 loc(unknown), %arg5: f32 loc(unknown), %arg6: f32 loc(unknown), %arg7: f32 loc(unknown), %arg8: f32 loc(unknown)):
+      %4:3 = tt.call @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (f32, f32, f32, f32, f32, f32) -> (f32, f32, f32) loc(#loc90)
+      tt.reduce.return %4#0, %4#1, %4#2 : f32, f32, f32 loc(#loc90)
+    }) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc90)
+    tt.return %0#0, %0#1, %0#2 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc92)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<1xf32> loc(#loc93)
+    %2 = ub.poison : tensor<1xf32> loc(#loc93)
+    %3 = ub.poison : tensor<1xf32> loc(#loc93)
+    tt.return %1, %2, %3 : tensor<1xf32>, tensor<1xf32>, tensor<1xf32> loc(#loc93)
+  } loc(#loc89)
+  tt.func private @torch._inductor.runtime.triton_helpers.welford_combine__fp32_fp32_fp32_fp32_fp32_fp32__(%mean_1: f32 loc("mean_1"(#loc94)), %m2_1: f32 loc("m2_1"(#loc94)), %weight_1: f32 loc("weight_1"(#loc94)), %mean_2: f32 loc("mean_2"(#loc94)), %m2_2: f32 loc("m2_2"(#loc94)), %weight_2: f32 loc("weight_2"(#loc94))) -> (f32, f32, f32) attributes {noinline = false} {
+    %delta = arith.subf %mean_2, %mean_1 : f32 loc(#loc195)
+    %new_weight = arith.addf %weight_1, %weight_2 : f32 loc(#loc196)
+    %w2_over_w = arith.constant 0.000000e+00 : f32 loc(#loc197)
+    %w2_over_w_0 = arith.cmpf oeq, %new_weight, %w2_over_w : f32 loc(#loc197)
+    %w2_over_w_1 = arith.divf %weight_2, %new_weight : f32 loc(#loc198)
+    %w2_over_w_2 = arith.constant 0.000000e+00 : f32 loc(#loc199)
+    %w2_over_w_3 = arith.constant 0.000000e+00 : f32 loc(#loc199)
+    %w2_over_w_4 = arith.select %w2_over_w_0, %w2_over_w_3, %w2_over_w_1 : f32 loc(#loc199)
+    %0 = arith.mulf %delta, %w2_over_w_4 : f32 loc(#loc100)
+    %1 = arith.addf %mean_1, %0 : f32 loc(#loc101)
+    %2 = arith.addf %m2_1, %m2_2 : f32 loc(#loc102)
+    %3 = arith.mulf %delta, %delta : f32 loc(#loc103)
+    %4 = arith.mulf %3, %weight_1 : f32 loc(#loc104)
+    %5 = arith.mulf %4, %w2_over_w_4 : f32 loc(#loc105)
+    %6 = arith.addf %2, %5 : f32 loc(#loc106)
+    tt.return %1, %6, %new_weight : f32, f32, f32 loc(#loc107)
+  ^bb1:  // no predecessors
+    %7 = ub.poison : f32 loc(#loc108)
+    %8 = ub.poison : f32 loc(#loc108)
+    %9 = ub.poison : f32 loc(#loc108)
+    tt.return %7, %8, %9 : f32, f32, f32 loc(#loc108)
+  } loc(#loc94)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":25:21)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":29:45)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":30:43)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":31:47)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:46)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:34)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:61)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:51)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:112)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":42:62)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":42:51)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":44:39)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":44:62)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":45:37)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":45:58)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":46:41)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":46:66)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":46:8)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":47:79)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":48:16)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":49:16)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":50:16)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":51:43)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":52:31)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":53:29)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:34)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:41)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:94)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:47)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:42)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:35)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:62)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:52)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:114)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:35)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:42)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:95)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":60:16)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":61:23)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":63:24)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":64:16)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":65:24)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":66:16)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":67:24)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":68:32)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":69:24)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":71:24)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":72:24)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:41)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:36)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:29)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:63)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:53)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":51:4)
+#loc68 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":120:0)
+#loc69 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:31)
+#loc70 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:11)
+#loc71 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":129:4)
+#loc73 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":217:7)
+#loc74 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":218:46)
+#loc75 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":220:31)
+#loc76 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":222:24)
+#loc77 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":223:30)
+#loc78 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:34)
+#loc79 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":224:26)
+#loc80 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:39)
+#loc81 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:31)
+#loc82 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":225:22)
+#loc83 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:11)
+#loc84 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":226:4)
+#loc86 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:30)
+#loc87 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:11)
+#loc88 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":140:4)
+#loc90 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc92 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:11)
+#loc93 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:4)
+#loc95 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc96 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc97 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc98 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc99 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc100 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc101 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc102 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc103 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc104 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc105 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc106 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc107 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:11)
+#loc108 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":234:4)
+#loc115 = loc("xnumel"(#loc1))
+#loc116 = loc("r0_numel"(#loc2))
+#loc117 = loc("xoffset"(#loc3))
+#loc118 = loc("xoffset"(#loc4))
+#loc119 = loc("xindex"(#loc5))
+#loc120 = loc("xindex"(#loc6))
+#loc121 = loc("xindex"(#loc7))
+#loc122 = loc("xmask"(#loc8))
+#loc123 = loc("r0_base"(#loc9))
+#loc124 = loc("r0_base"(#loc10))
+#loc125 = loc("tmp3_mean"(#loc11))
+#loc126 = loc("tmp3_m2"(#loc12))
+#loc127 = loc("tmp3_weight"(#loc13))
+#loc128 = loc("tmp3_mean"(#loc14))
+#loc129 = loc("r0_index"(#loc15))
+#loc130 = loc("r0_mask"(#loc16))
+#loc131 = loc("tmp0"(#loc17))
+#loc132 = loc("tmp0"(#loc18))
+#loc133 = loc("tmp0"(#loc19))
+#loc134 = loc("tmp0"(#loc20))
+#loc135 = loc("tmp0"(#loc21))
+#loc136 = loc("tmp0"(#loc22))
+#loc137 = loc("tmp3_mean"(#loc25))
+#loc138 = loc("tmp3_mean"(#loc26))
+#loc139 = loc("tmp3_m2"(#loc27))
+#loc140 = loc("tmp3_m2"(#loc28))
+#loc141 = loc("tmp3_weight"(#loc29))
+#loc142 = loc("tmp3_weight"(#loc30))
+#loc143 = loc("tmp3"(#loc33))
+#loc144 = loc("tmp7"(#loc34))
+#loc145 = loc("tmp8"(#loc35))
+#loc146 = loc("r0_index"(#loc37))
+#loc147 = loc("r0_mask"(#loc38))
+#loc148 = loc("tmp9"(#loc39))
+#loc149 = loc("tmp9"(#loc40))
+#loc150 = loc("tmp9"(#loc41))
+#loc151 = loc("tmp12"(#loc42))
+#loc152 = loc("tmp12"(#loc43))
+#loc153 = loc("tmp12"(#loc44))
+#loc154 = loc("tmp12"(#loc45))
+#loc155 = loc("tmp12"(#loc46))
+#loc156 = loc("tmp12"(#loc47))
+#loc157 = loc("tmp23"(#loc48))
+#loc158 = loc("tmp23"(#loc49))
+#loc159 = loc("tmp23"(#loc50))
+#loc160 = loc("tmp10"(#loc51))
+#loc161 = loc("tmp11"(#loc52))
+#loc162 = loc("tmp14"(#loc53))
+#loc163 = loc("tmp15"(#loc54))
+#loc164 = loc("tmp16"(#loc55))
+#loc165 = loc("tmp17"(#loc56))
+#loc166 = loc("tmp18"(#loc57))
+#loc167 = loc("tmp19"(#loc58))
+#loc168 = loc("tmp20"(#loc59))
+#loc169 = loc("tmp22"(#loc60))
+#loc170 = loc("tmp24"(#loc61))
+#loc176 = loc("new_weight"(#loc74))
+#loc177 = loc("new_m2"(#loc75))
+#loc178 = loc("delta"(#loc76))
+#loc179 = loc("new_weight"(#loc77))
+#loc180 = loc("new_mean"(#loc78))
+#loc181 = loc("new_mean"(#loc79))
+#loc182 = loc("new_m2"(#loc80))
+#loc183 = loc("new_m2"(#loc81))
+#loc184 = loc("new_m2"(#loc82))
+#loc195 = loc("delta"(#loc95))
+#loc196 = loc("new_weight"(#loc96))
+#loc197 = loc("w2_over_w"(#loc97))
+#loc198 = loc("w2_over_w"(#loc98))
+#loc199 = loc("w2_over_w"(#loc99))
+#loc200 = loc("tmp3_m2"(#loc128))
+#loc202 = loc("new_weight"(#loc176))
+#loc203 = loc("new_m2"(#loc177))
+#loc204 = loc("new_weight"(#loc179))
+#loc205 = loc("new_mean"(#loc181))
+#loc206 = loc("new_m2"(#loc184))
+#loc207 = loc("tmp3_weight"(#loc200))
diff --git a/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.ttgir b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..08b835b3f97fa08461779a0ccc9ff5ce21bc5b61
--- /dev/null
+++ b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.ttgir
@@ -0,0 +1,179 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":18:0)
+#loc1 = loc(unknown)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":47:79)
+#loc49 = loc("in_ptr0"(#loc))
+#loc50 = loc("in_ptr1"(#loc))
+#loc51 = loc("in_ptr2"(#loc))
+#loc52 = loc("out_ptr2"(#loc))
+#loc53 = loc("xnumel"(#loc))
+#loc54 = loc("r0_numel"(#loc))
+#loc68 = loc(callsite(#loc1 at #loc15))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4096> : tensor<1x4096xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xbf16, #blocked> loc(#loc1)
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
+    %c2304_i32 = arith.constant 2304 : i32 loc(#loc1)
+    %cst_1 = arith.constant 0.000000e+00 : f32 loc(#loc1)
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<4.096000e+03> : tensor<1x1xf32, #blocked> loc(#loc1)
+    %cst_5 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc55)
+    %xmask = arith.cmpi slt, %xoffset, %c2304_i32 : i32 loc(#loc56)
+    %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc57)
+    %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4096xi32, #blocked> loc(#loc57)
+    %r0_mask = arith.cmpi slt, %r0_base_6, %cst : tensor<1x4096xi32, #blocked> loc(#loc58)
+    %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc59)
+    %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x4096xi32, #blocked> loc(#loc92)
+    %tmp0_8 = arith.addi %r0_base_6, %tmp0_7 : tensor<1x4096xi32, #blocked> loc(#loc60)
+    %tmp0_9 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc61)
+    %tmp0_10 = tt.addptr %tmp0_9, %tmp0_8 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc61)
+    %tmp0_11 = tt.splat %xmask : i1 -> tensor<1x4096xi1, #blocked> loc(#loc93)
+    %tmp0_12 = arith.andi %r0_mask, %tmp0_11 : tensor<1x4096xi1, #blocked> loc(#loc62)
+    %tmp0_13 = tt.load %tmp0_10, %tmp0_12, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc63)
+    %tmp0_14 = arith.extf %tmp0_13 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc64)
+    %tmp3_mean = arith.select %tmp0_12, %tmp0_14, %cst_2 : tensor<1x4096xi1, #blocked>, tensor<1x4096xf32, #blocked> loc(#loc65)
+    %tmp3_weight = arith.select %tmp0_12, %cst_5, %cst_2 : tensor<1x4096xi1, #blocked>, tensor<1x4096xf32, #blocked> loc(#loc66)
+    %0:3 = "tt.reduce"(%tmp3_mean, %cst_2, %tmp3_weight) <{axis = 1 : i32}> ({
+    ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc15)), %arg7: f32 loc(callsite(#loc1 at #loc15)), %arg8: f32 loc(callsite(#loc1 at #loc15)), %arg9: f32 loc(callsite(#loc1 at #loc15)), %arg10: f32 loc(callsite(#loc1 at #loc15)), %arg11: f32 loc(callsite(#loc1 at #loc15))):
+      %delta = arith.subf %arg9, %arg6 : f32 loc(#loc94)
+      %new_weight = arith.addf %arg8, %arg11 : f32 loc(#loc95)
+      %w2_over_w = arith.cmpf oeq, %new_weight, %cst_1 : f32 loc(#loc96)
+      %w2_over_w_24 = arith.divf %arg11, %new_weight : f32 loc(#loc97)
+      %w2_over_w_25 = arith.select %w2_over_w, %cst_1, %w2_over_w_24 : f32 loc(#loc98)
+      %4 = arith.mulf %delta, %w2_over_w_25 : f32 loc(#loc99)
+      %5 = arith.addf %arg6, %4 : f32 loc(#loc100)
+      %6 = arith.addf %arg7, %arg10 : f32 loc(#loc101)
+      %7 = arith.mulf %delta, %delta : f32 loc(#loc102)
+      %8 = arith.mulf %7, %arg8 : f32 loc(#loc103)
+      %9 = arith.mulf %8, %w2_over_w_25 : f32 loc(#loc104)
+      %10 = arith.addf %6, %9 : f32 loc(#loc105)
+      tt.reduce.return %5, %10, %new_weight : f32, f32, f32 loc(#loc67)
+    }) : (tensor<1x4096xf32, #blocked>, tensor<1x4096xf32, #blocked>, tensor<1x4096xf32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc67)
+    %tmp3 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc74)
+    %tmp7 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc75)
+    %tmp9 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc76)
+    %tmp9_15 = tt.addptr %tmp9, %r0_base_6 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc76)
+    %tmp9_16 = tt.load %tmp9_15, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc77)
+    %tmp9_17 = arith.extf %tmp9_16 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc78)
+    %tmp12 = tt.load %tmp0_10, %tmp0_12, %cst_0 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc79)
+    %tmp12_18 = arith.extf %tmp12 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc80)
+    %tmp23 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc81)
+    %tmp23_19 = tt.addptr %tmp23, %r0_base_6 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc81)
+    %tmp23_20 = tt.load %tmp23_19, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc82)
+    %tmp23_21 = arith.extf %tmp23_20 : tensor<1x4096xbf16, #blocked> to tensor<1x4096xf32, #blocked> loc(#loc83)
+    %tmp11 = arith.addf %tmp9_17, %cst_5 : tensor<1x4096xf32, #blocked> loc(#loc84)
+    %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32, #blocked> -> tensor<1x4096xf32, #blocked> loc(#loc85)
+    %tmp14_22 = arith.subf %tmp12_18, %tmp14 : tensor<1x4096xf32, #blocked> loc(#loc85)
+    %tmp16 = arith.divf %tmp7, %cst_4 : tensor<1x1xf32, #blocked> loc(#loc86)
+    %tmp18 = arith.addf %tmp16, %cst_3 : tensor<1x1xf32, #blocked> loc(#loc87)
+    %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked>) -> tensor<1x1xf32, #blocked> loc(#loc88)
+    %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32, #blocked> -> tensor<1x4096xf32, #blocked> loc(#loc89)
+    %tmp20_23 = arith.mulf %tmp14_22, %tmp20 : tensor<1x4096xf32, #blocked> loc(#loc89)
+    %tmp22 = arith.mulf %tmp11, %tmp20_23 : tensor<1x4096xf32, #blocked> loc(#loc90)
+    %tmp24 = arith.addf %tmp22, %tmp23_21 : tensor<1x4096xf32, #blocked> loc(#loc91)
+    %1 = tt.splat %out_ptr2 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc46)
+    %2 = tt.addptr %1, %tmp0_8 : tensor<1x4096x!tt.ptr<bf16>, #blocked>, tensor<1x4096xi32, #blocked> loc(#loc46)
+    %3 = arith.truncf %tmp24 : tensor<1x4096xf32, #blocked> to tensor<1x4096xbf16, #blocked> loc(#loc47)
+    tt.store %2, %3, %tmp0_12 : tensor<1x4096x!tt.ptr<bf16>, #blocked> loc(#loc47)
+    tt.return loc(#loc48)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":25:21)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":26:37)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":34:29)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:46)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:41)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:34)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:61)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:51)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:112)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":44:62)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":46:66)
+#loc14 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc16 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc17 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc18 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc19 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc21 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc23 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":48:16)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":49:16)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:34)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:41)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:94)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:52)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:114)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:35)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:42)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:95)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":61:23)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":63:24)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":65:24)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":67:24)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":68:32)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":69:24)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":71:24)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":72:24)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:29)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:53)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":51:4)
+#loc55 = loc("xoffset"(#loc2))
+#loc56 = loc("xmask"(#loc3))
+#loc57 = loc("r0_base"(#loc4))
+#loc58 = loc("r0_mask"(#loc5))
+#loc59 = loc("tmp0"(#loc6))
+#loc60 = loc("tmp0"(#loc7))
+#loc61 = loc("tmp0"(#loc8))
+#loc62 = loc("tmp0"(#loc9))
+#loc63 = loc("tmp0"(#loc10))
+#loc64 = loc("tmp0"(#loc11))
+#loc65 = loc("tmp3_mean"(#loc12))
+#loc66 = loc("tmp3_weight"(#loc13))
+#loc67 = loc(callsite(#loc14 at #loc15))
+#loc69 = loc("delta"(#loc16))
+#loc70 = loc("new_weight"(#loc17))
+#loc71 = loc("w2_over_w"(#loc18))
+#loc72 = loc("w2_over_w"(#loc19))
+#loc73 = loc("w2_over_w"(#loc20))
+#loc74 = loc("tmp3"(#loc28))
+#loc75 = loc("tmp7"(#loc29))
+#loc76 = loc("tmp9"(#loc30))
+#loc77 = loc("tmp9"(#loc31))
+#loc78 = loc("tmp9"(#loc32))
+#loc79 = loc("tmp12"(#loc33))
+#loc80 = loc("tmp12"(#loc34))
+#loc81 = loc("tmp23"(#loc35))
+#loc82 = loc("tmp23"(#loc36))
+#loc83 = loc("tmp23"(#loc37))
+#loc84 = loc("tmp11"(#loc38))
+#loc85 = loc("tmp14"(#loc39))
+#loc86 = loc("tmp16"(#loc40))
+#loc87 = loc("tmp18"(#loc41))
+#loc88 = loc("tmp19"(#loc42))
+#loc89 = loc("tmp20"(#loc43))
+#loc90 = loc("tmp22"(#loc44))
+#loc91 = loc("tmp24"(#loc45))
+#loc92 = loc(fused[#loc60, #loc59])
+#loc93 = loc(fused[#loc62, #loc56])
+#loc94 = loc(callsite(#loc69 at #loc67))
+#loc95 = loc(callsite(#loc70 at #loc67))
+#loc96 = loc(callsite(#loc71 at #loc67))
+#loc97 = loc(callsite(#loc72 at #loc67))
+#loc98 = loc(callsite(#loc73 at #loc67))
+#loc99 = loc(callsite(#loc21 at #loc67))
+#loc100 = loc(callsite(#loc22 at #loc67))
+#loc101 = loc(callsite(#loc23 at #loc67))
+#loc102 = loc(callsite(#loc24 at #loc67))
+#loc103 = loc(callsite(#loc25 at #loc67))
+#loc104 = loc(callsite(#loc26 at #loc67))
+#loc105 = loc(callsite(#loc27 at #loc67))
diff --git a/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.ttir b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..32b961da3a7b69dd7829b992fb0b6950a2240263
--- /dev/null
+++ b/triton/U2Y4PVYJWFSKZDU4AWOF4HOYPDZZL3MFTQLITJAO62J7FJKLEL7A/triton_red_fused_add_mul_native_layer_norm_0.ttir
@@ -0,0 +1,180 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":18:0)
+#loc1 = loc(unknown)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":47:79)
+#loc50 = loc("in_ptr0"(#loc))
+#loc51 = loc("in_ptr1"(#loc))
+#loc52 = loc("in_ptr2"(#loc))
+#loc53 = loc("out_ptr2"(#loc))
+#loc54 = loc("xnumel"(#loc))
+#loc55 = loc("r0_numel"(#loc))
+#loc57 = loc(callsite(#loc1 at #loc3))
+module {
+  tt.func public @triton_red_fused_add_mul_native_layer_norm_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %c4096_i32 = arith.constant 4096 : i32 loc(#loc1)
+    %xmask = arith.constant 2304 : i32 loc(#loc56)
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc57)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x4096xf32> loc(#loc1)
+    %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x4096xbf16> loc(#loc1)
+    %cst_2 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc1)
+    %cst_3 = arith.constant dense<4.096000e+03> : tensor<1x1xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<1.000000e+00> : tensor<1x4096xf32> loc(#loc1)
+    %cst_5 = arith.constant dense<4096> : tensor<1x4096xi32> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc58)
+    %xmask_6 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc56)
+    %r0_base = tt.make_range {end = 4096 : i32, start = 0 : i32} : tensor<4096xi32> loc(#loc59)
+    %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4096xi32> -> tensor<1x4096xi32> loc(#loc60)
+    %r0_mask = arith.cmpi slt, %r0_base_7, %cst_5 : tensor<1x4096xi32> loc(#loc61)
+    %tmp0 = arith.muli %xoffset, %c4096_i32 : i32 loc(#loc62)
+    %tmp0_8 = tt.splat %tmp0 : i32 -> tensor<1x4096xi32> loc(#loc94)
+    %tmp0_9 = arith.addi %r0_base_7, %tmp0_8 : tensor<1x4096xi32> loc(#loc63)
+    %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc64)
+    %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc64)
+    %tmp0_12 = tt.splat %xmask_6 : i1 -> tensor<1x4096xi1> loc(#loc95)
+    %tmp0_13 = arith.andi %r0_mask, %tmp0_12 : tensor<1x4096xi1> loc(#loc65)
+    %tmp0_14 = tt.load %tmp0_11, %tmp0_13, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc66)
+    %tmp0_15 = arith.extf %tmp0_14 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc67)
+    %tmp3_mean = arith.select %tmp0_13, %tmp0_15, %cst_0 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc68)
+    %tmp3_weight = arith.select %tmp0_13, %cst_4, %cst_0 : tensor<1x4096xi1>, tensor<1x4096xf32> loc(#loc69)
+    %0:3 = "tt.reduce"(%tmp3_mean, %cst_0, %tmp3_weight) <{axis = 1 : i32}> ({
+    ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc3)), %arg7: f32 loc(callsite(#loc1 at #loc3)), %arg8: f32 loc(callsite(#loc1 at #loc3)), %arg9: f32 loc(callsite(#loc1 at #loc3)), %arg10: f32 loc(callsite(#loc1 at #loc3)), %arg11: f32 loc(callsite(#loc1 at #loc3))):
+      %delta = arith.subf %arg9, %arg6 : f32 loc(#loc96)
+      %new_weight = arith.addf %arg8, %arg11 : f32 loc(#loc97)
+      %w2_over_w = arith.cmpf oeq, %new_weight, %cst : f32 loc(#loc98)
+      %w2_over_w_25 = arith.divf %arg11, %new_weight : f32 loc(#loc99)
+      %w2_over_w_26 = arith.select %w2_over_w, %cst, %w2_over_w_25 : f32 loc(#loc100)
+      %4 = arith.mulf %delta, %w2_over_w_26 : f32 loc(#loc101)
+      %5 = arith.addf %arg6, %4 : f32 loc(#loc102)
+      %6 = arith.addf %arg7, %arg10 : f32 loc(#loc103)
+      %7 = arith.mulf %delta, %delta : f32 loc(#loc104)
+      %8 = arith.mulf %7, %arg8 : f32 loc(#loc105)
+      %9 = arith.mulf %8, %w2_over_w_26 : f32 loc(#loc106)
+      %10 = arith.addf %6, %9 : f32 loc(#loc107)
+      tt.reduce.return %5, %10, %new_weight : f32, f32, f32 loc(#loc70)
+    }) : (tensor<1x4096xf32>, tensor<1x4096xf32>, tensor<1x4096xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) loc(#loc70)
+    %tmp3 = tt.expand_dims %0#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc76)
+    %tmp7 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc77)
+    %tmp9 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc78)
+    %tmp9_16 = tt.addptr %tmp9, %r0_base_7 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc78)
+    %tmp9_17 = tt.load %tmp9_16, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc79)
+    %tmp9_18 = arith.extf %tmp9_17 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc80)
+    %tmp12 = tt.load %tmp0_11, %tmp0_13, %cst_1 evictionPolicy = evict_first : tensor<1x4096x!tt.ptr<bf16>> loc(#loc81)
+    %tmp12_19 = arith.extf %tmp12 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc82)
+    %tmp23 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc83)
+    %tmp23_20 = tt.addptr %tmp23, %r0_base_7 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc83)
+    %tmp23_21 = tt.load %tmp23_20, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x4096x!tt.ptr<bf16>> loc(#loc84)
+    %tmp23_22 = arith.extf %tmp23_21 : tensor<1x4096xbf16> to tensor<1x4096xf32> loc(#loc85)
+    %tmp11 = arith.addf %tmp9_18, %cst_4 : tensor<1x4096xf32> loc(#loc86)
+    %tmp14 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc87)
+    %tmp14_23 = arith.subf %tmp12_19, %tmp14 : tensor<1x4096xf32> loc(#loc87)
+    %tmp16 = arith.divf %tmp7, %cst_3 : tensor<1x1xf32> loc(#loc88)
+    %tmp18 = arith.addf %tmp16, %cst_2 : tensor<1x1xf32> loc(#loc89)
+    %tmp19 = tt.extern_elementwise %tmp18 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc90)
+    %tmp20 = tt.broadcast %tmp19 : tensor<1x1xf32> -> tensor<1x4096xf32> loc(#loc91)
+    %tmp20_24 = arith.mulf %tmp14_23, %tmp20 : tensor<1x4096xf32> loc(#loc91)
+    %tmp22 = arith.mulf %tmp11, %tmp20_24 : tensor<1x4096xf32> loc(#loc92)
+    %tmp24 = arith.addf %tmp22, %tmp23_22 : tensor<1x4096xf32> loc(#loc93)
+    %1 = tt.splat %out_ptr2 : !tt.ptr<bf16> -> tensor<1x4096x!tt.ptr<bf16>> loc(#loc47)
+    %2 = tt.addptr %1, %tmp0_9 : tensor<1x4096x!tt.ptr<bf16>>, tensor<1x4096xi32> loc(#loc47)
+    %3 = arith.truncf %tmp24 : tensor<1x4096xf32> to tensor<1x4096xbf16> loc(#loc48)
+    tt.store %2, %3, %tmp0_13 : tensor<1x4096x!tt.ptr<bf16>> loc(#loc48)
+    tt.return loc(#loc49)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":25:21)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":23:28)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":26:27)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":34:29)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:41)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:34)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:61)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:51)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":38:112)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":44:62)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":46:66)
+#loc16 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":243:46)
+#loc17 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":231:21)
+#loc18 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":232:28)
+#loc19 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:39)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:60)
+#loc21 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":233:49)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:25)
+#loc23 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":235:17)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:15)
+#loc25 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:30)
+#loc26 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:38)
+#loc27 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:49)
+#loc28 = loc("/usr/local/lib/python3.12/dist-packages/torch/_inductor/runtime/triton_helpers.py":236:22)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":48:16)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":49:16)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:34)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:41)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":57:94)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:52)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":58:114)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:35)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:42)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":59:95)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":61:23)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":63:24)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":65:24)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":67:24)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":68:32)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":69:24)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":71:24)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":72:24)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:29)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":73:53)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/pg/cpgeszh4npyny7u2qxlktpv2y2xdggzyl5opoy3oruqsqet4p5ek.py":51:4)
+#loc56 = loc("xmask"(#loc2))
+#loc58 = loc("xoffset"(#loc4))
+#loc59 = loc("r0_base"(#loc5))
+#loc60 = loc("r0_base"(#loc6))
+#loc61 = loc("r0_mask"(#loc7))
+#loc62 = loc("tmp0"(#loc8))
+#loc63 = loc("tmp0"(#loc9))
+#loc64 = loc("tmp0"(#loc10))
+#loc65 = loc("tmp0"(#loc11))
+#loc66 = loc("tmp0"(#loc12))
+#loc67 = loc("tmp0"(#loc13))
+#loc68 = loc("tmp3_mean"(#loc14))
+#loc69 = loc("tmp3_weight"(#loc15))
+#loc70 = loc(callsite(#loc16 at #loc3))
+#loc71 = loc("delta"(#loc17))
+#loc72 = loc("new_weight"(#loc18))
+#loc73 = loc("w2_over_w"(#loc19))
+#loc74 = loc("w2_over_w"(#loc20))
+#loc75 = loc("w2_over_w"(#loc21))
+#loc76 = loc("tmp3"(#loc29))
+#loc77 = loc("tmp7"(#loc30))
+#loc78 = loc("tmp9"(#loc31))
+#loc79 = loc("tmp9"(#loc32))
+#loc80 = loc("tmp9"(#loc33))
+#loc81 = loc("tmp12"(#loc34))
+#loc82 = loc("tmp12"(#loc35))
+#loc83 = loc("tmp23"(#loc36))
+#loc84 = loc("tmp23"(#loc37))
+#loc85 = loc("tmp23"(#loc38))
+#loc86 = loc("tmp11"(#loc39))
+#loc87 = loc("tmp14"(#loc40))
+#loc88 = loc("tmp16"(#loc41))
+#loc89 = loc("tmp18"(#loc42))
+#loc90 = loc("tmp19"(#loc43))
+#loc91 = loc("tmp20"(#loc44))
+#loc92 = loc("tmp22"(#loc45))
+#loc93 = loc("tmp24"(#loc46))
+#loc94 = loc(fused[#loc63, #loc62])
+#loc95 = loc(fused[#loc65, #loc56])
+#loc96 = loc(callsite(#loc71 at #loc70))
+#loc97 = loc(callsite(#loc72 at #loc70))
+#loc98 = loc(callsite(#loc73 at #loc70))
+#loc99 = loc(callsite(#loc74 at #loc70))
+#loc100 = loc(callsite(#loc75 at #loc70))
+#loc101 = loc(callsite(#loc22 at #loc70))
+#loc102 = loc(callsite(#loc23 at #loc70))
+#loc103 = loc(callsite(#loc24 at #loc70))
+#loc104 = loc(callsite(#loc25 at #loc70))
+#loc105 = loc(callsite(#loc26 at #loc70))
+#loc106 = loc(callsite(#loc27 at #loc70))
+#loc107 = loc(callsite(#loc28 at #loc70))
diff --git a/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/__grp__triton_poi_fused_clone_0.json b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/__grp__triton_poi_fused_clone_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..00f5f4ae3b359a6f1e1a5caeeb65863755d07e03
--- /dev/null
+++ b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/__grp__triton_poi_fused_clone_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused_clone_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.source", "triton_poi_fused_clone_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.ttir", "triton_poi_fused_clone_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.ttgir", "triton_poi_fused_clone_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.llir", "triton_poi_fused_clone_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.ptx", "triton_poi_fused_clone_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.cubin", "triton_poi_fused_clone_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.json"}}
\ No newline at end of file
diff --git a/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.cubin b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..87b37bbcc3394d686d041ad40b534f5d75aeb67b
Binary files /dev/null and b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.cubin differ
diff --git a/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.json b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..3acd55742ff19652520a7886010c6d718f73b442
--- /dev/null
+++ b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.json
@@ -0,0 +1 @@
+{"hash": "a4acc9fcc6d6e9284a78e05a9a5734fb2f107375a86d28af0550fe3534e9f721", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_clone_0"}
\ No newline at end of file
diff --git a/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.llir b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..9037fc8452a615107e5eabb80b658b1305d18512
--- /dev/null
+++ b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.llir
@@ -0,0 +1,53 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused_clone_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 {
+  %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %7 = shl i32 %6, 10, !dbg !8
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %9 = shl nuw nsw i32 %8, 3, !dbg !9
+  %10 = and i32 %9, 1016, !dbg !9
+  %11 = or disjoint i32 %10, %7, !dbg !10
+  %12 = sext i32 %11 to i64, !dbg !11
+  %13 = getelementptr bfloat, ptr addrspace(1) %0, i64 %12, !dbg !11
+  %14 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %13) #2, !dbg !12
+  %15 = extractvalue { i32, i32, i32, i32 } %14, 0, !dbg !12
+  %16 = extractvalue { i32, i32, i32, i32 } %14, 1, !dbg !12
+  %17 = extractvalue { i32, i32, i32, i32 } %14, 2, !dbg !12
+  %18 = extractvalue { i32, i32, i32, i32 } %14, 3, !dbg !12
+  %19 = getelementptr bfloat, ptr addrspace(1) %1, i64 %12, !dbg !13
+  tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %15, i32 %16, i32 %17, i32 %18, ptr addrspace(1) %19) #2, !dbg !14
+  ret void, !dbg !15
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused_clone_0", linkageName: "triton_poi_fused_clone_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 20, column: 28, scope: !4)
+!8 = !DILocation(line: 20, column: 33, scope: !4)
+!9 = !DILocation(line: 21, column: 36, scope: !4)
+!10 = !DILocation(line: 21, column: 23, scope: !4)
+!11 = !DILocation(line: 24, column: 30, scope: !4)
+!12 = !DILocation(line: 24, column: 35, scope: !4)
+!13 = !DILocation(line: 25, column: 25, scope: !4)
+!14 = !DILocation(line: 25, column: 36, scope: !4)
+!15 = !DILocation(line: 25, column: 4, scope: !4)
diff --git a/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.ptx b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..93822f4cf096bcd6c804817fa461da610da5f574
--- /dev/null
+++ b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.ptx
@@ -0,0 +1,305 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused_clone_0 // -- Begin function triton_poi_fused_clone_0
+                                        // @triton_poi_fused_clone_0
+.visible .entry triton_poi_fused_clone_0(
+	.param .u64 .ptr .global .align 1 triton_poi_fused_clone_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_clone_0_param_1,
+	.param .u32 triton_poi_fused_clone_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_clone_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_clone_0_param_4
+)
+.reqntid 128
+{
+	.reg .b32 	%r<11>;
+	.reg .b64 	%rd<6>;
+	.loc	1 18 0                          // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd3, [triton_poi_fused_clone_0_param_0];
+	ld.param.b64 	%rd4, [triton_poi_fused_clone_0_param_1];
+$L__tmp0:
+	.loc	1 20 28                         // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:20:28
+	mov.u32 	%r5, %ctaid.x;
+	.loc	1 20 33                         // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:20:33
+	shl.b32 	%r6, %r5, 10;
+	.loc	1 21 36                         // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:21:36
+	mov.u32 	%r7, %tid.x;
+	shl.b32 	%r8, %r7, 3;
+	and.b32 	%r9, %r8, 1016;
+	.loc	1 21 23                         // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:21:23
+	or.b32 	%r10, %r9, %r6;
+	.loc	1 24 30                         // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:24:30
+	mul.wide.s32 	%rd5, %r10, 2;
+	add.s64 	%rd1, %rd3, %rd5;
+	.loc	1 24 35                         // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:24:35
+	// begin inline asm
+	mov.u32 %r1, 0x0;
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	mov.u32 %r4, 0x0;
+	ld.global.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ];
+	// end inline asm
+	.loc	1 25 25                         // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:25:25
+	add.s64 	%rd2, %rd4, %rd5;
+	.loc	1 25 36                         // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:25:36
+	// begin inline asm
+	st.global.v4.b32 [ %rd2 + 0 ], { %r1, %r2, %r3, %r4 };
+	// end inline asm
+	.loc	1 25 4                          // cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py:25:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 99
+.b8 122
+.b8 103
+.b8 55
+.b8 116
+.b8 112
+.b8 105
+.b8 116
+.b8 117
+.b8 112
+.b8 114
+.b8 119
+.b8 103
+.b8 113
+.b8 112
+.b8 117
+.b8 97
+.b8 106
+.b8 122
+.b8 121
+.b8 50
+.b8 110
+.b8 121
+.b8 108
+.b8 102
+.b8 107
+.b8 52
+.b8 51
+.b8 109
+.b8 100
+.b8 111
+.b8 122
+.b8 100
+.b8 53
+.b8 118
+.b8 119
+.b8 111
+.b8 55
+.b8 55
+.b8 109
+.b8 117
+.b8 113
+.b8 51
+.b8 107
+.b8 111
+.b8 115
+.b8 112
+.b8 110
+.b8 102
+.b8 55
+.b8 98
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 99
+.b8 122
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.source b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..ad631583408510c4e259542b648d1722880a2a21
--- /dev/null
+++ b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.source
@@ -0,0 +1,48 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":18:0)
+#loc13 = loc("in_ptr0"(#loc))
+#loc14 = loc("out_ptr0"(#loc))
+#loc15 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_clone_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 8388608 : i32 loc(#loc16)
+    %xoffset = tt.get_program_id x : i32 loc(#loc17)
+    %xoffset_1 = arith.constant 1024 : i32 loc(#loc18)
+    %xoffset_2 = arith.constant 1024 : i32 loc(#loc18)
+    %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc18)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc19)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc20)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc20)
+    %xmask = arith.constant true loc(#loc21)
+    %xmask_6 = arith.constant dense<true> : tensor<1024xi1> loc(#loc21)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc22)
+    %tmp0_7 = tt.addptr %tmp0, %xindex_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc22)
+    %tmp0_8 = tt.load %tmp0_7 : tensor<1024x!tt.ptr<bf16>> loc(#loc23)
+    %tmp0_9 = arith.extf %tmp0_8 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc24)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc10)
+    %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc10)
+    %2 = arith.truncf %tmp0_9 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc11)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>> loc(#loc11)
+    tt.return loc(#loc12)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":22:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:30)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:35)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:44)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:25)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:36)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:4)
+#loc16 = loc("xnumel"(#loc1))
+#loc17 = loc("xoffset"(#loc2))
+#loc18 = loc("xoffset"(#loc3))
+#loc19 = loc("xindex"(#loc4))
+#loc20 = loc("xindex"(#loc5))
+#loc21 = loc("xmask"(#loc6))
+#loc22 = loc("tmp0"(#loc7))
+#loc23 = loc("tmp0"(#loc8))
+#loc24 = loc("tmp0"(#loc9))
diff --git a/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.ttgir b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..9b49825992999fe0c46d246301ba8e08cfc28f9a
--- /dev/null
+++ b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.ttgir
@@ -0,0 +1,38 @@
+#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":18:0)
+#loc11 = loc("in_ptr0"(#loc))
+#loc12 = loc("out_ptr0"(#loc))
+#loc13 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused_clone_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc14)
+    %xoffset_0 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc15)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc16)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<1024xi32, #blocked> loc(#loc17)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<1024xi32, #blocked> loc(#loc17)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc18)
+    %tmp0_3 = tt.addptr %tmp0, %xindex_2 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc18)
+    %tmp0_4 = tt.load %tmp0_3 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc19)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc8)
+    %1 = tt.addptr %0, %xindex_2 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc8)
+    tt.store %1, %tmp0_4 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc9)
+    tt.return loc(#loc10)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:30)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:35)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:25)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:36)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:4)
+#loc14 = loc("xoffset"(#loc2))
+#loc15 = loc("xoffset"(#loc3))
+#loc16 = loc("xindex"(#loc4))
+#loc17 = loc("xindex"(#loc5))
+#loc18 = loc("tmp0"(#loc6))
+#loc19 = loc("tmp0"(#loc7))
diff --git a/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.ttir b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..813c40eec78ac1de9d3c21ec4f88f82d75fbcb60
--- /dev/null
+++ b/triton/USWMT7GG23USQSTY4BNJUVZU7MXRA43VVBWSRLYFKD7DKNHJ64QQ/triton_poi_fused_clone_0.ttir
@@ -0,0 +1,37 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":18:0)
+#loc11 = loc("in_ptr0"(#loc))
+#loc12 = loc("out_ptr0"(#loc))
+#loc13 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_clone_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc14)
+    %xoffset_0 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc15)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc16)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<1024xi32> loc(#loc17)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<1024xi32> loc(#loc17)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc18)
+    %tmp0_3 = tt.addptr %tmp0, %xindex_2 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc18)
+    %tmp0_4 = tt.load %tmp0_3 : tensor<1024x!tt.ptr<bf16>> loc(#loc19)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc8)
+    %1 = tt.addptr %0, %xindex_2 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc8)
+    tt.store %1, %tmp0_4 : tensor<1024x!tt.ptr<bf16>> loc(#loc9)
+    tt.return loc(#loc10)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:30)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":24:35)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:25)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:36)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/cz/cczg7tpituprwgqpuajzy2nylfk43mdozd5vwo77muq3kospnf7b.py":25:4)
+#loc14 = loc("xoffset"(#loc2))
+#loc15 = loc("xoffset"(#loc3))
+#loc16 = loc("xindex"(#loc4))
+#loc17 = loc("xindex"(#loc5))
+#loc18 = loc("tmp0"(#loc6))
+#loc19 = loc("tmp0"(#loc7))
diff --git a/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/__grp__triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/__grp__triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json
new file mode 100644
index 0000000000000000000000000000000000000000..969cd04c5f2eaee47bca305c6d2987d3598325a9
--- /dev/null
+++ b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/__grp__triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.source", "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttir", "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttgir", "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.llir", "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ptx", "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.cubin": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.cubin", "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json"}}
\ No newline at end of file
diff --git a/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.cubin b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..66513dc3b9f64fab9f197ca2a7a014d12c89ef89
Binary files /dev/null and b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.cubin differ
diff --git a/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json
new file mode 100644
index 0000000000000000000000000000000000000000..a26038c12f363fbfc49a4a3aebd2d2203fbab244
--- /dev/null
+++ b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.json
@@ -0,0 +1 @@
+{"hash": "aea8899f15de857c634775fd5e3edb6dfa29e1e98838d4cd56a55fb157dcc247", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3"}
\ No newline at end of file
diff --git a/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.llir b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.llir
new file mode 100644
index 0000000000000000000000000000000000000000..3ded8f8747fbb2bba62503c8536985d82d4ef70d
--- /dev/null
+++ b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.llir
@@ -0,0 +1,201 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, ptr addrspace(1) readnone captures(none) %7, ptr addrspace(1) readnone captures(none) %8) local_unnamed_addr #0 !dbg !4 {
+  %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %11 = shl i32 %10, 9, !dbg !8
+  %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %13 = shl nuw nsw i32 %12, 1, !dbg !9
+  %14 = and i32 %13, 510, !dbg !9
+  %15 = or disjoint i32 %14, %11, !dbg !10
+  %16 = or disjoint i32 %15, 1, !dbg !10
+  %17 = sdiv i32 %15, 128, !dbg !11
+  %18 = mul i32 %17, 128, !dbg !12
+  %.decomposed = sub i32 %15, %18, !dbg !12
+  %19 = srem i32 %16, 128, !dbg !12
+  %20 = sdiv i32 %15, 4096, !dbg !13
+  %21 = sext i32 %15 to i64, !dbg !14
+  %22 = getelementptr bfloat, ptr addrspace(1) %0, i64 %21, !dbg !14
+  %23 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %22) #2, !dbg !15
+  %24 = bitcast i32 %23 to <2 x bfloat>, !dbg !15
+  %25 = shl nsw i32 %20, 7, !dbg !16
+  %26 = add nsw i32 %25, %.decomposed, !dbg !17
+  %27 = sext i32 %26 to i64, !dbg !18
+  %28 = getelementptr float, ptr addrspace(1) %1, i64 %27, !dbg !18
+  %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !19
+  %30 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $2 + 0 ], $3;", "=r,=r,l,l"(ptr addrspace(1) %28, i64 %29) #2, !dbg !19
+  %31 = extractvalue { i32, i32 } %30, 0, !dbg !19
+  %32 = extractvalue { i32, i32 } %30, 1, !dbg !19
+  %33 = getelementptr float, ptr addrspace(1) %2, i64 %27, !dbg !20
+  %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !21
+  %35 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $2 + 0 ], $3;", "=r,=r,l,l"(ptr addrspace(1) %33, i64 %34) #2, !dbg !21
+  %36 = extractvalue { i32, i32 } %35, 0, !dbg !21
+  %37 = extractvalue { i32, i32 } %35, 1, !dbg !21
+  %38 = getelementptr bfloat, ptr addrspace(1) %3, i64 %21, !dbg !22
+  %39 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %38) #2, !dbg !23
+  %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !23
+  %41 = srem i32 %16, 2, !dbg !24
+  %42 = icmp slt i32 %41, 1, !dbg !25
+  %.lhs.trunc = trunc nsw i32 %19 to i8, !dbg !26
+  %43 = sdiv i8 %.lhs.trunc, 2, !dbg !26
+  %.sext = sext i8 %43 to i32, !dbg !26
+  %44 = shl nsw i32 %.sext, 1, !dbg !27
+  %45 = or disjoint i32 %.decomposed, 1, !dbg !28
+  %46 = shl nsw i32 %17, 7, !dbg !29
+  %47 = add i32 %45, %46, !dbg !30
+  %48 = or disjoint i32 %46, 1, !dbg !28
+  %49 = add i32 %48, %44, !dbg !30
+  %50 = sext i32 %47 to i64, !dbg !31
+  %51 = getelementptr bfloat, ptr addrspace(1) %0, i64 %50, !dbg !31
+  %52 = sext i32 %49 to i64, !dbg !31
+  %53 = getelementptr bfloat, ptr addrspace(1) %0, i64 %52, !dbg !31
+  %54 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !32
+  %55 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %51, i64 %54, i1 true) #2, !dbg !32
+  %56 = bitcast i16 %55 to bfloat, !dbg !32
+  %57 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !32
+  %58 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %53, i64 %57, i1 %42) #2, !dbg !32
+  %59 = bitcast i16 %58 to bfloat, !dbg !32
+  %60 = fpext bfloat %56 to float, !dbg !33
+  %61 = fpext bfloat %59 to float, !dbg !33
+  %62 = fsub float 0.000000e+00, %60, !dbg !34
+  %63 = fsub float 0.000000e+00, %61, !dbg !34
+  %64 = icmp sgt i32 %41, 0, !dbg !35
+  %65 = add i32 %46, %.decomposed, !dbg !36
+  %66 = add i32 %44, %46, !dbg !36
+  %67 = sext i32 %65 to i64, !dbg !37
+  %68 = getelementptr bfloat, ptr addrspace(1) %0, i64 %67, !dbg !37
+  %69 = sext i32 %66 to i64, !dbg !37
+  %70 = getelementptr bfloat, ptr addrspace(1) %0, i64 %69, !dbg !37
+  %71 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !38
+  %72 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %68, i64 %71, i1 false) #2, !dbg !38
+  %73 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !38
+  %74 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %70, i64 %73, i1 %64) #2, !dbg !38
+  %75 = bitcast i16 %74 to bfloat, !dbg !38
+  %76 = fpext bfloat %75 to float, !dbg !39
+  %77 = select i1 %42, float %63, float %76, !dbg !40
+  %78 = getelementptr bfloat, ptr addrspace(1) %3, i64 %50, !dbg !41
+  %79 = getelementptr bfloat, ptr addrspace(1) %3, i64 %52, !dbg !41
+  %80 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !42
+  %81 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %78, i64 %80, i1 true) #2, !dbg !42
+  %82 = bitcast i16 %81 to bfloat, !dbg !42
+  %83 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !42
+  %84 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %79, i64 %83, i1 %42) #2, !dbg !42
+  %85 = bitcast i16 %84 to bfloat, !dbg !42
+  %86 = fpext bfloat %82 to float, !dbg !43
+  %87 = fpext bfloat %85 to float, !dbg !43
+  %88 = fsub float 0.000000e+00, %86, !dbg !44
+  %89 = fsub float 0.000000e+00, %87, !dbg !44
+  %90 = getelementptr bfloat, ptr addrspace(1) %3, i64 %67, !dbg !45
+  %91 = getelementptr bfloat, ptr addrspace(1) %3, i64 %69, !dbg !45
+  %92 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !46
+  %93 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %90, i64 %92, i1 false) #2, !dbg !46
+  %94 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !46
+  %95 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %91, i64 %94, i1 %64) #2, !dbg !46
+  %96 = bitcast i16 %95 to bfloat, !dbg !46
+  %97 = fpext bfloat %96 to float, !dbg !47
+  %98 = select i1 %42, float %89, float %97, !dbg !40
+  %99 = getelementptr bfloat, ptr addrspace(1) %4, i64 %21, !dbg !48
+  %100 = fpext <2 x bfloat> %24 to <2 x float>, !dbg !49
+  %101 = insertelement <2 x i32> poison, i32 %31, i64 0, !dbg !19
+  %102 = insertelement <2 x i32> %101, i32 %32, i64 1, !dbg !19
+  %103 = bitcast <2 x i32> %102 to <2 x float>, !dbg !19
+  %104 = insertelement <2 x i32> poison, i32 %36, i64 0, !dbg !21
+  %105 = insertelement <2 x i32> %104, i32 %37, i64 1, !dbg !21
+  %106 = bitcast <2 x i32> %105 to <2 x float>, !dbg !21
+  %107 = fmul <2 x float> %100, %103, !dbg !50
+  %108 = insertelement <2 x float> poison, float %62, i64 0, !dbg !51
+  %109 = insertelement <2 x float> %108, float %77, i64 1, !dbg !51
+  %110 = fmul <2 x float> %109, %106, !dbg !51
+  %111 = fadd <2 x float> %107, %110, !dbg !52
+  %112 = fptrunc <2 x float> %111 to <2 x bfloat>, !dbg !53
+  %113 = bitcast <2 x bfloat> %112 to i32, !dbg !53
+  tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %113, ptr addrspace(1) %99) #2, !dbg !53
+  %114 = getelementptr bfloat, ptr addrspace(1) %5, i64 %21, !dbg !54
+  %115 = fpext <2 x bfloat> %40 to <2 x float>, !dbg !55
+  %116 = fmul <2 x float> %103, %115, !dbg !56
+  %117 = insertelement <2 x float> poison, float %88, i64 0, !dbg !57
+  %118 = insertelement <2 x float> %117, float %98, i64 1, !dbg !57
+  %119 = fmul <2 x float> %118, %106, !dbg !57
+  %120 = fadd <2 x float> %116, %119, !dbg !58
+  %121 = fptrunc <2 x float> %120 to <2 x bfloat>, !dbg !59
+  %122 = bitcast <2 x bfloat> %121 to i32, !dbg !59
+  tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %122, ptr addrspace(1) %114) #2, !dbg !59
+  ret void, !dbg !60
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+attributes #0 = { nounwind "nvvm.reqntid"="256" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3", linkageName: "triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 20, column: 28, scope: !4)
+!8 = !DILocation(line: 20, column: 33, scope: !4)
+!9 = !DILocation(line: 21, column: 36, scope: !4)
+!10 = !DILocation(line: 21, column: 23, scope: !4)
+!11 = !DILocation(line: 26, column: 19, scope: !4)
+!12 = !DILocation(line: 24, column: 19, scope: !4)
+!13 = !DILocation(line: 25, column: 19, scope: !4)
+!14 = !DILocation(line: 27, column: 30, scope: !4)
+!15 = !DILocation(line: 27, column: 35, scope: !4)
+!16 = !DILocation(line: 28, column: 39, scope: !4)
+!17 = !DILocation(line: 28, column: 35, scope: !4)
+!18 = !DILocation(line: 28, column: 30, scope: !4)
+!19 = !DILocation(line: 28, column: 44, scope: !4)
+!20 = !DILocation(line: 29, column: 31, scope: !4)
+!21 = !DILocation(line: 29, column: 45, scope: !4)
+!22 = !DILocation(line: 30, column: 31, scope: !4)
+!23 = !DILocation(line: 30, column: 36, scope: !4)
+!24 = !DILocation(line: 33, column: 17, scope: !4)
+!25 = !DILocation(line: 37, column: 18, scope: !4)
+!26 = !DILocation(line: 38, column: 43, scope: !4)
+!27 = !DILocation(line: 38, column: 37, scope: !4)
+!28 = !DILocation(line: 38, column: 34, scope: !4)
+!29 = !DILocation(line: 38, column: 52, scope: !4)
+!30 = !DILocation(line: 38, column: 48, scope: !4)
+!31 = !DILocation(line: 38, column: 30, scope: !4)
+!32 = !DILocation(line: 38, column: 57, scope: !4)
+!33 = !DILocation(line: 38, column: 107, scope: !4)
+!34 = !DILocation(line: 39, column: 13, scope: !4)
+!35 = !DILocation(line: 42, column: 20, scope: !4)
+!36 = !DILocation(line: 45, column: 45, scope: !4)
+!37 = !DILocation(line: 45, column: 31, scope: !4)
+!38 = !DILocation(line: 45, column: 54, scope: !4)
+!39 = !DILocation(line: 45, column: 105, scope: !4)
+!40 = !DILocation(line: 0, scope: !4)
+!41 = !DILocation(line: 53, column: 31, scope: !4)
+!42 = !DILocation(line: 53, column: 58, scope: !4)
+!43 = !DILocation(line: 53, column: 108, scope: !4)
+!44 = !DILocation(line: 54, column: 13, scope: !4)
+!45 = !DILocation(line: 57, column: 31, scope: !4)
+!46 = !DILocation(line: 57, column: 54, scope: !4)
+!47 = !DILocation(line: 57, column: 105, scope: !4)
+!48 = !DILocation(line: 63, column: 25, scope: !4)
+!49 = !DILocation(line: 27, column: 44, scope: !4)
+!50 = !DILocation(line: 32, column: 18, scope: !4)
+!51 = !DILocation(line: 48, column: 20, scope: !4)
+!52 = !DILocation(line: 49, column: 19, scope: !4)
+!53 = !DILocation(line: 63, column: 37, scope: !4)
+!54 = !DILocation(line: 64, column: 25, scope: !4)
+!55 = !DILocation(line: 30, column: 45, scope: !4)
+!56 = !DILocation(line: 52, column: 20, scope: !4)
+!57 = !DILocation(line: 60, column: 20, scope: !4)
+!58 = !DILocation(line: 61, column: 20, scope: !4)
+!59 = !DILocation(line: 64, column: 37, scope: !4)
+!60 = !DILocation(line: 64, column: 4, scope: !4)
diff --git a/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ptx b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..caee9edec3e37d8739695db2fe64c2511513e0de
--- /dev/null
+++ b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ptx
@@ -0,0 +1,517 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3 // -- Begin function triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3
+                                        // @triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3
+.visible .entry triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3(
+	.param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_1,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_4,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_5,
+	.param .u32 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_6,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_7,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_8
+)
+.reqntid 256
+{
+	.reg .pred 	%p<5>;
+	.reg .b16 	%rs<20>;
+	.reg .b32 	%r<60>;
+	.reg .b64 	%rd<34>;
+	.loc	1 18 0                          // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd23, [triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_0];
+	ld.param.b64 	%rd24, [triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_1];
+$L__tmp0:
+	.loc	1 20 28                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:20:28
+	mov.u32 	%r9, %ctaid.x;
+	.loc	1 20 33                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:20:33
+	shl.b32 	%r10, %r9, 9;
+	ld.param.b64 	%rd25, [triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_2];
+	ld.param.b64 	%rd26, [triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_3];
+	.loc	1 21 36                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:21:36
+	mov.u32 	%r11, %tid.x;
+	shl.b32 	%r12, %r11, 1;
+	ld.param.b64 	%rd27, [triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_4];
+	and.b32 	%r13, %r12, 510;
+	ld.param.b64 	%rd28, [triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3_param_5];
+	.loc	1 21 23                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:21:23
+	or.b32 	%r14, %r13, %r10;
+	or.b32 	%r15, %r14, 1;
+	.loc	1 26 19                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:26:19
+	bfe.s32 	%r16, %r9, 22, 1;
+	.loc	1 24 19                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:24:19
+	shr.u32 	%r17, %r16, 25;
+	.loc	1 26 19                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:26:19
+	add.s32 	%r18, %r14, %r17;
+	.loc	1 24 19                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:24:19
+	and.b32 	%r19, %r18, -128;
+	sub.s32 	%r20, %r14, %r19;
+	add.s32 	%r21, %r15, %r17;
+	and.b32 	%r22, %r21, 65408;
+	sub.s32 	%r23, %r15, %r22;
+	.loc	1 25 19                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:25:19
+	shr.u32 	%r24, %r16, 20;
+	add.s32 	%r25, %r14, %r24;
+	shr.s32 	%r26, %r25, 12;
+	.loc	1 27 30                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:27:30
+	mul.wide.s32 	%rd29, %r14, 2;
+	add.s64 	%rd1, %rd23, %rd29;
+	.loc	1 27 35                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:27:35
+	// begin inline asm
+	mov.u32 %r1, 0x0;
+	ld.global.b32 { %r1 }, [ %rd1 + 0 ];
+	// end inline asm
+	.loc	1 28 39                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:28:39
+	shl.b32 	%r27, %r26, 7;
+	.loc	1 28 35                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:28:35
+	add.s32 	%r28, %r27, %r20;
+	.loc	1 28 30                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:28:30
+	mul.wide.s32 	%rd30, %r28, 4;
+	add.s64 	%rd2, %rd24, %rd30;
+	.loc	1 28 44                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:28:44
+	// begin inline asm
+	mov.u64 %rd3, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd3, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r2, %r3 }, [ %rd2 + 0 ], %rd3;
+	// end inline asm
+	.loc	1 29 31                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:29:31
+	add.s64 	%rd4, %rd25, %rd30;
+	.loc	1 29 45                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:29:45
+	// begin inline asm
+	mov.u64 %rd5, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd5, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r4, 0x0;
+	mov.u32 %r5, 0x0;
+	ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r4, %r5 }, [ %rd4 + 0 ], %rd5;
+	// end inline asm
+	.loc	1 30 31                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:30:31
+	add.s64 	%rd6, %rd26, %rd29;
+	.loc	1 30 36                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:30:36
+	// begin inline asm
+	mov.u32 %r6, 0x0;
+	ld.global.b32 { %r6 }, [ %rd6 + 0 ];
+	// end inline asm
+	.loc	1 33 17                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:33:17
+	bfe.u32 	%r29, %r9, 22, 1;
+	add.s32 	%r30, %r15, %r29;
+	and.b32 	%r31, %r30, -2;
+	sub.s32 	%r32, %r15, %r31;
+	.loc	1 37 18                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:37:18
+	setp.lt.s32 	%p2, %r32, 1;
+	.loc	1 38 43                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:38:43
+	cvt.u16.u32 	%rs10, %r23;
+	and.b16 	%rs11, %rs10, 128;
+	shr.u16 	%rs12, %rs11, 7;
+	add.s16 	%rs13, %rs10, %rs12;
+	cvt.s16.s8 	%rs14, %rs13;
+	shr.s16 	%rs15, %rs14, 1;
+	.loc	1 38 48                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:38:48
+	mad.wide.s16 	%r33, %rs15, 2, %r19;
+	or.b32 	%r34, %r33, 1;
+	.loc	1 38 30                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:38:30
+	mul.wide.s32 	%rd31, %r15, 2;
+	add.s64 	%rd7, %rd23, %rd31;
+	mul.wide.s32 	%rd32, %r34, 2;
+	add.s64 	%rd9, %rd23, %rd32;
+	.loc	1 38 57                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:38:57
+	// begin inline asm
+	mov.u64 %rd8, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd8, 1.0;
+	// end inline asm
+	mov.b16 	%rs2, 0;
+	mov.pred 	%p1, -1;
+	// begin inline asm
+	mov.u16 %rs1, %rs2;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs1 }, [ %rd7 + 0 ], %rd8;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd10, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd10, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs3, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs3 }, [ %rd9 + 0 ], %rd10;
+	// end inline asm
+	.loc	1 38 107                        // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:38:107
+	cvt.f32.bf16 	%r35, %rs1;
+	cvt.f32.bf16 	%r36, %rs3;
+	mov.b32 	%r37, 0f00000000;
+	.loc	1 39 13                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:39:13
+	sub.f32 	%r38, %r37, %r35;
+	sub.f32 	%r39, %r37, %r36;
+	.loc	1 42 20                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:42:20
+	setp.gt.s32 	%p4, %r32, 0;
+	.loc	1 45 31                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:45:31
+	mul.wide.s32 	%rd33, %r33, 2;
+	add.s64 	%rd12, %rd23, %rd33;
+	.loc	1 45 54                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:45:54
+	// begin inline asm
+	mov.u64 %rd11, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd11, 1.0;
+	// end inline asm
+	mov.pred 	%p3, 0;
+	// begin inline asm
+	mov.u16 %rs4, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs4 }, [ %rd1 + 0 ], %rd11;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd13, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd13, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs5, %rs2;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs5 }, [ %rd12 + 0 ], %rd13;
+	// end inline asm
+	.loc	1 45 105                        // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:45:105
+	cvt.f32.bf16 	%r40, %rs5;
+	.loc	1 0 0                           // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:0
+	selp.f32 	%r41, %r39, %r40, %p2;
+	.loc	1 53 31                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:53:31
+	add.s64 	%rd14, %rd26, %rd31;
+	add.s64 	%rd16, %rd26, %rd32;
+	.loc	1 53 58                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:53:58
+	// begin inline asm
+	mov.u64 %rd15, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd15, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs6, %rs2;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs6 }, [ %rd14 + 0 ], %rd15;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd17, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs7, %rs2;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs7 }, [ %rd16 + 0 ], %rd17;
+	// end inline asm
+	.loc	1 53 108                        // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:53:108
+	cvt.f32.bf16 	%r42, %rs6;
+	cvt.f32.bf16 	%r43, %rs7;
+	.loc	1 54 13                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:54:13
+	sub.f32 	%r44, %r37, %r42;
+	sub.f32 	%r45, %r37, %r43;
+	.loc	1 57 31                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:57:31
+	add.s64 	%rd19, %rd26, %rd33;
+	.loc	1 57 54                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:57:54
+	// begin inline asm
+	mov.u64 %rd18, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd18, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs8, %rs2;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs8 }, [ %rd6 + 0 ], %rd18;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd20, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd20, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs9, %rs2;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd19 + 0 ], %rd20;
+	// end inline asm
+	.loc	1 57 105                        // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:57:105
+	cvt.f32.bf16 	%r46, %rs9;
+	.loc	1 0 0                           // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:0
+	selp.f32 	%r47, %r45, %r46, %p2;
+	.loc	1 63 25                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:63:25
+	add.s64 	%rd21, %rd27, %rd29;
+	.loc	1 27 44                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:27:44
+	mov.b32 	{%rs16, %rs17}, %r1;
+	cvt.f32.bf16 	%r48, %rs16;
+	cvt.f32.bf16 	%r49, %rs17;
+	.loc	1 48 20                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:48:20
+	mul.f32 	%r50, %r41, %r5;
+	mul.f32 	%r51, %r38, %r4;
+	.loc	1 49 19                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:49:19
+	fma.rn.f32 	%r52, %r49, %r3, %r50;
+	fma.rn.f32 	%r53, %r48, %r2, %r51;
+	.loc	1 63 37                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:63:37
+	cvt.rn.bf16x2.f32 	%r7, %r52, %r53;
+	// begin inline asm
+	st.global.b32 [ %rd21 + 0 ], { %r7 };
+	// end inline asm
+	.loc	1 64 25                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:64:25
+	add.s64 	%rd22, %rd28, %rd29;
+	.loc	1 30 45                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:30:45
+	mov.b32 	{%rs18, %rs19}, %r6;
+	cvt.f32.bf16 	%r54, %rs18;
+	cvt.f32.bf16 	%r55, %rs19;
+	.loc	1 60 20                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:60:20
+	mul.f32 	%r56, %r47, %r5;
+	mul.f32 	%r57, %r44, %r4;
+	.loc	1 61 20                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:61:20
+	fma.rn.f32 	%r58, %r3, %r55, %r56;
+	fma.rn.f32 	%r59, %r2, %r54, %r57;
+	.loc	1 64 37                         // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:64:37
+	cvt.rn.bf16x2.f32 	%r8, %r58, %r59;
+	// begin inline asm
+	st.global.b32 [ %rd22 + 0 ], { %r8 };
+	// end inline asm
+	.loc	1 64 4                          // cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py:64:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 106
+.b8 54
+.b8 54
+.b8 116
+.b8 103
+.b8 98
+.b8 102
+.b8 113
+.b8 120
+.b8 55
+.b8 114
+.b8 104
+.b8 121
+.b8 116
+.b8 99
+.b8 121
+.b8 119
+.b8 109
+.b8 106
+.b8 100
+.b8 99
+.b8 105
+.b8 109
+.b8 110
+.b8 119
+.b8 119
+.b8 116
+.b8 113
+.b8 54
+.b8 120
+.b8 106
+.b8 103
+.b8 98
+.b8 50
+.b8 113
+.b8 98
+.b8 113
+.b8 98
+.b8 120
+.b8 120
+.b8 111
+.b8 110
+.b8 97
+.b8 108
+.b8 100
+.b8 111
+.b8 116
+.b8 120
+.b8 54
+.b8 51
+.b8 118
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 106
+.b8 54
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.source b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.source
new file mode 100644
index 0000000000000000000000000000000000000000..eed402848ce10edbf4fc6802fde8521e8b6422e4
--- /dev/null
+++ b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.source
@@ -0,0 +1,352 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":18:0)
+#loc81 = loc("in_ptr0"(#loc))
+#loc82 = loc("in_ptr1"(#loc))
+#loc83 = loc("in_ptr2"(#loc))
+#loc84 = loc("in_ptr3"(#loc))
+#loc85 = loc("out_ptr0"(#loc))
+#loc86 = loc("out_ptr1"(#loc))
+#loc87 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 9437184 : i32 loc(#loc88)
+    %xoffset = tt.get_program_id x : i32 loc(#loc89)
+    %xoffset_1 = arith.constant 512 : i32 loc(#loc90)
+    %xoffset_2 = arith.constant 512 : i32 loc(#loc90)
+    %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc90)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc91)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc92)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc92)
+    %xmask = arith.constant true loc(#loc93)
+    %xmask_6 = arith.constant dense<true> : tensor<512xi1> loc(#loc93)
+    %x0 = arith.constant 128 : i32 loc(#loc94)
+    %x0_7 = arith.constant 128 : i32 loc(#loc94)
+    %x0_8 = arith.constant dense<128> : tensor<512xi32> loc(#loc94)
+    %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<512xi32> loc(#loc94)
+    %x2 = arith.constant 4096 : i32 loc(#loc95)
+    %x2_10 = arith.constant 4096 : i32 loc(#loc95)
+    %x2_11 = arith.constant dense<4096> : tensor<512xi32> loc(#loc95)
+    %x2_12 = arith.divsi %xindex_5, %x2_11 : tensor<512xi32> loc(#loc95)
+    %x4 = arith.constant 128 : i32 loc(#loc96)
+    %x4_13 = arith.constant 128 : i32 loc(#loc96)
+    %x4_14 = arith.constant dense<128> : tensor<512xi32> loc(#loc96)
+    %x4_15 = arith.divsi %xindex_5, %x4_14 : tensor<512xi32> loc(#loc96)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc97)
+    %tmp0_16 = tt.addptr %tmp0, %xindex_5 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc97)
+    %tmp0_17 = tt.load %tmp0_16 : tensor<512x!tt.ptr<bf16>> loc(#loc98)
+    %tmp0_18 = arith.extf %tmp0_17 : tensor<512xbf16> to tensor<512xf32> loc(#loc99)
+    %tmp2 = arith.constant 128 : i32 loc(#loc100)
+    %tmp2_19 = arith.constant 128 : i32 loc(#loc100)
+    %tmp2_20 = arith.constant dense<128> : tensor<512xi32> loc(#loc100)
+    %tmp2_21 = arith.muli %tmp2_20, %x2_12 : tensor<512xi32> loc(#loc100)
+    %tmp2_22 = arith.addi %x0_9, %tmp2_21 : tensor<512xi32> loc(#loc101)
+    %tmp2_23 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<512x!tt.ptr<f32>> loc(#loc102)
+    %tmp2_24 = tt.addptr %tmp2_23, %tmp2_22 : tensor<512x!tt.ptr<f32>>, tensor<512xi32> loc(#loc102)
+    %tmp2_25 = tt.load %tmp2_24 evictionPolicy = evict_last : tensor<512x!tt.ptr<f32>> loc(#loc103)
+    %tmp19 = arith.constant 128 : i32 loc(#loc104)
+    %tmp19_26 = arith.constant 128 : i32 loc(#loc104)
+    %tmp19_27 = arith.constant dense<128> : tensor<512xi32> loc(#loc104)
+    %tmp19_28 = arith.muli %tmp19_27, %x2_12 : tensor<512xi32> loc(#loc104)
+    %tmp19_29 = arith.addi %x0_9, %tmp19_28 : tensor<512xi32> loc(#loc105)
+    %tmp19_30 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<512x!tt.ptr<f32>> loc(#loc106)
+    %tmp19_31 = tt.addptr %tmp19_30, %tmp19_29 : tensor<512x!tt.ptr<f32>>, tensor<512xi32> loc(#loc106)
+    %tmp19_32 = tt.load %tmp19_31 evictionPolicy = evict_last : tensor<512x!tt.ptr<f32>> loc(#loc107)
+    %tmp23 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc108)
+    %tmp23_33 = tt.addptr %tmp23, %xindex_5 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc108)
+    %tmp23_34 = tt.load %tmp23_33 : tensor<512x!tt.ptr<bf16>> loc(#loc109)
+    %tmp23_35 = arith.extf %tmp23_34 : tensor<512xbf16> to tensor<512xf32> loc(#loc110)
+    %tmp3 = arith.mulf %tmp0_18, %tmp2_25 : tensor<512xf32> loc(#loc111)
+    %tmp4 = arith.constant 2 : i32 loc(#loc112)
+    %tmp4_36 = arith.constant 2 : i32 loc(#loc112)
+    %tmp4_37 = arith.constant dense<2> : tensor<512xi32> loc(#loc112)
+    %tmp4_38 = arith.remsi %xindex_5, %tmp4_37 : tensor<512xi32> loc(#loc112)
+    %tmp5 = arith.constant 0 : i64 loc(#loc113)
+    %tmp5_39 = arith.constant dense<0> : tensor<1xi64> loc(#loc113)
+    %tmp6 = arith.extsi %tmp4_38 : tensor<512xi32> to tensor<512xi64> loc(#loc114)
+    %tmp6_40 = arith.constant dense<0> : tensor<512xi64> loc(#loc114)
+    %tmp6_41 = arith.cmpi sge, %tmp6, %tmp6_40 : tensor<512xi64> loc(#loc114)
+    %tmp7 = arith.constant 1 : i64 loc(#loc115)
+    %tmp7_42 = arith.constant dense<1> : tensor<1xi64> loc(#loc115)
+    %tmp8 = arith.extsi %tmp4_38 : tensor<512xi32> to tensor<512xi64> loc(#loc116)
+    %tmp8_43 = arith.constant dense<1> : tensor<512xi64> loc(#loc116)
+    %tmp8_44 = arith.cmpi slt, %tmp8, %tmp8_43 : tensor<512xi64> loc(#loc116)
+    %tmp9 = arith.constant 2 : i32 loc(#loc117)
+    %tmp9_45 = arith.constant 2 : i32 loc(#loc117)
+    %tmp9_46 = arith.constant dense<2> : tensor<512xi32> loc(#loc117)
+    %tmp9_47 = arith.divsi %x0_9, %tmp9_46 : tensor<512xi32> loc(#loc117)
+    %tmp9_48 = arith.constant 2 : i32 loc(#loc118)
+    %tmp9_49 = arith.constant 2 : i32 loc(#loc118)
+    %tmp9_50 = arith.constant dense<2> : tensor<512xi32> loc(#loc118)
+    %tmp9_51 = arith.muli %tmp9_50, %tmp9_47 : tensor<512xi32> loc(#loc118)
+    %tmp9_52 = arith.constant 1 : i32 loc(#loc119)
+    %tmp9_53 = arith.constant 1 : i32 loc(#loc119)
+    %tmp9_54 = arith.constant dense<1> : tensor<512xi32> loc(#loc119)
+    %tmp9_55 = arith.addi %tmp9_54, %tmp9_51 : tensor<512xi32> loc(#loc119)
+    %tmp9_56 = arith.constant 128 : i32 loc(#loc120)
+    %tmp9_57 = arith.constant 128 : i32 loc(#loc120)
+    %tmp9_58 = arith.constant dense<128> : tensor<512xi32> loc(#loc120)
+    %tmp9_59 = arith.muli %tmp9_58, %x4_15 : tensor<512xi32> loc(#loc120)
+    %tmp9_60 = arith.addi %tmp9_55, %tmp9_59 : tensor<512xi32> loc(#loc121)
+    %tmp9_61 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc122)
+    %tmp9_62 = tt.addptr %tmp9_61, %tmp9_60 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc122)
+    %tmp9_63 = arith.constant 0.000000e+00 : f32 loc(#loc123)
+    %tmp9_64 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc123)
+    %tmp9_65 = arith.truncf %tmp9_64 : tensor<512xf32> to tensor<512xbf16> loc(#loc123)
+    %tmp9_66 = tt.load %tmp9_62, %tmp8_44, %tmp9_65 evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>> loc(#loc123)
+    %tmp9_67 = arith.extf %tmp9_66 : tensor<512xbf16> to tensor<512xf32> loc(#loc124)
+    %tmp10 = arith.constant 0.000000e+00 : f32 loc(#loc125)
+    %tmp10_68 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc125)
+    %tmp10_69 = arith.subf %tmp10_68, %tmp9_67 : tensor<512xf32> loc(#loc125)
+    %tmp11 = arith.constant 0.000000e+00 : f32 loc(#loc126)
+    %tmp11_70 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc126)
+    %tmp12 = arith.select %tmp8_44, %tmp10_69, %tmp11_70 : tensor<512xi1>, tensor<512xf32> loc(#loc127)
+    %tmp13 = arith.extsi %tmp4_38 : tensor<512xi32> to tensor<512xi64> loc(#loc128)
+    %tmp13_71 = arith.constant dense<1> : tensor<512xi64> loc(#loc128)
+    %tmp13_72 = arith.cmpi sge, %tmp13, %tmp13_71 : tensor<512xi64> loc(#loc128)
+    %tmp14 = arith.constant 2 : i64 loc(#loc129)
+    %tmp14_73 = arith.constant dense<2> : tensor<1xi64> loc(#loc129)
+    %tmp15 = arith.extsi %tmp4_38 : tensor<512xi32> to tensor<512xi64> loc(#loc130)
+    %tmp15_74 = arith.constant dense<2> : tensor<512xi64> loc(#loc130)
+    %tmp15_75 = arith.cmpi slt, %tmp15, %tmp15_74 : tensor<512xi64> loc(#loc130)
+    %tmp16 = arith.constant 2 : i32 loc(#loc131)
+    %tmp16_76 = arith.constant 2 : i32 loc(#loc131)
+    %tmp16_77 = arith.constant dense<2> : tensor<512xi32> loc(#loc131)
+    %tmp16_78 = arith.divsi %x0_9, %tmp16_77 : tensor<512xi32> loc(#loc131)
+    %tmp16_79 = arith.constant 2 : i32 loc(#loc132)
+    %tmp16_80 = arith.constant 2 : i32 loc(#loc132)
+    %tmp16_81 = arith.constant dense<2> : tensor<512xi32> loc(#loc132)
+    %tmp16_82 = arith.muli %tmp16_81, %tmp16_78 : tensor<512xi32> loc(#loc132)
+    %tmp16_83 = arith.constant 128 : i32 loc(#loc133)
+    %tmp16_84 = arith.constant 128 : i32 loc(#loc133)
+    %tmp16_85 = arith.constant dense<128> : tensor<512xi32> loc(#loc133)
+    %tmp16_86 = arith.muli %tmp16_85, %x4_15 : tensor<512xi32> loc(#loc133)
+    %tmp16_87 = arith.addi %tmp16_82, %tmp16_86 : tensor<512xi32> loc(#loc134)
+    %tmp16_88 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc135)
+    %tmp16_89 = tt.addptr %tmp16_88, %tmp16_87 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc135)
+    %tmp16_90 = arith.constant 0.000000e+00 : f32 loc(#loc136)
+    %tmp16_91 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc136)
+    %tmp16_92 = arith.truncf %tmp16_91 : tensor<512xf32> to tensor<512xbf16> loc(#loc136)
+    %tmp16_93 = tt.load %tmp16_89, %tmp13_72, %tmp16_92 evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>> loc(#loc136)
+    %tmp16_94 = arith.extf %tmp16_93 : tensor<512xbf16> to tensor<512xf32> loc(#loc137)
+    %tmp17 = arith.select %tmp8_44, %tmp12, %tmp16_94 : tensor<512xi1>, tensor<512xf32> loc(#loc138)
+    %tmp20 = arith.mulf %tmp17, %tmp19_32 : tensor<512xf32> loc(#loc139)
+    %tmp21 = arith.addf %tmp3, %tmp20 : tensor<512xf32> loc(#loc140)
+    %tmp25 = arith.mulf %tmp23_35, %tmp2_25 : tensor<512xf32> loc(#loc141)
+    %tmp26 = arith.constant 2 : i32 loc(#loc142)
+    %tmp26_95 = arith.constant 2 : i32 loc(#loc142)
+    %tmp26_96 = arith.constant dense<2> : tensor<512xi32> loc(#loc142)
+    %tmp26_97 = arith.divsi %x0_9, %tmp26_96 : tensor<512xi32> loc(#loc142)
+    %tmp26_98 = arith.constant 2 : i32 loc(#loc143)
+    %tmp26_99 = arith.constant 2 : i32 loc(#loc143)
+    %tmp26_100 = arith.constant dense<2> : tensor<512xi32> loc(#loc143)
+    %tmp26_101 = arith.muli %tmp26_100, %tmp26_97 : tensor<512xi32> loc(#loc143)
+    %tmp26_102 = arith.constant 1 : i32 loc(#loc144)
+    %tmp26_103 = arith.constant 1 : i32 loc(#loc144)
+    %tmp26_104 = arith.constant dense<1> : tensor<512xi32> loc(#loc144)
+    %tmp26_105 = arith.addi %tmp26_104, %tmp26_101 : tensor<512xi32> loc(#loc144)
+    %tmp26_106 = arith.constant 128 : i32 loc(#loc145)
+    %tmp26_107 = arith.constant 128 : i32 loc(#loc145)
+    %tmp26_108 = arith.constant dense<128> : tensor<512xi32> loc(#loc145)
+    %tmp26_109 = arith.muli %tmp26_108, %x4_15 : tensor<512xi32> loc(#loc145)
+    %tmp26_110 = arith.addi %tmp26_105, %tmp26_109 : tensor<512xi32> loc(#loc146)
+    %tmp26_111 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc147)
+    %tmp26_112 = tt.addptr %tmp26_111, %tmp26_110 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc147)
+    %tmp26_113 = arith.constant 0.000000e+00 : f32 loc(#loc148)
+    %tmp26_114 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc148)
+    %tmp26_115 = arith.truncf %tmp26_114 : tensor<512xf32> to tensor<512xbf16> loc(#loc148)
+    %tmp26_116 = tt.load %tmp26_112, %tmp8_44, %tmp26_115 evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>> loc(#loc148)
+    %tmp26_117 = arith.extf %tmp26_116 : tensor<512xbf16> to tensor<512xf32> loc(#loc149)
+    %tmp27 = arith.constant 0.000000e+00 : f32 loc(#loc150)
+    %tmp27_118 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc150)
+    %tmp27_119 = arith.subf %tmp27_118, %tmp26_117 : tensor<512xf32> loc(#loc150)
+    %tmp28 = arith.constant 0.000000e+00 : f32 loc(#loc151)
+    %tmp28_120 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc151)
+    %tmp29 = arith.select %tmp8_44, %tmp27_119, %tmp28_120 : tensor<512xi1>, tensor<512xf32> loc(#loc152)
+    %tmp30 = arith.constant 2 : i32 loc(#loc153)
+    %tmp30_121 = arith.constant 2 : i32 loc(#loc153)
+    %tmp30_122 = arith.constant dense<2> : tensor<512xi32> loc(#loc153)
+    %tmp30_123 = arith.divsi %x0_9, %tmp30_122 : tensor<512xi32> loc(#loc153)
+    %tmp30_124 = arith.constant 2 : i32 loc(#loc154)
+    %tmp30_125 = arith.constant 2 : i32 loc(#loc154)
+    %tmp30_126 = arith.constant dense<2> : tensor<512xi32> loc(#loc154)
+    %tmp30_127 = arith.muli %tmp30_126, %tmp30_123 : tensor<512xi32> loc(#loc154)
+    %tmp30_128 = arith.constant 128 : i32 loc(#loc155)
+    %tmp30_129 = arith.constant 128 : i32 loc(#loc155)
+    %tmp30_130 = arith.constant dense<128> : tensor<512xi32> loc(#loc155)
+    %tmp30_131 = arith.muli %tmp30_130, %x4_15 : tensor<512xi32> loc(#loc155)
+    %tmp30_132 = arith.addi %tmp30_127, %tmp30_131 : tensor<512xi32> loc(#loc156)
+    %tmp30_133 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc157)
+    %tmp30_134 = tt.addptr %tmp30_133, %tmp30_132 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc157)
+    %tmp30_135 = arith.constant 0.000000e+00 : f32 loc(#loc158)
+    %tmp30_136 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc158)
+    %tmp30_137 = arith.truncf %tmp30_136 : tensor<512xf32> to tensor<512xbf16> loc(#loc158)
+    %tmp30_138 = tt.load %tmp30_134, %tmp13_72, %tmp30_137 evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>> loc(#loc158)
+    %tmp30_139 = arith.extf %tmp30_138 : tensor<512xbf16> to tensor<512xf32> loc(#loc159)
+    %tmp31 = arith.select %tmp8_44, %tmp29, %tmp30_139 : tensor<512xi1>, tensor<512xf32> loc(#loc160)
+    %tmp33 = arith.mulf %tmp31, %tmp19_32 : tensor<512xf32> loc(#loc161)
+    %tmp34 = arith.addf %tmp25, %tmp33 : tensor<512xf32> loc(#loc162)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc76)
+    %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc76)
+    %2 = arith.truncf %tmp21 : tensor<512xf32> to tensor<512xbf16> loc(#loc77)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>> loc(#loc77)
+    %3 = tt.splat %out_ptr1 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc78)
+    %4 = tt.addptr %3, %xindex_5 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc78)
+    %5 = arith.truncf %tmp34 : tensor<512xf32> to tensor<512xbf16> loc(#loc79)
+    tt.store %4, %5 : tensor<512x!tt.ptr<bf16>> loc(#loc79)
+    tt.return loc(#loc80)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":22:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":24:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":25:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":26:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:30)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:44)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:39)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:35)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:30)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:44)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:40)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:36)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:31)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:45)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:31)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:36)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:45)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":32:18)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":33:17)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":34:27)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":35:19)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":36:27)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":37:18)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:43)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:37)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:34)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:52)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:48)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:30)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:57)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:107)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":39:13)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":40:38)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":41:34)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":42:20)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":43:28)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":44:19)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:40)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:34)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:49)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:45)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:31)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:54)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:105)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":46:34)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":48:20)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":49:19)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":52:20)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:44)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:38)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:35)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:53)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:49)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:31)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:58)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:108)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":54:13)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":55:38)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":56:34)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:40)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:34)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:49)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:45)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:31)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:54)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:105)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":58:34)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":60:20)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":61:20)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":63:25)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":63:37)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:25)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:37)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:4)
+#loc88 = loc("xnumel"(#loc1))
+#loc89 = loc("xoffset"(#loc2))
+#loc90 = loc("xoffset"(#loc3))
+#loc91 = loc("xindex"(#loc4))
+#loc92 = loc("xindex"(#loc5))
+#loc93 = loc("xmask"(#loc6))
+#loc94 = loc("x0"(#loc7))
+#loc95 = loc("x2"(#loc8))
+#loc96 = loc("x4"(#loc9))
+#loc97 = loc("tmp0"(#loc10))
+#loc98 = loc("tmp0"(#loc11))
+#loc99 = loc("tmp0"(#loc12))
+#loc100 = loc("tmp2"(#loc13))
+#loc101 = loc("tmp2"(#loc14))
+#loc102 = loc("tmp2"(#loc15))
+#loc103 = loc("tmp2"(#loc16))
+#loc104 = loc("tmp19"(#loc17))
+#loc105 = loc("tmp19"(#loc18))
+#loc106 = loc("tmp19"(#loc19))
+#loc107 = loc("tmp19"(#loc20))
+#loc108 = loc("tmp23"(#loc21))
+#loc109 = loc("tmp23"(#loc22))
+#loc110 = loc("tmp23"(#loc23))
+#loc111 = loc("tmp3"(#loc24))
+#loc112 = loc("tmp4"(#loc25))
+#loc113 = loc("tmp5"(#loc26))
+#loc114 = loc("tmp6"(#loc27))
+#loc115 = loc("tmp7"(#loc28))
+#loc116 = loc("tmp8"(#loc29))
+#loc117 = loc("tmp9"(#loc30))
+#loc118 = loc("tmp9"(#loc31))
+#loc119 = loc("tmp9"(#loc32))
+#loc120 = loc("tmp9"(#loc33))
+#loc121 = loc("tmp9"(#loc34))
+#loc122 = loc("tmp9"(#loc35))
+#loc123 = loc("tmp9"(#loc36))
+#loc124 = loc("tmp9"(#loc37))
+#loc125 = loc("tmp10"(#loc38))
+#loc126 = loc("tmp11"(#loc39))
+#loc127 = loc("tmp12"(#loc40))
+#loc128 = loc("tmp13"(#loc41))
+#loc129 = loc("tmp14"(#loc42))
+#loc130 = loc("tmp15"(#loc43))
+#loc131 = loc("tmp16"(#loc44))
+#loc132 = loc("tmp16"(#loc45))
+#loc133 = loc("tmp16"(#loc46))
+#loc134 = loc("tmp16"(#loc47))
+#loc135 = loc("tmp16"(#loc48))
+#loc136 = loc("tmp16"(#loc49))
+#loc137 = loc("tmp16"(#loc50))
+#loc138 = loc("tmp17"(#loc51))
+#loc139 = loc("tmp20"(#loc52))
+#loc140 = loc("tmp21"(#loc53))
+#loc141 = loc("tmp25"(#loc54))
+#loc142 = loc("tmp26"(#loc55))
+#loc143 = loc("tmp26"(#loc56))
+#loc144 = loc("tmp26"(#loc57))
+#loc145 = loc("tmp26"(#loc58))
+#loc146 = loc("tmp26"(#loc59))
+#loc147 = loc("tmp26"(#loc60))
+#loc148 = loc("tmp26"(#loc61))
+#loc149 = loc("tmp26"(#loc62))
+#loc150 = loc("tmp27"(#loc63))
+#loc151 = loc("tmp28"(#loc64))
+#loc152 = loc("tmp29"(#loc65))
+#loc153 = loc("tmp30"(#loc66))
+#loc154 = loc("tmp30"(#loc67))
+#loc155 = loc("tmp30"(#loc68))
+#loc156 = loc("tmp30"(#loc69))
+#loc157 = loc("tmp30"(#loc70))
+#loc158 = loc("tmp30"(#loc71))
+#loc159 = loc("tmp30"(#loc72))
+#loc160 = loc("tmp31"(#loc73))
+#loc161 = loc("tmp33"(#loc74))
+#loc162 = loc("tmp34"(#loc75))
diff --git a/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttgir b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..d24e57683476b87db609db1f92bd32e40635f906
--- /dev/null
+++ b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttgir
@@ -0,0 +1,198 @@
+#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":18:0)
+#loc59 = loc("in_ptr0"(#loc))
+#loc60 = loc("in_ptr1"(#loc))
+#loc61 = loc("in_ptr2"(#loc))
+#loc62 = loc("in_ptr3"(#loc))
+#loc63 = loc("out_ptr0"(#loc))
+#loc64 = loc("out_ptr1"(#loc))
+#loc65 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<1> : tensor<512xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<1> : tensor<512xi64, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<2> : tensor<512xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<4096> : tensor<512xi32, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<128> : tensor<512xi32, #blocked> loc(#loc1)
+    %c512_i32 = arith.constant 512 : i32 loc(#loc1)
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<512xbf16, #blocked> loc(#loc1)
+    %cst_5 = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc66)
+    %xoffset_6 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc67)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc68)
+    %xindex_7 = tt.splat %xoffset_6 : i32 -> tensor<512xi32, #blocked> loc(#loc69)
+    %xindex_8 = arith.addi %xindex_7, %xindex : tensor<512xi32, #blocked> loc(#loc69)
+    %x0 = arith.remsi %xindex_8, %cst_3 : tensor<512xi32, #blocked> loc(#loc70)
+    %x2 = arith.divsi %xindex_8, %cst_2 : tensor<512xi32, #blocked> loc(#loc71)
+    %x4 = arith.divsi %xindex_8, %cst_3 : tensor<512xi32, #blocked> loc(#loc72)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc73)
+    %tmp0_9 = tt.addptr %tmp0, %xindex_8 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc73)
+    %tmp0_10 = tt.load %tmp0_9 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc74)
+    %tmp0_11 = arith.extf %tmp0_10 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc75)
+    %tmp2 = arith.muli %x2, %cst_3 : tensor<512xi32, #blocked> loc(#loc76)
+    %tmp2_12 = arith.addi %x0, %tmp2 : tensor<512xi32, #blocked> loc(#loc77)
+    %tmp2_13 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<512x!tt.ptr<f32>, #blocked> loc(#loc78)
+    %tmp2_14 = tt.addptr %tmp2_13, %tmp2_12 : tensor<512x!tt.ptr<f32>, #blocked>, tensor<512xi32, #blocked> loc(#loc78)
+    %tmp2_15 = tt.load %tmp2_14 evictionPolicy = evict_last : tensor<512x!tt.ptr<f32>, #blocked> loc(#loc79)
+    %tmp19 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<512x!tt.ptr<f32>, #blocked> loc(#loc80)
+    %tmp19_16 = tt.addptr %tmp19, %tmp2_12 : tensor<512x!tt.ptr<f32>, #blocked>, tensor<512xi32, #blocked> loc(#loc80)
+    %tmp19_17 = tt.load %tmp19_16 evictionPolicy = evict_last : tensor<512x!tt.ptr<f32>, #blocked> loc(#loc81)
+    %tmp23 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc82)
+    %tmp23_18 = tt.addptr %tmp23, %xindex_8 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc82)
+    %tmp23_19 = tt.load %tmp23_18 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc83)
+    %tmp23_20 = arith.extf %tmp23_19 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc84)
+    %tmp3 = arith.mulf %tmp0_11, %tmp2_15 : tensor<512xf32, #blocked> loc(#loc85)
+    %tmp4 = arith.remsi %xindex_8, %cst_1 : tensor<512xi32, #blocked> loc(#loc86)
+    %tmp8 = arith.extsi %tmp4 : tensor<512xi32, #blocked> to tensor<512xi64, #blocked> loc(#loc87)
+    %tmp8_21 = arith.cmpi slt, %tmp8, %cst_0 : tensor<512xi64, #blocked> loc(#loc87)
+    %tmp9 = arith.divsi %x0, %cst_1 : tensor<512xi32, #blocked> loc(#loc88)
+    %tmp9_22 = arith.muli %tmp9, %cst_1 : tensor<512xi32, #blocked> loc(#loc89)
+    %tmp9_23 = arith.addi %tmp9_22, %cst : tensor<512xi32, #blocked> loc(#loc90)
+    %tmp9_24 = arith.muli %x4, %cst_3 : tensor<512xi32, #blocked> loc(#loc91)
+    %tmp9_25 = arith.addi %tmp9_23, %tmp9_24 : tensor<512xi32, #blocked> loc(#loc92)
+    %tmp9_26 = tt.addptr %tmp0, %tmp9_25 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc93)
+    %tmp9_27 = tt.load %tmp9_26, %tmp8_21, %cst_4 evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc94)
+    %tmp9_28 = arith.extf %tmp9_27 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc95)
+    %tmp10 = arith.subf %cst_5, %tmp9_28 : tensor<512xf32, #blocked> loc(#loc96)
+    %tmp13 = arith.cmpi sge, %tmp8, %cst_0 : tensor<512xi64, #blocked> loc(#loc97)
+    %tmp16 = arith.addi %tmp9_22, %tmp9_24 : tensor<512xi32, #blocked> loc(#loc98)
+    %tmp16_29 = tt.addptr %tmp0, %tmp16 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc99)
+    %tmp16_30 = tt.load %tmp16_29, %tmp13, %cst_4 evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc100)
+    %tmp16_31 = arith.extf %tmp16_30 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc101)
+    %tmp17 = arith.select %tmp8_21, %tmp10, %tmp16_31 : tensor<512xi1, #blocked>, tensor<512xf32, #blocked> loc(#loc118)
+    %tmp20 = arith.mulf %tmp17, %tmp19_17 : tensor<512xf32, #blocked> loc(#loc104)
+    %tmp21 = arith.addf %tmp3, %tmp20 : tensor<512xf32, #blocked> loc(#loc105)
+    %tmp25 = arith.mulf %tmp23_20, %tmp2_15 : tensor<512xf32, #blocked> loc(#loc106)
+    %tmp26 = tt.addptr %tmp23, %tmp9_25 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc107)
+    %tmp26_32 = tt.load %tmp26, %tmp8_21, %cst_4 evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc108)
+    %tmp26_33 = arith.extf %tmp26_32 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc109)
+    %tmp27 = arith.subf %cst_5, %tmp26_33 : tensor<512xf32, #blocked> loc(#loc110)
+    %tmp30 = tt.addptr %tmp23, %tmp16 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc111)
+    %tmp30_34 = tt.load %tmp30, %tmp13, %cst_4 evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc112)
+    %tmp30_35 = arith.extf %tmp30_34 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc113)
+    %tmp31 = arith.select %tmp8_21, %tmp27, %tmp30_35 : tensor<512xi1, #blocked>, tensor<512xf32, #blocked> loc(#loc119)
+    %tmp33 = arith.mulf %tmp31, %tmp19_17 : tensor<512xf32, #blocked> loc(#loc116)
+    %tmp34 = arith.addf %tmp25, %tmp33 : tensor<512xf32, #blocked> loc(#loc117)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc54)
+    %1 = tt.addptr %0, %xindex_8 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc54)
+    %2 = arith.truncf %tmp21 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> loc(#loc55)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc55)
+    %3 = tt.splat %out_ptr1 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc56)
+    %4 = tt.addptr %3, %xindex_8 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc56)
+    %5 = arith.truncf %tmp34 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> loc(#loc57)
+    tt.store %4, %5 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc57)
+    tt.return loc(#loc58)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":24:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":25:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":26:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:30)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:35)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:44)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:39)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:35)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:44)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:31)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:45)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:31)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:36)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:45)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":32:18)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":33:17)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":37:18)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:43)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:37)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:34)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:52)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:48)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:30)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:57)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:107)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":39:13)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":42:20)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:45)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:31)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:54)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:105)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":46:34)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":41:34)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":48:20)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":49:19)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":52:20)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:31)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:58)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:108)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":54:13)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:31)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:54)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:105)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":58:34)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":56:34)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":60:20)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":61:20)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":63:25)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":63:37)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:25)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:37)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:4)
+#loc66 = loc("xoffset"(#loc2))
+#loc67 = loc("xoffset"(#loc3))
+#loc68 = loc("xindex"(#loc4))
+#loc69 = loc("xindex"(#loc5))
+#loc70 = loc("x0"(#loc6))
+#loc71 = loc("x2"(#loc7))
+#loc72 = loc("x4"(#loc8))
+#loc73 = loc("tmp0"(#loc9))
+#loc74 = loc("tmp0"(#loc10))
+#loc75 = loc("tmp0"(#loc11))
+#loc76 = loc("tmp2"(#loc12))
+#loc77 = loc("tmp2"(#loc13))
+#loc78 = loc("tmp2"(#loc14))
+#loc79 = loc("tmp2"(#loc15))
+#loc80 = loc("tmp19"(#loc16))
+#loc81 = loc("tmp19"(#loc17))
+#loc82 = loc("tmp23"(#loc18))
+#loc83 = loc("tmp23"(#loc19))
+#loc84 = loc("tmp23"(#loc20))
+#loc85 = loc("tmp3"(#loc21))
+#loc86 = loc("tmp4"(#loc22))
+#loc87 = loc("tmp8"(#loc23))
+#loc88 = loc("tmp9"(#loc24))
+#loc89 = loc("tmp9"(#loc25))
+#loc90 = loc("tmp9"(#loc26))
+#loc91 = loc("tmp9"(#loc27))
+#loc92 = loc("tmp9"(#loc28))
+#loc93 = loc("tmp9"(#loc29))
+#loc94 = loc("tmp9"(#loc30))
+#loc95 = loc("tmp9"(#loc31))
+#loc96 = loc("tmp10"(#loc32))
+#loc97 = loc("tmp13"(#loc33))
+#loc98 = loc("tmp16"(#loc34))
+#loc99 = loc("tmp16"(#loc35))
+#loc100 = loc("tmp16"(#loc36))
+#loc101 = loc("tmp16"(#loc37))
+#loc102 = loc("tmp17"(#loc38))
+#loc103 = loc("tmp12"(#loc39))
+#loc104 = loc("tmp20"(#loc40))
+#loc105 = loc("tmp21"(#loc41))
+#loc106 = loc("tmp25"(#loc42))
+#loc107 = loc("tmp26"(#loc43))
+#loc108 = loc("tmp26"(#loc44))
+#loc109 = loc("tmp26"(#loc45))
+#loc110 = loc("tmp27"(#loc46))
+#loc111 = loc("tmp30"(#loc47))
+#loc112 = loc("tmp30"(#loc48))
+#loc113 = loc("tmp30"(#loc49))
+#loc114 = loc("tmp31"(#loc50))
+#loc115 = loc("tmp29"(#loc51))
+#loc116 = loc("tmp33"(#loc52))
+#loc117 = loc("tmp34"(#loc53))
+#loc118 = loc(fused[#loc102, #loc103])
+#loc119 = loc(fused[#loc114, #loc115])
diff --git a/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttir b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..bc0af5f2caf34db5e98fb28c624b801c691e76c0
--- /dev/null
+++ b/triton/V2UITHYV32CXYY2HOX6V4PW3NX5CTYPJRA4NJTKWUVP3CV64YJDQ/triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3.ttir
@@ -0,0 +1,197 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":18:0)
+#loc59 = loc("in_ptr0"(#loc))
+#loc60 = loc("in_ptr1"(#loc))
+#loc61 = loc("in_ptr2"(#loc))
+#loc62 = loc("in_ptr3"(#loc))
+#loc63 = loc("out_ptr0"(#loc))
+#loc64 = loc("out_ptr1"(#loc))
+#loc65 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused__to_copy_add_mul_neg_stack_unbind_unsqueeze_view_3(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<512xbf16> loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc1)
+    %cst_1 = arith.constant dense<1> : tensor<512xi32> loc(#loc1)
+    %cst_2 = arith.constant dense<1> : tensor<512xi64> loc(#loc1)
+    %cst_3 = arith.constant dense<2> : tensor<512xi32> loc(#loc1)
+    %x2 = arith.constant dense<4096> : tensor<512xi32> loc(#loc66)
+    %cst_4 = arith.constant dense<128> : tensor<512xi32> loc(#loc1)
+    %c512_i32 = arith.constant 512 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc67)
+    %xoffset_5 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc68)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc69)
+    %xindex_6 = tt.splat %xoffset_5 : i32 -> tensor<512xi32> loc(#loc70)
+    %xindex_7 = arith.addi %xindex_6, %xindex : tensor<512xi32> loc(#loc70)
+    %x0 = arith.remsi %xindex_7, %cst_4 : tensor<512xi32> loc(#loc71)
+    %x2_8 = arith.divsi %xindex_7, %x2 : tensor<512xi32> loc(#loc66)
+    %x4 = arith.divsi %xindex_7, %cst_4 : tensor<512xi32> loc(#loc72)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc73)
+    %tmp0_9 = tt.addptr %tmp0, %xindex_7 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc73)
+    %tmp0_10 = tt.load %tmp0_9 : tensor<512x!tt.ptr<bf16>> loc(#loc74)
+    %tmp0_11 = arith.extf %tmp0_10 : tensor<512xbf16> to tensor<512xf32> loc(#loc75)
+    %tmp2 = arith.muli %x2_8, %cst_4 : tensor<512xi32> loc(#loc76)
+    %tmp2_12 = arith.addi %x0, %tmp2 : tensor<512xi32> loc(#loc77)
+    %tmp2_13 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<512x!tt.ptr<f32>> loc(#loc78)
+    %tmp2_14 = tt.addptr %tmp2_13, %tmp2_12 : tensor<512x!tt.ptr<f32>>, tensor<512xi32> loc(#loc78)
+    %tmp2_15 = tt.load %tmp2_14 evictionPolicy = evict_last : tensor<512x!tt.ptr<f32>> loc(#loc79)
+    %tmp19 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<512x!tt.ptr<f32>> loc(#loc80)
+    %tmp19_16 = tt.addptr %tmp19, %tmp2_12 : tensor<512x!tt.ptr<f32>>, tensor<512xi32> loc(#loc80)
+    %tmp19_17 = tt.load %tmp19_16 evictionPolicy = evict_last : tensor<512x!tt.ptr<f32>> loc(#loc81)
+    %tmp23 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc82)
+    %tmp23_18 = tt.addptr %tmp23, %xindex_7 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc82)
+    %tmp23_19 = tt.load %tmp23_18 : tensor<512x!tt.ptr<bf16>> loc(#loc83)
+    %tmp23_20 = arith.extf %tmp23_19 : tensor<512xbf16> to tensor<512xf32> loc(#loc84)
+    %tmp3 = arith.mulf %tmp0_11, %tmp2_15 : tensor<512xf32> loc(#loc85)
+    %tmp4 = arith.remsi %xindex_7, %cst_3 : tensor<512xi32> loc(#loc86)
+    %tmp8 = arith.extsi %tmp4 : tensor<512xi32> to tensor<512xi64> loc(#loc87)
+    %tmp8_21 = arith.cmpi slt, %tmp8, %cst_2 : tensor<512xi64> loc(#loc87)
+    %tmp9 = arith.divsi %x0, %cst_3 : tensor<512xi32> loc(#loc88)
+    %tmp9_22 = arith.muli %tmp9, %cst_3 : tensor<512xi32> loc(#loc89)
+    %tmp9_23 = arith.addi %tmp9_22, %cst_1 : tensor<512xi32> loc(#loc90)
+    %tmp9_24 = arith.muli %x4, %cst_4 : tensor<512xi32> loc(#loc91)
+    %tmp9_25 = arith.addi %tmp9_23, %tmp9_24 : tensor<512xi32> loc(#loc92)
+    %tmp9_26 = tt.addptr %tmp0, %tmp9_25 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc93)
+    %tmp9_27 = tt.load %tmp9_26, %tmp8_21, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>> loc(#loc94)
+    %tmp9_28 = arith.extf %tmp9_27 : tensor<512xbf16> to tensor<512xf32> loc(#loc95)
+    %tmp10 = arith.subf %cst_0, %tmp9_28 : tensor<512xf32> loc(#loc96)
+    %tmp13 = arith.cmpi sge, %tmp8, %cst_2 : tensor<512xi64> loc(#loc97)
+    %tmp16 = arith.addi %tmp9_22, %tmp9_24 : tensor<512xi32> loc(#loc98)
+    %tmp16_29 = tt.addptr %tmp0, %tmp16 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc99)
+    %tmp16_30 = tt.load %tmp16_29, %tmp13, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>> loc(#loc100)
+    %tmp16_31 = arith.extf %tmp16_30 : tensor<512xbf16> to tensor<512xf32> loc(#loc101)
+    %tmp17 = arith.select %tmp8_21, %tmp10, %tmp16_31 : tensor<512xi1>, tensor<512xf32> loc(#loc118)
+    %tmp20 = arith.mulf %tmp17, %tmp19_17 : tensor<512xf32> loc(#loc104)
+    %tmp21 = arith.addf %tmp3, %tmp20 : tensor<512xf32> loc(#loc105)
+    %tmp25 = arith.mulf %tmp23_20, %tmp2_15 : tensor<512xf32> loc(#loc106)
+    %tmp26 = tt.addptr %tmp23, %tmp9_25 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc107)
+    %tmp26_32 = tt.load %tmp26, %tmp8_21, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>> loc(#loc108)
+    %tmp26_33 = arith.extf %tmp26_32 : tensor<512xbf16> to tensor<512xf32> loc(#loc109)
+    %tmp27 = arith.subf %cst_0, %tmp26_33 : tensor<512xf32> loc(#loc110)
+    %tmp30 = tt.addptr %tmp23, %tmp16 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc111)
+    %tmp30_34 = tt.load %tmp30, %tmp13, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>> loc(#loc112)
+    %tmp30_35 = arith.extf %tmp30_34 : tensor<512xbf16> to tensor<512xf32> loc(#loc113)
+    %tmp31 = arith.select %tmp8_21, %tmp27, %tmp30_35 : tensor<512xi1>, tensor<512xf32> loc(#loc119)
+    %tmp33 = arith.mulf %tmp31, %tmp19_17 : tensor<512xf32> loc(#loc116)
+    %tmp34 = arith.addf %tmp25, %tmp33 : tensor<512xf32> loc(#loc117)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc54)
+    %1 = tt.addptr %0, %xindex_7 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc54)
+    %2 = arith.truncf %tmp21 : tensor<512xf32> to tensor<512xbf16> loc(#loc55)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>> loc(#loc55)
+    %3 = tt.splat %out_ptr1 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc56)
+    %4 = tt.addptr %3, %xindex_7 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc56)
+    %5 = arith.truncf %tmp34 : tensor<512xf32> to tensor<512xbf16> loc(#loc57)
+    tt.store %4, %5 : tensor<512x!tt.ptr<bf16>> loc(#loc57)
+    tt.return loc(#loc58)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":25:19)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":20:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":20:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":21:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":21:23)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":24:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":26:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:30)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:35)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":27:44)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:39)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:35)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":28:44)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:31)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":29:45)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:31)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:36)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":30:45)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":32:18)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":33:17)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":37:18)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:43)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:37)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:34)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:52)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:48)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:30)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:57)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":38:107)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":39:13)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":42:20)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:45)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:31)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:54)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":45:105)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":46:34)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":41:34)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":48:20)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":49:19)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":52:20)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:31)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:58)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":53:108)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":54:13)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:31)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:54)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":57:105)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":58:34)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":56:34)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":60:20)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":61:20)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":63:25)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":63:37)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:25)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:37)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/j6/cj66tgbfqx7rhytcywmjdcimnwwtq6xjgb2qbqbxxonaldotx63v.py":64:4)
+#loc66 = loc("x2"(#loc2))
+#loc67 = loc("xoffset"(#loc3))
+#loc68 = loc("xoffset"(#loc4))
+#loc69 = loc("xindex"(#loc5))
+#loc70 = loc("xindex"(#loc6))
+#loc71 = loc("x0"(#loc7))
+#loc72 = loc("x4"(#loc8))
+#loc73 = loc("tmp0"(#loc9))
+#loc74 = loc("tmp0"(#loc10))
+#loc75 = loc("tmp0"(#loc11))
+#loc76 = loc("tmp2"(#loc12))
+#loc77 = loc("tmp2"(#loc13))
+#loc78 = loc("tmp2"(#loc14))
+#loc79 = loc("tmp2"(#loc15))
+#loc80 = loc("tmp19"(#loc16))
+#loc81 = loc("tmp19"(#loc17))
+#loc82 = loc("tmp23"(#loc18))
+#loc83 = loc("tmp23"(#loc19))
+#loc84 = loc("tmp23"(#loc20))
+#loc85 = loc("tmp3"(#loc21))
+#loc86 = loc("tmp4"(#loc22))
+#loc87 = loc("tmp8"(#loc23))
+#loc88 = loc("tmp9"(#loc24))
+#loc89 = loc("tmp9"(#loc25))
+#loc90 = loc("tmp9"(#loc26))
+#loc91 = loc("tmp9"(#loc27))
+#loc92 = loc("tmp9"(#loc28))
+#loc93 = loc("tmp9"(#loc29))
+#loc94 = loc("tmp9"(#loc30))
+#loc95 = loc("tmp9"(#loc31))
+#loc96 = loc("tmp10"(#loc32))
+#loc97 = loc("tmp13"(#loc33))
+#loc98 = loc("tmp16"(#loc34))
+#loc99 = loc("tmp16"(#loc35))
+#loc100 = loc("tmp16"(#loc36))
+#loc101 = loc("tmp16"(#loc37))
+#loc102 = loc("tmp17"(#loc38))
+#loc103 = loc("tmp12"(#loc39))
+#loc104 = loc("tmp20"(#loc40))
+#loc105 = loc("tmp21"(#loc41))
+#loc106 = loc("tmp25"(#loc42))
+#loc107 = loc("tmp26"(#loc43))
+#loc108 = loc("tmp26"(#loc44))
+#loc109 = loc("tmp26"(#loc45))
+#loc110 = loc("tmp27"(#loc46))
+#loc111 = loc("tmp30"(#loc47))
+#loc112 = loc("tmp30"(#loc48))
+#loc113 = loc("tmp30"(#loc49))
+#loc114 = loc("tmp31"(#loc50))
+#loc115 = loc("tmp29"(#loc51))
+#loc116 = loc("tmp33"(#loc52))
+#loc117 = loc("tmp34"(#loc53))
+#loc118 = loc(fused[#loc102, #loc103])
+#loc119 = loc(fused[#loc114, #loc115])
diff --git a/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..df301e83da3c7ab0e5771f4536b8fd9e8e6a31c7
--- /dev/null
+++ b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused__fused_rms_norm_cat_view_2.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.source", "triton_poi_fused__fused_rms_norm_cat_view_2.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.ttir", "triton_poi_fused__fused_rms_norm_cat_view_2.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir", "triton_poi_fused__fused_rms_norm_cat_view_2.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.llir", "triton_poi_fused__fused_rms_norm_cat_view_2.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.ptx", "triton_poi_fused__fused_rms_norm_cat_view_2.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.cubin", "triton_poi_fused__fused_rms_norm_cat_view_2.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.json"}}
\ No newline at end of file
diff --git a/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.cubin b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..91e74ff6ccfa30663cead80d6f84b745df26f67d
Binary files /dev/null and b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.cubin differ
diff --git a/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.json b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..1073740dfa50f56ccce4944c12a3e75b5d2e4b7b
--- /dev/null
+++ b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.json
@@ -0,0 +1 @@
+{"hash": "af7b1ea6fafc8fbcbf5956ef7ea84751b6e19f3f5441503720e7ca7b616ddf6c", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 8192, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__fused_rms_norm_cat_view_2"}
\ No newline at end of file
diff --git a/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.llir b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.llir
new file mode 100644
index 0000000000000000000000000000000000000000..e6bfbc847213ca0c458db6207e2bc45cd6afabdc
--- /dev/null
+++ b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.llir
@@ -0,0 +1,1284 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external addrspace(3) global [0 x i8], align 16
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused__fused_rms_norm_cat_view_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 {
+  %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !dbg !8
+  %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z(), !dbg !9
+  %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y(), !dbg !10
+  %15 = mul nuw i32 %13, %14, !dbg !11
+  %16 = add nuw i32 %15, %12, !dbg !12
+  %17 = shl i32 %16, 8, !dbg !13
+  %18 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !14
+  %19 = lshr i32 %18, 1, !dbg !14
+  %20 = and i32 %19, 127, !dbg !14
+  %21 = shl nuw nsw i32 %18, 2, !dbg !14
+  %22 = and i32 %21, 252, !dbg !14
+  %23 = or disjoint i32 %17, %20, !dbg !15
+  %24 = or disjoint i32 %23, 128, !dbg !15
+  %25 = or disjoint i32 %17, %22, !dbg !15
+  %26 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !16
+  %27 = shl i32 %26, 4, !dbg !17
+  %28 = and i32 %18, 1, !dbg !18
+  %29 = icmp eq i32 %28, 0, !dbg !18
+  %30 = shl nuw nsw i32 %28, 3, !dbg !18
+  %31 = lshr i32 %18, 6, !dbg !18
+  %32 = and i32 %31, 3, !dbg !18
+  %33 = or disjoint i32 %30, %27, !dbg !19
+  %34 = or disjoint i32 %32, %27, !dbg !19
+  %35 = icmp slt i32 %33, 128, !dbg !20
+  %36 = icmp slt i32 %34, 128, !dbg !20
+  %37 = sdiv i32 %23, 32, !dbg !21
+  %38 = sdiv i32 %24, 32, !dbg !21
+  %39 = sdiv i32 %25, 32, !dbg !21
+  %40 = mul i32 %37, 32, !dbg !22
+  %.decomposed = sub i32 %23, %40, !dbg !22
+  %41 = mul i32 %39, 32, !dbg !22
+  %.decomposed148 = sub i32 %25, %41, !dbg !22
+  %42 = icmp slt i32 %23, 8192, !dbg !23
+  %43 = icmp slt i32 %25, 8192, !dbg !23
+  %44 = shl nsw i32 %.decomposed, 7, !dbg !24
+  %45 = add i32 %44, %33, !dbg !25
+  %46 = mul i32 %37, 12288, !dbg !26
+  %47 = mul i32 %38, 12288, !dbg !26
+  %48 = add i32 %45, %46, !dbg !27
+  %49 = add i32 %45, %47, !dbg !27
+  %50 = sext i32 %48 to i64, !dbg !28
+  %51 = getelementptr bfloat, ptr addrspace(1) %0, i64 %50, !dbg !28
+  %52 = sext i32 %49 to i64, !dbg !28
+  %53 = getelementptr bfloat, ptr addrspace(1) %0, i64 %52, !dbg !28
+  %54 = and i1 %35, %42, !dbg !29
+  %55 = and i1 %36, %43, !dbg !29
+  %56 = icmp slt i32 %23, 8064, !dbg !30
+  %57 = and i1 %35, %56, !dbg !30
+  %58 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !31
+  %59 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %51, i64 %58, i1 %54) #6, !dbg !31
+  %60 = extractvalue { i32, i32, i32, i32 } %59, 0, !dbg !31
+  %61 = extractvalue { i32, i32, i32, i32 } %59, 1, !dbg !31
+  %62 = extractvalue { i32, i32, i32, i32 } %59, 2, !dbg !31
+  %63 = extractvalue { i32, i32, i32, i32 } %59, 3, !dbg !31
+  %64 = insertelement <2 x i32> poison, i32 %60, i64 0, !dbg !31
+  %65 = insertelement <2 x i32> %64, i32 %62, i64 1, !dbg !31
+  %66 = lshr <2 x i32> %65, splat (i32 16), !dbg !31
+  %67 = trunc nuw <2 x i32> %66 to <2 x i16>, !dbg !31
+  %68 = insertelement <2 x i32> poison, i32 %61, i64 0, !dbg !31
+  %69 = insertelement <2 x i32> %68, i32 %63, i64 1, !dbg !31
+  %70 = lshr <2 x i32> %69, splat (i32 16), !dbg !31
+  %71 = trunc nuw <2 x i32> %70 to <2 x i16>, !dbg !31
+  %72 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !31
+  %73 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %53, i64 %72, i1 %57) #6, !dbg !31
+  %74 = extractvalue { i32, i32, i32, i32 } %73, 0, !dbg !31
+  %75 = extractvalue { i32, i32, i32, i32 } %73, 1, !dbg !31
+  %76 = extractvalue { i32, i32, i32, i32 } %73, 2, !dbg !31
+  %77 = extractvalue { i32, i32, i32, i32 } %73, 3, !dbg !31
+  %78 = insertelement <2 x i32> poison, i32 %74, i64 0, !dbg !31
+  %79 = insertelement <2 x i32> %78, i32 %76, i64 1, !dbg !31
+  %80 = lshr <2 x i32> %79, splat (i32 16), !dbg !31
+  %81 = trunc nuw <2 x i32> %80 to <2 x i16>, !dbg !31
+  %82 = insertelement <2 x i32> poison, i32 %75, i64 0, !dbg !31
+  %83 = insertelement <2 x i32> %82, i32 %77, i64 1, !dbg !31
+  %84 = lshr <2 x i32> %83, splat (i32 16), !dbg !31
+  %85 = trunc nuw <2 x i32> %84 to <2 x i16>, !dbg !31
+  %86 = and i32 %18, 6, !dbg !32
+  %87 = and i32 %18, 120, !dbg !32
+  %88 = shl nuw nsw i32 %28, 2, !dbg !32
+  %89 = and i32 %18, 128, !dbg !32
+  %90 = icmp eq i32 %89, 0, !dbg !32
+  %91 = select i1 %90, i32 0, i32 4100, !dbg !32
+  %92 = mul nuw nsw i32 %86, 528, !dbg !32
+  %93 = or disjoint i32 %88, %87, !dbg !32
+  %94 = xor i32 %92, %93, !dbg !32
+  %95 = xor i32 %94, %91, !dbg !32
+  %96 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %95, !dbg !32
+  %97 = trunc i32 %60 to i16, !dbg !32
+  %98 = trunc i32 %62 to i16, !dbg !32
+  %99 = insertelement <2 x i16> poison, i16 %97, i64 0, !dbg !32
+  %100 = insertelement <2 x i16> %99, i16 %98, i64 1, !dbg !32
+  store <2 x i16> %100, ptr addrspace(3) %96, align 4, !dbg !32
+  %101 = getelementptr inbounds nuw i8, ptr addrspace(3) %96, i32 256, !dbg !32
+  store <2 x i16> %67, ptr addrspace(3) %101, align 4, !dbg !32
+  %102 = getelementptr inbounds nuw i8, ptr addrspace(3) %96, i32 512, !dbg !32
+  %103 = trunc i32 %61 to i16, !dbg !32
+  %104 = trunc i32 %63 to i16, !dbg !32
+  %105 = insertelement <2 x i16> poison, i16 %103, i64 0, !dbg !32
+  %106 = insertelement <2 x i16> %105, i16 %104, i64 1, !dbg !32
+  store <2 x i16> %106, ptr addrspace(3) %102, align 4, !dbg !32
+  %107 = getelementptr inbounds nuw i8, ptr addrspace(3) %96, i32 768, !dbg !32
+  store <2 x i16> %71, ptr addrspace(3) %107, align 4, !dbg !32
+  %108 = getelementptr inbounds nuw i8, ptr addrspace(3) %96, i32 128, !dbg !32
+  %109 = trunc i32 %74 to i16, !dbg !32
+  %110 = trunc i32 %76 to i16, !dbg !32
+  %111 = insertelement <2 x i16> poison, i16 %109, i64 0, !dbg !32
+  %112 = insertelement <2 x i16> %111, i16 %110, i64 1, !dbg !32
+  store <2 x i16> %112, ptr addrspace(3) %108, align 4, !dbg !32
+  %113 = getelementptr inbounds nuw i8, ptr addrspace(3) %96, i32 384, !dbg !32
+  store <2 x i16> %81, ptr addrspace(3) %113, align 4, !dbg !32
+  %114 = getelementptr inbounds nuw i8, ptr addrspace(3) %96, i32 640, !dbg !32
+  %115 = trunc i32 %75 to i16, !dbg !32
+  %116 = trunc i32 %77 to i16, !dbg !32
+  %117 = insertelement <2 x i16> poison, i16 %115, i64 0, !dbg !32
+  %118 = insertelement <2 x i16> %117, i16 %116, i64 1, !dbg !32
+  store <2 x i16> %118, ptr addrspace(3) %114, align 4, !dbg !32
+  %119 = getelementptr inbounds nuw i8, ptr addrspace(3) %96, i32 896, !dbg !32
+  store <2 x i16> %85, ptr addrspace(3) %119, align 4, !dbg !32
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !32
+  %120 = shl nuw nsw i32 %18, 3, !dbg !32
+  %121 = and i32 %120, 120, !dbg !32
+  %122 = and i32 %18, 224, !dbg !32
+  %123 = shl nuw nsw i32 %122, 2, !dbg !32
+  %124 = and i32 %18, 16, !dbg !32
+  %125 = icmp eq i32 %124, 0, !dbg !32
+  %126 = select i1 %125, i32 0, i32 4100, !dbg !32
+  %127 = or disjoint i32 %126, %123, !dbg !32
+  %128 = or disjoint i32 %127, %121, !dbg !32
+  %129 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %128, !dbg !32
+  %130 = load bfloat, ptr addrspace(3) %129, align 4, !dbg !32
+  %131 = getelementptr inbounds nuw i8, ptr addrspace(3) %129, i32 2, !dbg !32
+  %132 = load bfloat, ptr addrspace(3) %131, align 2, !dbg !32
+  %133 = xor i32 %128, 1056, !dbg !32
+  %134 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %133, !dbg !32
+  %135 = load bfloat, ptr addrspace(3) %134, align 4, !dbg !32
+  %136 = getelementptr inbounds nuw i8, ptr addrspace(3) %134, i32 2, !dbg !32
+  %137 = load bfloat, ptr addrspace(3) %136, align 2, !dbg !32
+  %138 = xor i32 %128, 2112, !dbg !32
+  %139 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %138, !dbg !32
+  %140 = load bfloat, ptr addrspace(3) %139, align 4, !dbg !32
+  %141 = getelementptr inbounds nuw i8, ptr addrspace(3) %139, i32 2, !dbg !32
+  %142 = load bfloat, ptr addrspace(3) %141, align 2, !dbg !32
+  %143 = xor i32 %128, 3168, !dbg !32
+  %144 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %143, !dbg !32
+  %145 = load bfloat, ptr addrspace(3) %144, align 4, !dbg !32
+  %146 = getelementptr inbounds nuw i8, ptr addrspace(3) %144, i32 2, !dbg !32
+  %147 = load bfloat, ptr addrspace(3) %146, align 2, !dbg !32
+  %148 = xor i32 %128, 4, !dbg !32
+  %149 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %148, !dbg !32
+  %150 = load bfloat, ptr addrspace(3) %149, align 4, !dbg !32
+  %151 = getelementptr inbounds nuw i8, ptr addrspace(3) %149, i32 2, !dbg !32
+  %152 = load bfloat, ptr addrspace(3) %151, align 2, !dbg !32
+  %153 = xor i32 %128, 1060, !dbg !32
+  %154 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %153, !dbg !32
+  %155 = load bfloat, ptr addrspace(3) %154, align 4, !dbg !32
+  %156 = getelementptr inbounds nuw i8, ptr addrspace(3) %154, i32 2, !dbg !32
+  %157 = load bfloat, ptr addrspace(3) %156, align 2, !dbg !32
+  %158 = xor i32 %128, 2116, !dbg !32
+  %159 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %158, !dbg !32
+  %160 = load bfloat, ptr addrspace(3) %159, align 4, !dbg !32
+  %161 = getelementptr inbounds nuw i8, ptr addrspace(3) %159, i32 2, !dbg !32
+  %162 = load bfloat, ptr addrspace(3) %161, align 2, !dbg !32
+  %163 = xor i32 %128, 3172, !dbg !32
+  %164 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %163, !dbg !32
+  %165 = load bfloat, ptr addrspace(3) %164, align 4, !dbg !32
+  %166 = getelementptr inbounds nuw i8, ptr addrspace(3) %164, i32 2, !dbg !32
+  %167 = load bfloat, ptr addrspace(3) %166, align 2, !dbg !32
+  %168 = insertelement <4 x bfloat> poison, bfloat %130, i64 0, !dbg !32
+  %169 = insertelement <4 x bfloat> %168, bfloat %150, i64 1, !dbg !32
+  %170 = insertelement <4 x bfloat> %169, bfloat %135, i64 2, !dbg !32
+  %171 = insertelement <4 x bfloat> %170, bfloat %155, i64 3, !dbg !32
+  %172 = fpext <4 x bfloat> %171 to <4 x float>, !dbg !32
+  %173 = insertelement <4 x bfloat> poison, bfloat %140, i64 0, !dbg !32
+  %174 = insertelement <4 x bfloat> %173, bfloat %160, i64 1, !dbg !32
+  %175 = insertelement <4 x bfloat> %174, bfloat %145, i64 2, !dbg !32
+  %176 = insertelement <4 x bfloat> %175, bfloat %165, i64 3, !dbg !32
+  %177 = fpext <4 x bfloat> %176 to <4 x float>, !dbg !32
+  %178 = insertelement <4 x bfloat> poison, bfloat %132, i64 0, !dbg !32
+  %179 = insertelement <4 x bfloat> %178, bfloat %152, i64 1, !dbg !32
+  %180 = insertelement <4 x bfloat> %179, bfloat %137, i64 2, !dbg !32
+  %181 = insertelement <4 x bfloat> %180, bfloat %157, i64 3, !dbg !32
+  %182 = fpext <4 x bfloat> %181 to <4 x float>, !dbg !32
+  %183 = insertelement <4 x bfloat> poison, bfloat %142, i64 0, !dbg !32
+  %184 = insertelement <4 x bfloat> %183, bfloat %162, i64 1, !dbg !32
+  %185 = insertelement <4 x bfloat> %184, bfloat %147, i64 2, !dbg !32
+  %186 = insertelement <4 x bfloat> %185, bfloat %167, i64 3, !dbg !32
+  %187 = fpext <4 x bfloat> %186 to <4 x float>, !dbg !32
+  %188 = sext i32 %25 to i64, !dbg !33
+  %189 = getelementptr float, ptr addrspace(1) %1, i64 %188, !dbg !33
+  %190 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !34
+  %191 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %189, i64 %190, i1 %55) #6, !dbg !34
+  %192 = extractvalue { i32, i32, i32, i32 } %191, 0, !dbg !34
+  %193 = extractvalue { i32, i32, i32, i32 } %191, 1, !dbg !34
+  %194 = extractvalue { i32, i32, i32, i32 } %191, 2, !dbg !34
+  %195 = extractvalue { i32, i32, i32, i32 } %191, 3, !dbg !34
+  %196 = bitcast i32 %192 to float, !dbg !34
+  %197 = bitcast i32 %193 to float, !dbg !34
+  %198 = bitcast i32 %194 to float, !dbg !34
+  %199 = bitcast i32 %195 to float, !dbg !34
+  %200 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !34
+  %201 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %189, i64 %200, i1 %55) #6, !dbg !34
+  %202 = extractvalue { i32, i32, i32, i32 } %201, 0, !dbg !34
+  %203 = extractvalue { i32, i32, i32, i32 } %201, 1, !dbg !34
+  %204 = extractvalue { i32, i32, i32, i32 } %201, 2, !dbg !34
+  %205 = extractvalue { i32, i32, i32, i32 } %201, 3, !dbg !34
+  %206 = bitcast i32 %202 to float, !dbg !34
+  %207 = bitcast i32 %203 to float, !dbg !34
+  %208 = bitcast i32 %204 to float, !dbg !34
+  %209 = bitcast i32 %205 to float, !dbg !34
+  %210 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !34
+  %211 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %189, i64 %210, i1 %55) #6, !dbg !34
+  %212 = extractvalue { i32, i32, i32, i32 } %211, 0, !dbg !34
+  %213 = extractvalue { i32, i32, i32, i32 } %211, 1, !dbg !34
+  %214 = extractvalue { i32, i32, i32, i32 } %211, 2, !dbg !34
+  %215 = extractvalue { i32, i32, i32, i32 } %211, 3, !dbg !34
+  %216 = bitcast i32 %212 to float, !dbg !34
+  %217 = bitcast i32 %213 to float, !dbg !34
+  %218 = bitcast i32 %214 to float, !dbg !34
+  %219 = bitcast i32 %215 to float, !dbg !34
+  %220 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !34
+  %221 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %189, i64 %220, i1 %55) #6, !dbg !34
+  %222 = extractvalue { i32, i32, i32, i32 } %221, 0, !dbg !34
+  %223 = extractvalue { i32, i32, i32, i32 } %221, 1, !dbg !34
+  %224 = extractvalue { i32, i32, i32, i32 } %221, 2, !dbg !34
+  %225 = extractvalue { i32, i32, i32, i32 } %221, 3, !dbg !34
+  %226 = bitcast i32 %222 to float, !dbg !34
+  %227 = bitcast i32 %223 to float, !dbg !34
+  %228 = bitcast i32 %224 to float, !dbg !34
+  %229 = bitcast i32 %225 to float, !dbg !34
+  %230 = tail call float @llvm.nvvm.div.full(float %196, float 1.280000e+02), !dbg !35
+  %231 = tail call float @llvm.nvvm.div.full(float %197, float 1.280000e+02), !dbg !35
+  %232 = tail call float @llvm.nvvm.div.full(float %198, float 1.280000e+02), !dbg !35
+  %233 = tail call float @llvm.nvvm.div.full(float %199, float 1.280000e+02), !dbg !35
+  %234 = tail call float @llvm.nvvm.div.full(float %206, float 1.280000e+02), !dbg !35
+  %235 = tail call float @llvm.nvvm.div.full(float %207, float 1.280000e+02), !dbg !35
+  %236 = tail call float @llvm.nvvm.div.full(float %208, float 1.280000e+02), !dbg !35
+  %237 = tail call float @llvm.nvvm.div.full(float %209, float 1.280000e+02), !dbg !35
+  %238 = tail call float @llvm.nvvm.div.full(float %216, float 1.280000e+02), !dbg !35
+  %239 = tail call float @llvm.nvvm.div.full(float %217, float 1.280000e+02), !dbg !35
+  %240 = tail call float @llvm.nvvm.div.full(float %218, float 1.280000e+02), !dbg !35
+  %241 = tail call float @llvm.nvvm.div.full(float %219, float 1.280000e+02), !dbg !35
+  %242 = tail call float @llvm.nvvm.div.full(float %226, float 1.280000e+02), !dbg !35
+  %243 = tail call float @llvm.nvvm.div.full(float %227, float 1.280000e+02), !dbg !35
+  %244 = tail call float @llvm.nvvm.div.full(float %228, float 1.280000e+02), !dbg !35
+  %245 = tail call float @llvm.nvvm.div.full(float %229, float 1.280000e+02), !dbg !35
+  %246 = fadd float %230, 0x3EB0C6F7A0000000, !dbg !36
+  %247 = fadd float %231, 0x3EB0C6F7A0000000, !dbg !36
+  %248 = fadd float %232, 0x3EB0C6F7A0000000, !dbg !36
+  %249 = fadd float %233, 0x3EB0C6F7A0000000, !dbg !36
+  %250 = fadd float %234, 0x3EB0C6F7A0000000, !dbg !36
+  %251 = fadd float %235, 0x3EB0C6F7A0000000, !dbg !36
+  %252 = fadd float %236, 0x3EB0C6F7A0000000, !dbg !36
+  %253 = fadd float %237, 0x3EB0C6F7A0000000, !dbg !36
+  %254 = fadd float %238, 0x3EB0C6F7A0000000, !dbg !36
+  %255 = fadd float %239, 0x3EB0C6F7A0000000, !dbg !36
+  %256 = fadd float %240, 0x3EB0C6F7A0000000, !dbg !36
+  %257 = fadd float %241, 0x3EB0C6F7A0000000, !dbg !36
+  %258 = fadd float %242, 0x3EB0C6F7A0000000, !dbg !36
+  %259 = fadd float %243, 0x3EB0C6F7A0000000, !dbg !36
+  %260 = fadd float %244, 0x3EB0C6F7A0000000, !dbg !36
+  %261 = fadd float %245, 0x3EB0C6F7A0000000, !dbg !36
+  %262 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37
+  %.not.i = icmp eq i32 %262, 0, !dbg !37
+  br i1 %.not.i, label %265, label %263, !dbg !37
+
+263:                                              ; preds = %11
+  %264 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %246), !dbg !37
+  br label %__nv_rsqrtf.exit, !dbg !37
+
+265:                                              ; preds = %11
+  %266 = tail call float @llvm.nvvm.rsqrt.approx.f(float %246), !dbg !37
+  br label %__nv_rsqrtf.exit, !dbg !37
+
+__nv_rsqrtf.exit:                                 ; preds = %263, %265
+  %.0.i = phi float [ %264, %263 ], [ %266, %265 ], !dbg !37
+  %267 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37
+  %.not.i55 = icmp eq i32 %267, 0, !dbg !37
+  br i1 %.not.i55, label %270, label %268, !dbg !37
+
+268:                                              ; preds = %__nv_rsqrtf.exit
+  %269 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %247), !dbg !37
+  br label %__nv_rsqrtf.exit57, !dbg !37
+
+270:                                              ; preds = %__nv_rsqrtf.exit
+  %271 = tail call float @llvm.nvvm.rsqrt.approx.f(float %247), !dbg !37
+  br label %__nv_rsqrtf.exit57, !dbg !37
+
+__nv_rsqrtf.exit57:                               ; preds = %268, %270
+  %.0.i56 = phi float [ %269, %268 ], [ %271, %270 ], !dbg !37
+  %272 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37
+  %.not.i58 = icmp eq i32 %272, 0, !dbg !37
+  br i1 %.not.i58, label %275, label %273, !dbg !37
+
+273:                                              ; preds = %__nv_rsqrtf.exit57
+  %274 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %248), !dbg !37
+  br label %__nv_rsqrtf.exit60, !dbg !37
+
+275:                                              ; preds = %__nv_rsqrtf.exit57
+  %276 = tail call float @llvm.nvvm.rsqrt.approx.f(float %248), !dbg !37
+  br label %__nv_rsqrtf.exit60, !dbg !37
+
+__nv_rsqrtf.exit60:                               ; preds = %273, %275
+  %.0.i59 = phi float [ %274, %273 ], [ %276, %275 ], !dbg !37
+  %277 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37
+  %.not.i61 = icmp eq i32 %277, 0, !dbg !37
+  br i1 %.not.i61, label %280, label %278, !dbg !37
+
+278:                                              ; preds = %__nv_rsqrtf.exit60
+  %279 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %249), !dbg !37
+  br label %__nv_rsqrtf.exit63, !dbg !37
+
+280:                                              ; preds = %__nv_rsqrtf.exit60
+  %281 = tail call float @llvm.nvvm.rsqrt.approx.f(float %249), !dbg !37
+  br label %__nv_rsqrtf.exit63, !dbg !37
+
+__nv_rsqrtf.exit63:                               ; preds = %278, %280
+  %.0.i62 = phi float [ %279, %278 ], [ %281, %280 ], !dbg !37
+  %282 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37
+  %.not.i64 = icmp eq i32 %282, 0, !dbg !37
+  br i1 %.not.i64, label %285, label %283, !dbg !37
+
+283:                                              ; preds = %__nv_rsqrtf.exit63
+  %284 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %250), !dbg !37
+  br label %__nv_rsqrtf.exit66, !dbg !37
+
+285:                                              ; preds = %__nv_rsqrtf.exit63
+  %286 = tail call float @llvm.nvvm.rsqrt.approx.f(float %250), !dbg !37
+  br label %__nv_rsqrtf.exit66, !dbg !37
+
+__nv_rsqrtf.exit66:                               ; preds = %283, %285
+  %.0.i65 = phi float [ %284, %283 ], [ %286, %285 ], !dbg !37
+  %287 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37
+  %.not.i67 = icmp eq i32 %287, 0, !dbg !37
+  br i1 %.not.i67, label %290, label %288, !dbg !37
+
+288:                                              ; preds = %__nv_rsqrtf.exit66
+  %289 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %251), !dbg !37
+  br label %__nv_rsqrtf.exit69, !dbg !37
+
+290:                                              ; preds = %__nv_rsqrtf.exit66
+  %291 = tail call float @llvm.nvvm.rsqrt.approx.f(float %251), !dbg !37
+  br label %__nv_rsqrtf.exit69, !dbg !37
+
+__nv_rsqrtf.exit69:                               ; preds = %288, %290
+  %.0.i68 = phi float [ %289, %288 ], [ %291, %290 ], !dbg !37
+  %292 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37
+  %.not.i70 = icmp eq i32 %292, 0, !dbg !37
+  br i1 %.not.i70, label %295, label %293, !dbg !37
+
+293:                                              ; preds = %__nv_rsqrtf.exit69
+  %294 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %252), !dbg !37
+  br label %__nv_rsqrtf.exit72, !dbg !37
+
+295:                                              ; preds = %__nv_rsqrtf.exit69
+  %296 = tail call float @llvm.nvvm.rsqrt.approx.f(float %252), !dbg !37
+  br label %__nv_rsqrtf.exit72, !dbg !37
+
+__nv_rsqrtf.exit72:                               ; preds = %293, %295
+  %.0.i71 = phi float [ %294, %293 ], [ %296, %295 ], !dbg !37
+  %297 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37
+  %.not.i73 = icmp eq i32 %297, 0, !dbg !37
+  br i1 %.not.i73, label %300, label %298, !dbg !37
+
+298:                                              ; preds = %__nv_rsqrtf.exit72
+  %299 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %253), !dbg !37
+  br label %__nv_rsqrtf.exit75, !dbg !37
+
+300:                                              ; preds = %__nv_rsqrtf.exit72
+  %301 = tail call float @llvm.nvvm.rsqrt.approx.f(float %253), !dbg !37
+  br label %__nv_rsqrtf.exit75, !dbg !37
+
+__nv_rsqrtf.exit75:                               ; preds = %298, %300
+  %.0.i74 = phi float [ %299, %298 ], [ %301, %300 ], !dbg !37
+  %302 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37
+  %.not.i76 = icmp eq i32 %302, 0, !dbg !37
+  br i1 %.not.i76, label %305, label %303, !dbg !37
+
+303:                                              ; preds = %__nv_rsqrtf.exit75
+  %304 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %254), !dbg !37
+  br label %__nv_rsqrtf.exit78, !dbg !37
+
+305:                                              ; preds = %__nv_rsqrtf.exit75
+  %306 = tail call float @llvm.nvvm.rsqrt.approx.f(float %254), !dbg !37
+  br label %__nv_rsqrtf.exit78, !dbg !37
+
+__nv_rsqrtf.exit78:                               ; preds = %303, %305
+  %.0.i77 = phi float [ %304, %303 ], [ %306, %305 ], !dbg !37
+  %307 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37
+  %.not.i79 = icmp eq i32 %307, 0, !dbg !37
+  br i1 %.not.i79, label %310, label %308, !dbg !37
+
+308:                                              ; preds = %__nv_rsqrtf.exit78
+  %309 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %255), !dbg !37
+  br label %__nv_rsqrtf.exit81, !dbg !37
+
+310:                                              ; preds = %__nv_rsqrtf.exit78
+  %311 = tail call float @llvm.nvvm.rsqrt.approx.f(float %255), !dbg !37
+  br label %__nv_rsqrtf.exit81, !dbg !37
+
+__nv_rsqrtf.exit81:                               ; preds = %308, %310
+  %.0.i80 = phi float [ %309, %308 ], [ %311, %310 ], !dbg !37
+  %312 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37
+  %.not.i82 = icmp eq i32 %312, 0, !dbg !37
+  br i1 %.not.i82, label %315, label %313, !dbg !37
+
+313:                                              ; preds = %__nv_rsqrtf.exit81
+  %314 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %256), !dbg !37
+  br label %__nv_rsqrtf.exit84, !dbg !37
+
+315:                                              ; preds = %__nv_rsqrtf.exit81
+  %316 = tail call float @llvm.nvvm.rsqrt.approx.f(float %256), !dbg !37
+  br label %__nv_rsqrtf.exit84, !dbg !37
+
+__nv_rsqrtf.exit84:                               ; preds = %313, %315
+  %.0.i83 = phi float [ %314, %313 ], [ %316, %315 ], !dbg !37
+  %317 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37
+  %.not.i85 = icmp eq i32 %317, 0, !dbg !37
+  br i1 %.not.i85, label %320, label %318, !dbg !37
+
+318:                                              ; preds = %__nv_rsqrtf.exit84
+  %319 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %257), !dbg !37
+  br label %__nv_rsqrtf.exit87, !dbg !37
+
+320:                                              ; preds = %__nv_rsqrtf.exit84
+  %321 = tail call float @llvm.nvvm.rsqrt.approx.f(float %257), !dbg !37
+  br label %__nv_rsqrtf.exit87, !dbg !37
+
+__nv_rsqrtf.exit87:                               ; preds = %318, %320
+  %.0.i86 = phi float [ %319, %318 ], [ %321, %320 ], !dbg !37
+  %322 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37
+  %.not.i88 = icmp eq i32 %322, 0, !dbg !37
+  br i1 %.not.i88, label %325, label %323, !dbg !37
+
+323:                                              ; preds = %__nv_rsqrtf.exit87
+  %324 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %258), !dbg !37
+  br label %__nv_rsqrtf.exit90, !dbg !37
+
+325:                                              ; preds = %__nv_rsqrtf.exit87
+  %326 = tail call float @llvm.nvvm.rsqrt.approx.f(float %258), !dbg !37
+  br label %__nv_rsqrtf.exit90, !dbg !37
+
+__nv_rsqrtf.exit90:                               ; preds = %323, %325
+  %.0.i89 = phi float [ %324, %323 ], [ %326, %325 ], !dbg !37
+  %327 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37
+  %.not.i91 = icmp eq i32 %327, 0, !dbg !37
+  br i1 %.not.i91, label %330, label %328, !dbg !37
+
+328:                                              ; preds = %__nv_rsqrtf.exit90
+  %329 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %259), !dbg !37
+  br label %__nv_rsqrtf.exit93, !dbg !37
+
+330:                                              ; preds = %__nv_rsqrtf.exit90
+  %331 = tail call float @llvm.nvvm.rsqrt.approx.f(float %259), !dbg !37
+  br label %__nv_rsqrtf.exit93, !dbg !37
+
+__nv_rsqrtf.exit93:                               ; preds = %328, %330
+  %.0.i92 = phi float [ %329, %328 ], [ %331, %330 ], !dbg !37
+  %332 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37
+  %.not.i94 = icmp eq i32 %332, 0, !dbg !37
+  br i1 %.not.i94, label %335, label %333, !dbg !37
+
+333:                                              ; preds = %__nv_rsqrtf.exit93
+  %334 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %260), !dbg !37
+  br label %__nv_rsqrtf.exit96, !dbg !37
+
+335:                                              ; preds = %__nv_rsqrtf.exit93
+  %336 = tail call float @llvm.nvvm.rsqrt.approx.f(float %260), !dbg !37
+  br label %__nv_rsqrtf.exit96, !dbg !37
+
+__nv_rsqrtf.exit96:                               ; preds = %333, %335
+  %.0.i95 = phi float [ %334, %333 ], [ %336, %335 ], !dbg !37
+  %337 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !37
+  %.not.i97 = icmp eq i32 %337, 0, !dbg !37
+  br i1 %.not.i97, label %340, label %338, !dbg !37
+
+338:                                              ; preds = %__nv_rsqrtf.exit96
+  %339 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %261), !dbg !37
+  br label %__nv_rsqrtf.exit99, !dbg !37
+
+340:                                              ; preds = %__nv_rsqrtf.exit96
+  %341 = tail call float @llvm.nvvm.rsqrt.approx.f(float %261), !dbg !37
+  br label %__nv_rsqrtf.exit99, !dbg !37
+
+__nv_rsqrtf.exit99:                               ; preds = %338, %340
+  %.0.i98 = phi float [ %339, %338 ], [ %341, %340 ], !dbg !37
+  %342 = insertelement <4 x float> poison, float %.0.i, i64 0, !dbg !38
+  %343 = insertelement <4 x float> %342, float %.0.i77, i64 1, !dbg !38
+  %344 = insertelement <4 x float> %343, float %.0.i56, i64 2, !dbg !38
+  %345 = insertelement <4 x float> %344, float %.0.i80, i64 3, !dbg !38
+  %346 = fmul <4 x float> %345, %172, !dbg !38
+  %347 = insertelement <4 x float> poison, float %.0.i59, i64 0, !dbg !38
+  %348 = insertelement <4 x float> %347, float %.0.i83, i64 1, !dbg !38
+  %349 = insertelement <4 x float> %348, float %.0.i62, i64 2, !dbg !38
+  %350 = insertelement <4 x float> %349, float %.0.i86, i64 3, !dbg !38
+  %351 = fmul <4 x float> %350, %177, !dbg !38
+  %352 = insertelement <4 x float> poison, float %.0.i65, i64 0, !dbg !38
+  %353 = insertelement <4 x float> %352, float %.0.i89, i64 1, !dbg !38
+  %354 = insertelement <4 x float> %353, float %.0.i68, i64 2, !dbg !38
+  %355 = insertelement <4 x float> %354, float %.0.i92, i64 3, !dbg !38
+  %356 = fmul <4 x float> %355, %182, !dbg !38
+  %357 = insertelement <4 x float> poison, float %.0.i71, i64 0, !dbg !38
+  %358 = insertelement <4 x float> %357, float %.0.i95, i64 1, !dbg !38
+  %359 = insertelement <4 x float> %358, float %.0.i74, i64 2, !dbg !38
+  %360 = insertelement <4 x float> %359, float %.0.i98, i64 3, !dbg !38
+  %361 = fmul <4 x float> %360, %187, !dbg !38
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !38
+  %362 = shl nuw nsw i32 %18, 4, !dbg !38
+  %363 = and i32 %362, 4080, !dbg !38
+  %364 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %363, !dbg !38
+  store <4 x float> %346, ptr addrspace(3) %364, align 16, !dbg !38
+  %365 = xor i32 %363, 4160, !dbg !38
+  %366 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %365, !dbg !38
+  store <4 x float> %351, ptr addrspace(3) %366, align 16, !dbg !38
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !38
+  %367 = shl nuw nsw i32 %18, 7, !dbg !38
+  %368 = and i32 %367, 3072, !dbg !38
+  %369 = shl nuw nsw i32 %86, 3, !dbg !38
+  %370 = shl nuw nsw i32 %122, 1, !dbg !38
+  %371 = select i1 %29, i32 0, i32 4160, !dbg !38
+  %372 = xor i32 %371, %370, !dbg !38
+  %373 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %368, !dbg !38
+  %374 = getelementptr inbounds nuw i8, ptr addrspace(3) %373, i32 %369, !dbg !38
+  %375 = getelementptr inbounds nuw i8, ptr addrspace(3) %374, i32 %372, !dbg !38
+  %376 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) %375), !dbg !38
+  %377 = getelementptr inbounds nuw i8, ptr addrspace(3) %375, i32 512, !dbg !38
+  %378 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) nonnull %377), !dbg !38
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !38
+  store <4 x float> %356, ptr addrspace(3) %364, align 16, !dbg !38
+  store <4 x float> %361, ptr addrspace(3) %366, align 16, !dbg !38
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !38
+  %379 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) %375), !dbg !38
+  %380 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) nonnull %377), !dbg !38
+  %381 = sext i32 %33 to i64, !dbg !39
+  %382 = getelementptr bfloat, ptr addrspace(1) %2, i64 %381, !dbg !39
+  %383 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !40
+  %384 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %382, i64 %383, i1 %54) #6, !dbg !40
+  %385 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !40
+  %386 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %382, i64 %385, i1 %57) #6, !dbg !40
+  %387 = add i32 %48, -3145728, !dbg !41
+  %388 = add i32 %49, -3145728, !dbg !41
+  %389 = sext i32 %387 to i64, !dbg !42
+  %390 = getelementptr bfloat, ptr addrspace(1) %3, i64 %389, !dbg !42
+  %391 = sext i32 %388 to i64, !dbg !42
+  %392 = getelementptr bfloat, ptr addrspace(1) %3, i64 %391, !dbg !42
+  %393 = add i32 %17, -8192, !dbg !43
+  %394 = icmp ult i32 %393, 65536, !dbg !43
+  %395 = and i1 %35, %394, !dbg !43
+  %396 = add i32 %17, -8064, !dbg !43
+  %397 = icmp ult i32 %396, 65664, !dbg !43
+  %398 = and i1 %35, %397, !dbg !43
+  %399 = and i1 %36, %394, !dbg !43
+  %400 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !44
+  %401 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %390, i64 %400, i1 %395) #6, !dbg !44
+  %402 = extractvalue { i32, i32, i32, i32 } %401, 0, !dbg !44
+  %403 = extractvalue { i32, i32, i32, i32 } %401, 1, !dbg !44
+  %404 = extractvalue { i32, i32, i32, i32 } %401, 2, !dbg !44
+  %405 = extractvalue { i32, i32, i32, i32 } %401, 3, !dbg !44
+  %406 = insertelement <2 x i32> poison, i32 %402, i64 0, !dbg !44
+  %407 = insertelement <2 x i32> %406, i32 %404, i64 1, !dbg !44
+  %408 = lshr <2 x i32> %407, splat (i32 16), !dbg !44
+  %409 = trunc nuw <2 x i32> %408 to <2 x i16>, !dbg !44
+  %410 = insertelement <2 x i32> poison, i32 %403, i64 0, !dbg !44
+  %411 = insertelement <2 x i32> %410, i32 %405, i64 1, !dbg !44
+  %412 = lshr <2 x i32> %411, splat (i32 16), !dbg !44
+  %413 = trunc nuw <2 x i32> %412 to <2 x i16>, !dbg !44
+  %414 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !44
+  %415 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %392, i64 %414, i1 %398) #6, !dbg !44
+  %416 = extractvalue { i32, i32, i32, i32 } %415, 0, !dbg !44
+  %417 = extractvalue { i32, i32, i32, i32 } %415, 1, !dbg !44
+  %418 = extractvalue { i32, i32, i32, i32 } %415, 2, !dbg !44
+  %419 = extractvalue { i32, i32, i32, i32 } %415, 3, !dbg !44
+  %420 = insertelement <2 x i32> poison, i32 %416, i64 0, !dbg !44
+  %421 = insertelement <2 x i32> %420, i32 %418, i64 1, !dbg !44
+  %422 = lshr <2 x i32> %421, splat (i32 16), !dbg !44
+  %423 = trunc nuw <2 x i32> %422 to <2 x i16>, !dbg !44
+  %424 = insertelement <2 x i32> poison, i32 %417, i64 0, !dbg !44
+  %425 = insertelement <2 x i32> %424, i32 %419, i64 1, !dbg !44
+  %426 = lshr <2 x i32> %425, splat (i32 16), !dbg !44
+  %427 = trunc nuw <2 x i32> %426 to <2 x i16>, !dbg !44
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45
+  %428 = trunc i32 %402 to i16, !dbg !45
+  %429 = trunc i32 %404 to i16, !dbg !45
+  %430 = insertelement <2 x i16> poison, i16 %428, i64 0, !dbg !45
+  %431 = insertelement <2 x i16> %430, i16 %429, i64 1, !dbg !45
+  store <2 x i16> %431, ptr addrspace(3) %96, align 4, !dbg !45
+  store <2 x i16> %409, ptr addrspace(3) %101, align 4, !dbg !45
+  %432 = trunc i32 %403 to i16, !dbg !45
+  %433 = trunc i32 %405 to i16, !dbg !45
+  %434 = insertelement <2 x i16> poison, i16 %432, i64 0, !dbg !45
+  %435 = insertelement <2 x i16> %434, i16 %433, i64 1, !dbg !45
+  store <2 x i16> %435, ptr addrspace(3) %102, align 4, !dbg !45
+  store <2 x i16> %413, ptr addrspace(3) %107, align 4, !dbg !45
+  %436 = trunc i32 %416 to i16, !dbg !45
+  %437 = trunc i32 %418 to i16, !dbg !45
+  %438 = insertelement <2 x i16> poison, i16 %436, i64 0, !dbg !45
+  %439 = insertelement <2 x i16> %438, i16 %437, i64 1, !dbg !45
+  store <2 x i16> %439, ptr addrspace(3) %108, align 4, !dbg !45
+  store <2 x i16> %423, ptr addrspace(3) %113, align 4, !dbg !45
+  %440 = trunc i32 %417 to i16, !dbg !45
+  %441 = trunc i32 %419 to i16, !dbg !45
+  %442 = insertelement <2 x i16> poison, i16 %440, i64 0, !dbg !45
+  %443 = insertelement <2 x i16> %442, i16 %441, i64 1, !dbg !45
+  store <2 x i16> %443, ptr addrspace(3) %114, align 4, !dbg !45
+  store <2 x i16> %427, ptr addrspace(3) %119, align 4, !dbg !45
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45
+  %444 = load <2 x bfloat>, ptr addrspace(3) %129, align 4, !dbg !45
+  %445 = load <2 x bfloat>, ptr addrspace(3) %134, align 4, !dbg !45
+  %446 = load <2 x bfloat>, ptr addrspace(3) %139, align 4, !dbg !45
+  %447 = load <2 x bfloat>, ptr addrspace(3) %144, align 4, !dbg !45
+  %448 = load <2 x bfloat>, ptr addrspace(3) %149, align 4, !dbg !45
+  %449 = load <2 x bfloat>, ptr addrspace(3) %154, align 4, !dbg !45
+  %450 = load <2 x bfloat>, ptr addrspace(3) %159, align 4, !dbg !45
+  %451 = load <2 x bfloat>, ptr addrspace(3) %164, align 4, !dbg !45
+  %452 = shl nsw i32 %39, 5, !dbg !46
+  %453 = add nsw i32 %.decomposed148, -8192, !dbg !46
+  %454 = add i32 %453, %452, !dbg !47
+  %455 = sext i32 %454 to i64, !dbg !48
+  %456 = getelementptr float, ptr addrspace(1) %4, i64 %455, !dbg !48
+  %457 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !49
+  %458 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %456, i64 %457, i1 %399) #6, !dbg !49
+  %459 = extractvalue { i32, i32, i32, i32 } %458, 0, !dbg !49
+  %460 = extractvalue { i32, i32, i32, i32 } %458, 1, !dbg !49
+  %461 = extractvalue { i32, i32, i32, i32 } %458, 2, !dbg !49
+  %462 = extractvalue { i32, i32, i32, i32 } %458, 3, !dbg !49
+  %463 = bitcast i32 %459 to float, !dbg !49
+  %464 = bitcast i32 %460 to float, !dbg !49
+  %465 = bitcast i32 %461 to float, !dbg !49
+  %466 = bitcast i32 %462 to float, !dbg !49
+  %467 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !49
+  %468 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %456, i64 %467, i1 %399) #6, !dbg !49
+  %469 = extractvalue { i32, i32, i32, i32 } %468, 0, !dbg !49
+  %470 = extractvalue { i32, i32, i32, i32 } %468, 1, !dbg !49
+  %471 = extractvalue { i32, i32, i32, i32 } %468, 2, !dbg !49
+  %472 = extractvalue { i32, i32, i32, i32 } %468, 3, !dbg !49
+  %473 = bitcast i32 %469 to float, !dbg !49
+  %474 = bitcast i32 %470 to float, !dbg !49
+  %475 = bitcast i32 %471 to float, !dbg !49
+  %476 = bitcast i32 %472 to float, !dbg !49
+  %477 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !49
+  %478 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %456, i64 %477, i1 %399) #6, !dbg !49
+  %479 = extractvalue { i32, i32, i32, i32 } %478, 0, !dbg !49
+  %480 = extractvalue { i32, i32, i32, i32 } %478, 1, !dbg !49
+  %481 = extractvalue { i32, i32, i32, i32 } %478, 2, !dbg !49
+  %482 = extractvalue { i32, i32, i32, i32 } %478, 3, !dbg !49
+  %483 = bitcast i32 %479 to float, !dbg !49
+  %484 = bitcast i32 %480 to float, !dbg !49
+  %485 = bitcast i32 %481 to float, !dbg !49
+  %486 = bitcast i32 %482 to float, !dbg !49
+  %487 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !49
+  %488 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %456, i64 %487, i1 %399) #6, !dbg !49
+  %489 = extractvalue { i32, i32, i32, i32 } %488, 0, !dbg !49
+  %490 = extractvalue { i32, i32, i32, i32 } %488, 1, !dbg !49
+  %491 = extractvalue { i32, i32, i32, i32 } %488, 2, !dbg !49
+  %492 = extractvalue { i32, i32, i32, i32 } %488, 3, !dbg !49
+  %493 = bitcast i32 %489 to float, !dbg !49
+  %494 = bitcast i32 %490 to float, !dbg !49
+  %495 = bitcast i32 %491 to float, !dbg !49
+  %496 = bitcast i32 %492 to float, !dbg !49
+  %497 = tail call float @llvm.nvvm.div.full(float %463, float 1.280000e+02), !dbg !50
+  %498 = tail call float @llvm.nvvm.div.full(float %464, float 1.280000e+02), !dbg !50
+  %499 = tail call float @llvm.nvvm.div.full(float %465, float 1.280000e+02), !dbg !50
+  %500 = tail call float @llvm.nvvm.div.full(float %466, float 1.280000e+02), !dbg !50
+  %501 = tail call float @llvm.nvvm.div.full(float %473, float 1.280000e+02), !dbg !50
+  %502 = tail call float @llvm.nvvm.div.full(float %474, float 1.280000e+02), !dbg !50
+  %503 = tail call float @llvm.nvvm.div.full(float %475, float 1.280000e+02), !dbg !50
+  %504 = tail call float @llvm.nvvm.div.full(float %476, float 1.280000e+02), !dbg !50
+  %505 = tail call float @llvm.nvvm.div.full(float %483, float 1.280000e+02), !dbg !50
+  %506 = tail call float @llvm.nvvm.div.full(float %484, float 1.280000e+02), !dbg !50
+  %507 = tail call float @llvm.nvvm.div.full(float %485, float 1.280000e+02), !dbg !50
+  %508 = tail call float @llvm.nvvm.div.full(float %486, float 1.280000e+02), !dbg !50
+  %509 = tail call float @llvm.nvvm.div.full(float %493, float 1.280000e+02), !dbg !50
+  %510 = tail call float @llvm.nvvm.div.full(float %494, float 1.280000e+02), !dbg !50
+  %511 = tail call float @llvm.nvvm.div.full(float %495, float 1.280000e+02), !dbg !50
+  %512 = tail call float @llvm.nvvm.div.full(float %496, float 1.280000e+02), !dbg !50
+  %513 = fadd float %497, 0x3EB0C6F7A0000000, !dbg !51
+  %514 = fadd float %498, 0x3EB0C6F7A0000000, !dbg !51
+  %515 = fadd float %499, 0x3EB0C6F7A0000000, !dbg !51
+  %516 = fadd float %500, 0x3EB0C6F7A0000000, !dbg !51
+  %517 = fadd float %501, 0x3EB0C6F7A0000000, !dbg !51
+  %518 = fadd float %502, 0x3EB0C6F7A0000000, !dbg !51
+  %519 = fadd float %503, 0x3EB0C6F7A0000000, !dbg !51
+  %520 = fadd float %504, 0x3EB0C6F7A0000000, !dbg !51
+  %521 = fadd float %505, 0x3EB0C6F7A0000000, !dbg !51
+  %522 = fadd float %506, 0x3EB0C6F7A0000000, !dbg !51
+  %523 = fadd float %507, 0x3EB0C6F7A0000000, !dbg !51
+  %524 = fadd float %508, 0x3EB0C6F7A0000000, !dbg !51
+  %525 = fadd float %509, 0x3EB0C6F7A0000000, !dbg !51
+  %526 = fadd float %510, 0x3EB0C6F7A0000000, !dbg !51
+  %527 = fadd float %511, 0x3EB0C6F7A0000000, !dbg !51
+  %528 = fadd float %512, 0x3EB0C6F7A0000000, !dbg !51
+  %529 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i100 = icmp eq i32 %529, 0, !dbg !52
+  br i1 %.not.i100, label %532, label %530, !dbg !52
+
+530:                                              ; preds = %__nv_rsqrtf.exit99
+  %531 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %513), !dbg !52
+  br label %__nv_rsqrtf.exit102, !dbg !52
+
+532:                                              ; preds = %__nv_rsqrtf.exit99
+  %533 = tail call float @llvm.nvvm.rsqrt.approx.f(float %513), !dbg !52
+  br label %__nv_rsqrtf.exit102, !dbg !52
+
+__nv_rsqrtf.exit102:                              ; preds = %530, %532
+  %.0.i101 = phi float [ %531, %530 ], [ %533, %532 ], !dbg !52
+  %534 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i103 = icmp eq i32 %534, 0, !dbg !52
+  br i1 %.not.i103, label %537, label %535, !dbg !52
+
+535:                                              ; preds = %__nv_rsqrtf.exit102
+  %536 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %514), !dbg !52
+  br label %__nv_rsqrtf.exit105, !dbg !52
+
+537:                                              ; preds = %__nv_rsqrtf.exit102
+  %538 = tail call float @llvm.nvvm.rsqrt.approx.f(float %514), !dbg !52
+  br label %__nv_rsqrtf.exit105, !dbg !52
+
+__nv_rsqrtf.exit105:                              ; preds = %535, %537
+  %.0.i104 = phi float [ %536, %535 ], [ %538, %537 ], !dbg !52
+  %539 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i106 = icmp eq i32 %539, 0, !dbg !52
+  br i1 %.not.i106, label %542, label %540, !dbg !52
+
+540:                                              ; preds = %__nv_rsqrtf.exit105
+  %541 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %515), !dbg !52
+  br label %__nv_rsqrtf.exit108, !dbg !52
+
+542:                                              ; preds = %__nv_rsqrtf.exit105
+  %543 = tail call float @llvm.nvvm.rsqrt.approx.f(float %515), !dbg !52
+  br label %__nv_rsqrtf.exit108, !dbg !52
+
+__nv_rsqrtf.exit108:                              ; preds = %540, %542
+  %.0.i107 = phi float [ %541, %540 ], [ %543, %542 ], !dbg !52
+  %544 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i109 = icmp eq i32 %544, 0, !dbg !52
+  br i1 %.not.i109, label %547, label %545, !dbg !52
+
+545:                                              ; preds = %__nv_rsqrtf.exit108
+  %546 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %516), !dbg !52
+  br label %__nv_rsqrtf.exit111, !dbg !52
+
+547:                                              ; preds = %__nv_rsqrtf.exit108
+  %548 = tail call float @llvm.nvvm.rsqrt.approx.f(float %516), !dbg !52
+  br label %__nv_rsqrtf.exit111, !dbg !52
+
+__nv_rsqrtf.exit111:                              ; preds = %545, %547
+  %.0.i110 = phi float [ %546, %545 ], [ %548, %547 ], !dbg !52
+  %549 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i112 = icmp eq i32 %549, 0, !dbg !52
+  br i1 %.not.i112, label %552, label %550, !dbg !52
+
+550:                                              ; preds = %__nv_rsqrtf.exit111
+  %551 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %517), !dbg !52
+  br label %__nv_rsqrtf.exit114, !dbg !52
+
+552:                                              ; preds = %__nv_rsqrtf.exit111
+  %553 = tail call float @llvm.nvvm.rsqrt.approx.f(float %517), !dbg !52
+  br label %__nv_rsqrtf.exit114, !dbg !52
+
+__nv_rsqrtf.exit114:                              ; preds = %550, %552
+  %.0.i113 = phi float [ %551, %550 ], [ %553, %552 ], !dbg !52
+  %554 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i115 = icmp eq i32 %554, 0, !dbg !52
+  br i1 %.not.i115, label %557, label %555, !dbg !52
+
+555:                                              ; preds = %__nv_rsqrtf.exit114
+  %556 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %518), !dbg !52
+  br label %__nv_rsqrtf.exit117, !dbg !52
+
+557:                                              ; preds = %__nv_rsqrtf.exit114
+  %558 = tail call float @llvm.nvvm.rsqrt.approx.f(float %518), !dbg !52
+  br label %__nv_rsqrtf.exit117, !dbg !52
+
+__nv_rsqrtf.exit117:                              ; preds = %555, %557
+  %.0.i116 = phi float [ %556, %555 ], [ %558, %557 ], !dbg !52
+  %559 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i118 = icmp eq i32 %559, 0, !dbg !52
+  br i1 %.not.i118, label %562, label %560, !dbg !52
+
+560:                                              ; preds = %__nv_rsqrtf.exit117
+  %561 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %519), !dbg !52
+  br label %__nv_rsqrtf.exit120, !dbg !52
+
+562:                                              ; preds = %__nv_rsqrtf.exit117
+  %563 = tail call float @llvm.nvvm.rsqrt.approx.f(float %519), !dbg !52
+  br label %__nv_rsqrtf.exit120, !dbg !52
+
+__nv_rsqrtf.exit120:                              ; preds = %560, %562
+  %.0.i119 = phi float [ %561, %560 ], [ %563, %562 ], !dbg !52
+  %564 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i121 = icmp eq i32 %564, 0, !dbg !52
+  br i1 %.not.i121, label %567, label %565, !dbg !52
+
+565:                                              ; preds = %__nv_rsqrtf.exit120
+  %566 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %520), !dbg !52
+  br label %__nv_rsqrtf.exit123, !dbg !52
+
+567:                                              ; preds = %__nv_rsqrtf.exit120
+  %568 = tail call float @llvm.nvvm.rsqrt.approx.f(float %520), !dbg !52
+  br label %__nv_rsqrtf.exit123, !dbg !52
+
+__nv_rsqrtf.exit123:                              ; preds = %565, %567
+  %.0.i122 = phi float [ %566, %565 ], [ %568, %567 ], !dbg !52
+  %569 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i124 = icmp eq i32 %569, 0, !dbg !52
+  br i1 %.not.i124, label %572, label %570, !dbg !52
+
+570:                                              ; preds = %__nv_rsqrtf.exit123
+  %571 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %521), !dbg !52
+  br label %__nv_rsqrtf.exit126, !dbg !52
+
+572:                                              ; preds = %__nv_rsqrtf.exit123
+  %573 = tail call float @llvm.nvvm.rsqrt.approx.f(float %521), !dbg !52
+  br label %__nv_rsqrtf.exit126, !dbg !52
+
+__nv_rsqrtf.exit126:                              ; preds = %570, %572
+  %.0.i125 = phi float [ %571, %570 ], [ %573, %572 ], !dbg !52
+  %574 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i127 = icmp eq i32 %574, 0, !dbg !52
+  br i1 %.not.i127, label %577, label %575, !dbg !52
+
+575:                                              ; preds = %__nv_rsqrtf.exit126
+  %576 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %522), !dbg !52
+  br label %__nv_rsqrtf.exit129, !dbg !52
+
+577:                                              ; preds = %__nv_rsqrtf.exit126
+  %578 = tail call float @llvm.nvvm.rsqrt.approx.f(float %522), !dbg !52
+  br label %__nv_rsqrtf.exit129, !dbg !52
+
+__nv_rsqrtf.exit129:                              ; preds = %575, %577
+  %.0.i128 = phi float [ %576, %575 ], [ %578, %577 ], !dbg !52
+  %579 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i130 = icmp eq i32 %579, 0, !dbg !52
+  br i1 %.not.i130, label %582, label %580, !dbg !52
+
+580:                                              ; preds = %__nv_rsqrtf.exit129
+  %581 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %523), !dbg !52
+  br label %__nv_rsqrtf.exit132, !dbg !52
+
+582:                                              ; preds = %__nv_rsqrtf.exit129
+  %583 = tail call float @llvm.nvvm.rsqrt.approx.f(float %523), !dbg !52
+  br label %__nv_rsqrtf.exit132, !dbg !52
+
+__nv_rsqrtf.exit132:                              ; preds = %580, %582
+  %.0.i131 = phi float [ %581, %580 ], [ %583, %582 ], !dbg !52
+  %584 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i133 = icmp eq i32 %584, 0, !dbg !52
+  br i1 %.not.i133, label %587, label %585, !dbg !52
+
+585:                                              ; preds = %__nv_rsqrtf.exit132
+  %586 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %524), !dbg !52
+  br label %__nv_rsqrtf.exit135, !dbg !52
+
+587:                                              ; preds = %__nv_rsqrtf.exit132
+  %588 = tail call float @llvm.nvvm.rsqrt.approx.f(float %524), !dbg !52
+  br label %__nv_rsqrtf.exit135, !dbg !52
+
+__nv_rsqrtf.exit135:                              ; preds = %585, %587
+  %.0.i134 = phi float [ %586, %585 ], [ %588, %587 ], !dbg !52
+  %589 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i136 = icmp eq i32 %589, 0, !dbg !52
+  br i1 %.not.i136, label %592, label %590, !dbg !52
+
+590:                                              ; preds = %__nv_rsqrtf.exit135
+  %591 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %525), !dbg !52
+  br label %__nv_rsqrtf.exit138, !dbg !52
+
+592:                                              ; preds = %__nv_rsqrtf.exit135
+  %593 = tail call float @llvm.nvvm.rsqrt.approx.f(float %525), !dbg !52
+  br label %__nv_rsqrtf.exit138, !dbg !52
+
+__nv_rsqrtf.exit138:                              ; preds = %590, %592
+  %.0.i137 = phi float [ %591, %590 ], [ %593, %592 ], !dbg !52
+  %594 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i139 = icmp eq i32 %594, 0, !dbg !52
+  br i1 %.not.i139, label %597, label %595, !dbg !52
+
+595:                                              ; preds = %__nv_rsqrtf.exit138
+  %596 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %526), !dbg !52
+  br label %__nv_rsqrtf.exit141, !dbg !52
+
+597:                                              ; preds = %__nv_rsqrtf.exit138
+  %598 = tail call float @llvm.nvvm.rsqrt.approx.f(float %526), !dbg !52
+  br label %__nv_rsqrtf.exit141, !dbg !52
+
+__nv_rsqrtf.exit141:                              ; preds = %595, %597
+  %.0.i140 = phi float [ %596, %595 ], [ %598, %597 ], !dbg !52
+  %599 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i142 = icmp eq i32 %599, 0, !dbg !52
+  br i1 %.not.i142, label %602, label %600, !dbg !52
+
+600:                                              ; preds = %__nv_rsqrtf.exit141
+  %601 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %527), !dbg !52
+  br label %__nv_rsqrtf.exit144, !dbg !52
+
+602:                                              ; preds = %__nv_rsqrtf.exit141
+  %603 = tail call float @llvm.nvvm.rsqrt.approx.f(float %527), !dbg !52
+  br label %__nv_rsqrtf.exit144, !dbg !52
+
+__nv_rsqrtf.exit144:                              ; preds = %600, %602
+  %.0.i143 = phi float [ %601, %600 ], [ %603, %602 ], !dbg !52
+  %604 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i145 = icmp eq i32 %604, 0, !dbg !52
+  br i1 %.not.i145, label %607, label %605, !dbg !52
+
+605:                                              ; preds = %__nv_rsqrtf.exit144
+  %606 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %528), !dbg !52
+  br label %__nv_rsqrtf.exit147, !dbg !52
+
+607:                                              ; preds = %__nv_rsqrtf.exit144
+  %608 = tail call float @llvm.nvvm.rsqrt.approx.f(float %528), !dbg !52
+  br label %__nv_rsqrtf.exit147, !dbg !52
+
+__nv_rsqrtf.exit147:                              ; preds = %605, %607
+  %.0.i146 = phi float [ %606, %605 ], [ %608, %607 ], !dbg !52
+  %609 = icmp slt i32 %23, 73728, !dbg !53
+  %610 = icmp slt i32 %24, 8192, !dbg !23
+  %611 = extractvalue { i32, i32, i32, i32 } %380, 3, !dbg !38
+  %612 = extractvalue { i32, i32, i32, i32 } %386, 3, !dbg !40
+  %613 = bitcast i32 %612 to <2 x bfloat>, !dbg !40
+  %614 = extractvalue { i32, i32, i32, i32 } %380, 2, !dbg !38
+  %615 = extractvalue { i32, i32, i32, i32 } %380, 1, !dbg !38
+  %616 = extractvalue { i32, i32, i32, i32 } %386, 2, !dbg !40
+  %617 = bitcast i32 %616 to <2 x bfloat>, !dbg !40
+  %618 = extractvalue { i32, i32, i32, i32 } %380, 0, !dbg !38
+  %619 = extractvalue { i32, i32, i32, i32 } %378, 3, !dbg !38
+  %620 = extractvalue { i32, i32, i32, i32 } %386, 1, !dbg !40
+  %621 = bitcast i32 %620 to <2 x bfloat>, !dbg !40
+  %622 = extractvalue { i32, i32, i32, i32 } %378, 2, !dbg !38
+  %623 = extractvalue { i32, i32, i32, i32 } %378, 1, !dbg !38
+  %624 = extractvalue { i32, i32, i32, i32 } %386, 0, !dbg !40
+  %625 = bitcast i32 %624 to <2 x bfloat>, !dbg !40
+  %626 = extractvalue { i32, i32, i32, i32 } %378, 0, !dbg !38
+  %627 = extractvalue { i32, i32, i32, i32 } %379, 3, !dbg !38
+  %628 = extractvalue { i32, i32, i32, i32 } %384, 3, !dbg !40
+  %629 = bitcast i32 %628 to <2 x bfloat>, !dbg !40
+  %630 = extractvalue { i32, i32, i32, i32 } %379, 2, !dbg !38
+  %631 = extractvalue { i32, i32, i32, i32 } %379, 1, !dbg !38
+  %632 = extractvalue { i32, i32, i32, i32 } %384, 2, !dbg !40
+  %633 = bitcast i32 %632 to <2 x bfloat>, !dbg !40
+  %634 = extractvalue { i32, i32, i32, i32 } %379, 0, !dbg !38
+  %635 = extractvalue { i32, i32, i32, i32 } %376, 3, !dbg !38
+  %636 = extractvalue { i32, i32, i32, i32 } %384, 1, !dbg !40
+  %637 = bitcast i32 %636 to <2 x bfloat>, !dbg !40
+  %638 = extractvalue { i32, i32, i32, i32 } %376, 2, !dbg !38
+  %639 = extractvalue { i32, i32, i32, i32 } %376, 1, !dbg !38
+  %640 = extractvalue { i32, i32, i32, i32 } %384, 0, !dbg !40
+  %641 = bitcast i32 %640 to <2 x bfloat>, !dbg !40
+  %642 = extractvalue { i32, i32, i32, i32 } %376, 0, !dbg !38
+  %643 = shufflevector <2 x bfloat> %444, <2 x bfloat> %448, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>, !dbg !45
+  %644 = shufflevector <2 x bfloat> %445, <2 x bfloat> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>, !dbg !45
+  %645 = shufflevector <4 x bfloat> %643, <4 x bfloat> %644, <4 x i32> <i32 0, i32 1, i32 4, i32 poison>, !dbg !45
+  %646 = shufflevector <2 x bfloat> %449, <2 x bfloat> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>, !dbg !45
+  %647 = shufflevector <4 x bfloat> %645, <4 x bfloat> %646, <4 x i32> <i32 0, i32 1, i32 2, i32 4>, !dbg !45
+  %648 = fpext <4 x bfloat> %647 to <4 x float>, !dbg !45
+  %649 = insertelement <4 x float> poison, float %.0.i101, i64 0, !dbg !54
+  %650 = insertelement <4 x float> %649, float %.0.i125, i64 1, !dbg !54
+  %651 = insertelement <4 x float> %650, float %.0.i104, i64 2, !dbg !54
+  %652 = insertelement <4 x float> %651, float %.0.i128, i64 3, !dbg !54
+  %653 = fmul <4 x float> %652, %648, !dbg !54
+  %654 = shufflevector <2 x bfloat> %446, <2 x bfloat> %450, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>, !dbg !45
+  %655 = shufflevector <2 x bfloat> %447, <2 x bfloat> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>, !dbg !45
+  %656 = shufflevector <4 x bfloat> %654, <4 x bfloat> %655, <4 x i32> <i32 0, i32 1, i32 4, i32 poison>, !dbg !45
+  %657 = shufflevector <2 x bfloat> %451, <2 x bfloat> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>, !dbg !45
+  %658 = shufflevector <4 x bfloat> %656, <4 x bfloat> %657, <4 x i32> <i32 0, i32 1, i32 2, i32 4>, !dbg !45
+  %659 = fpext <4 x bfloat> %658 to <4 x float>, !dbg !45
+  %660 = insertelement <4 x float> poison, float %.0.i107, i64 0, !dbg !54
+  %661 = insertelement <4 x float> %660, float %.0.i131, i64 1, !dbg !54
+  %662 = insertelement <4 x float> %661, float %.0.i110, i64 2, !dbg !54
+  %663 = insertelement <4 x float> %662, float %.0.i134, i64 3, !dbg !54
+  %664 = fmul <4 x float> %663, %659, !dbg !54
+  %665 = shufflevector <2 x bfloat> %444, <2 x bfloat> %448, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>, !dbg !45
+  %666 = shufflevector <2 x bfloat> %445, <2 x bfloat> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>, !dbg !45
+  %667 = shufflevector <4 x bfloat> %665, <4 x bfloat> %666, <4 x i32> <i32 0, i32 1, i32 5, i32 poison>, !dbg !45
+  %668 = shufflevector <2 x bfloat> %449, <2 x bfloat> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>, !dbg !45
+  %669 = shufflevector <4 x bfloat> %667, <4 x bfloat> %668, <4 x i32> <i32 0, i32 1, i32 2, i32 5>, !dbg !45
+  %670 = fpext <4 x bfloat> %669 to <4 x float>, !dbg !45
+  %671 = insertelement <4 x float> poison, float %.0.i113, i64 0, !dbg !54
+  %672 = insertelement <4 x float> %671, float %.0.i137, i64 1, !dbg !54
+  %673 = insertelement <4 x float> %672, float %.0.i116, i64 2, !dbg !54
+  %674 = insertelement <4 x float> %673, float %.0.i140, i64 3, !dbg !54
+  %675 = fmul <4 x float> %674, %670, !dbg !54
+  %676 = shufflevector <2 x bfloat> %446, <2 x bfloat> %450, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>, !dbg !45
+  %677 = shufflevector <2 x bfloat> %447, <2 x bfloat> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>, !dbg !45
+  %678 = shufflevector <4 x bfloat> %676, <4 x bfloat> %677, <4 x i32> <i32 0, i32 1, i32 5, i32 poison>, !dbg !45
+  %679 = shufflevector <2 x bfloat> %451, <2 x bfloat> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>, !dbg !45
+  %680 = shufflevector <4 x bfloat> %678, <4 x bfloat> %679, <4 x i32> <i32 0, i32 1, i32 2, i32 5>, !dbg !45
+  %681 = fpext <4 x bfloat> %680 to <4 x float>, !dbg !45
+  %682 = insertelement <4 x float> poison, float %.0.i119, i64 0, !dbg !54
+  %683 = insertelement <4 x float> %682, float %.0.i143, i64 1, !dbg !54
+  %684 = insertelement <4 x float> %683, float %.0.i122, i64 2, !dbg !54
+  %685 = insertelement <4 x float> %684, float %.0.i146, i64 3, !dbg !54
+  %686 = fmul <4 x float> %685, %681, !dbg !54
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54
+  store <4 x float> %653, ptr addrspace(3) %364, align 16, !dbg !54
+  store <4 x float> %664, ptr addrspace(3) %366, align 16, !dbg !54
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54
+  %687 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) %375), !dbg !54
+  %688 = extractvalue { i32, i32, i32, i32 } %687, 0, !dbg !54
+  %689 = extractvalue { i32, i32, i32, i32 } %687, 1, !dbg !54
+  %690 = extractvalue { i32, i32, i32, i32 } %687, 2, !dbg !54
+  %691 = extractvalue { i32, i32, i32, i32 } %687, 3, !dbg !54
+  %692 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) nonnull %377), !dbg !54
+  %693 = extractvalue { i32, i32, i32, i32 } %692, 0, !dbg !54
+  %694 = extractvalue { i32, i32, i32, i32 } %692, 1, !dbg !54
+  %695 = extractvalue { i32, i32, i32, i32 } %692, 2, !dbg !54
+  %696 = extractvalue { i32, i32, i32, i32 } %692, 3, !dbg !54
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54
+  store <4 x float> %675, ptr addrspace(3) %364, align 16, !dbg !54
+  store <4 x float> %686, ptr addrspace(3) %366, align 16, !dbg !54
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54
+  %697 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) %375), !dbg !54
+  %698 = extractvalue { i32, i32, i32, i32 } %697, 0, !dbg !54
+  %699 = extractvalue { i32, i32, i32, i32 } %697, 1, !dbg !54
+  %700 = extractvalue { i32, i32, i32, i32 } %697, 2, !dbg !54
+  %701 = extractvalue { i32, i32, i32, i32 } %697, 3, !dbg !54
+  %702 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) nonnull %377), !dbg !54
+  %703 = extractvalue { i32, i32, i32, i32 } %702, 0, !dbg !54
+  %704 = extractvalue { i32, i32, i32, i32 } %702, 1, !dbg !54
+  %705 = extractvalue { i32, i32, i32, i32 } %702, 2, !dbg !54
+  %706 = extractvalue { i32, i32, i32, i32 } %702, 3, !dbg !54
+  %707 = getelementptr bfloat, ptr addrspace(1) %5, i64 %381, !dbg !55
+  %708 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !56
+  %709 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %707, i64 %708, i1 %395) #6, !dbg !56
+  %710 = extractvalue { i32, i32, i32, i32 } %709, 0, !dbg !56
+  %711 = bitcast i32 %710 to <2 x bfloat>, !dbg !56
+  %712 = extractvalue { i32, i32, i32, i32 } %709, 1, !dbg !56
+  %713 = bitcast i32 %712 to <2 x bfloat>, !dbg !56
+  %714 = extractvalue { i32, i32, i32, i32 } %709, 2, !dbg !56
+  %715 = bitcast i32 %714 to <2 x bfloat>, !dbg !56
+  %716 = extractvalue { i32, i32, i32, i32 } %709, 3, !dbg !56
+  %717 = bitcast i32 %716 to <2 x bfloat>, !dbg !56
+  %718 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !56
+  %719 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %707, i64 %718, i1 %398) #6, !dbg !56
+  %720 = extractvalue { i32, i32, i32, i32 } %719, 0, !dbg !56
+  %721 = bitcast i32 %720 to <2 x bfloat>, !dbg !56
+  %722 = extractvalue { i32, i32, i32, i32 } %719, 1, !dbg !56
+  %723 = bitcast i32 %722 to <2 x bfloat>, !dbg !56
+  %724 = extractvalue { i32, i32, i32, i32 } %719, 2, !dbg !56
+  %725 = bitcast i32 %724 to <2 x bfloat>, !dbg !56
+  %726 = extractvalue { i32, i32, i32, i32 } %719, 3, !dbg !56
+  %727 = bitcast i32 %726 to <2 x bfloat>, !dbg !56
+  %728 = shl i32 %23, 7, !dbg !57
+  %729 = shl i32 %24, 7, !dbg !57
+  %730 = add i32 %728, %33, !dbg !58
+  %731 = add i32 %729, %33, !dbg !58
+  %732 = sext i32 %730 to i64, !dbg !59
+  %733 = getelementptr bfloat, ptr addrspace(1) %6, i64 %732, !dbg !59
+  %734 = sext i32 %731 to i64, !dbg !59
+  %735 = getelementptr bfloat, ptr addrspace(1) %6, i64 %734, !dbg !59
+  %736 = and i1 %35, %609, !dbg !60
+  %737 = insertelement <2 x i32> poison, i32 %642, i64 0, !dbg !38
+  %738 = insertelement <2 x i32> %737, i32 %639, i64 1, !dbg !38
+  %739 = bitcast <2 x i32> %738 to <2 x float>, !dbg !38
+  %740 = fpext <2 x bfloat> %641 to <2 x float>, !dbg !61
+  %741 = fmul <2 x float> %739, %740, !dbg !62
+  %742 = insertelement <2 x i32> poison, i32 %688, i64 0, !dbg !54
+  %743 = insertelement <2 x i32> %742, i32 %689, i64 1, !dbg !54
+  %744 = bitcast <2 x i32> %743 to <2 x float>, !dbg !54
+  %745 = fpext <2 x bfloat> %711 to <2 x float>, !dbg !63
+  %746 = fmul <2 x float> %744, %745, !dbg !64
+  %747 = insertelement <2 x i1> poison, i1 %42, i64 0, !dbg !65
+  %748 = shufflevector <2 x i1> %747, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !65
+  %749 = select <2 x i1> %748, <2 x float> %741, <2 x float> %746, !dbg !65
+  %750 = fptrunc <2 x float> %749 to <2 x bfloat>, !dbg !66
+  %751 = insertelement <2 x i32> poison, i32 %638, i64 0, !dbg !38
+  %752 = insertelement <2 x i32> %751, i32 %635, i64 1, !dbg !38
+  %753 = bitcast <2 x i32> %752 to <2 x float>, !dbg !38
+  %754 = fpext <2 x bfloat> %637 to <2 x float>, !dbg !61
+  %755 = fmul <2 x float> %753, %754, !dbg !62
+  %756 = insertelement <2 x i32> poison, i32 %690, i64 0, !dbg !54
+  %757 = insertelement <2 x i32> %756, i32 %691, i64 1, !dbg !54
+  %758 = bitcast <2 x i32> %757 to <2 x float>, !dbg !54
+  %759 = fpext <2 x bfloat> %713 to <2 x float>, !dbg !63
+  %760 = fmul <2 x float> %758, %759, !dbg !64
+  %761 = select <2 x i1> %748, <2 x float> %755, <2 x float> %760, !dbg !65
+  %762 = fptrunc <2 x float> %761 to <2 x bfloat>, !dbg !66
+  %763 = insertelement <2 x i32> poison, i32 %634, i64 0, !dbg !38
+  %764 = insertelement <2 x i32> %763, i32 %631, i64 1, !dbg !38
+  %765 = bitcast <2 x i32> %764 to <2 x float>, !dbg !38
+  %766 = fpext <2 x bfloat> %633 to <2 x float>, !dbg !61
+  %767 = fmul <2 x float> %765, %766, !dbg !62
+  %768 = insertelement <2 x i32> poison, i32 %698, i64 0, !dbg !54
+  %769 = insertelement <2 x i32> %768, i32 %699, i64 1, !dbg !54
+  %770 = bitcast <2 x i32> %769 to <2 x float>, !dbg !54
+  %771 = fpext <2 x bfloat> %715 to <2 x float>, !dbg !63
+  %772 = fmul <2 x float> %770, %771, !dbg !64
+  %773 = select <2 x i1> %748, <2 x float> %767, <2 x float> %772, !dbg !65
+  %774 = fptrunc <2 x float> %773 to <2 x bfloat>, !dbg !66
+  %775 = insertelement <2 x i32> poison, i32 %630, i64 0, !dbg !38
+  %776 = insertelement <2 x i32> %775, i32 %627, i64 1, !dbg !38
+  %777 = bitcast <2 x i32> %776 to <2 x float>, !dbg !38
+  %778 = fpext <2 x bfloat> %629 to <2 x float>, !dbg !61
+  %779 = fmul <2 x float> %777, %778, !dbg !62
+  %780 = insertelement <2 x i32> poison, i32 %700, i64 0, !dbg !54
+  %781 = insertelement <2 x i32> %780, i32 %701, i64 1, !dbg !54
+  %782 = bitcast <2 x i32> %781 to <2 x float>, !dbg !54
+  %783 = fpext <2 x bfloat> %717 to <2 x float>, !dbg !63
+  %784 = fmul <2 x float> %782, %783, !dbg !64
+  %785 = select <2 x i1> %748, <2 x float> %779, <2 x float> %784, !dbg !65
+  %786 = fptrunc <2 x float> %785 to <2 x bfloat>, !dbg !66
+  %787 = insertelement <2 x i32> poison, i32 %626, i64 0, !dbg !38
+  %788 = insertelement <2 x i32> %787, i32 %623, i64 1, !dbg !38
+  %789 = bitcast <2 x i32> %788 to <2 x float>, !dbg !38
+  %790 = fpext <2 x bfloat> %625 to <2 x float>, !dbg !61
+  %791 = fmul <2 x float> %789, %790, !dbg !62
+  %792 = insertelement <2 x i32> poison, i32 %693, i64 0, !dbg !54
+  %793 = insertelement <2 x i32> %792, i32 %694, i64 1, !dbg !54
+  %794 = bitcast <2 x i32> %793 to <2 x float>, !dbg !54
+  %795 = fpext <2 x bfloat> %721 to <2 x float>, !dbg !63
+  %796 = fmul <2 x float> %794, %795, !dbg !64
+  %797 = insertelement <2 x i1> poison, i1 %610, i64 0, !dbg !65
+  %798 = shufflevector <2 x i1> %797, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !65
+  %799 = select <2 x i1> %798, <2 x float> %791, <2 x float> %796, !dbg !65
+  %800 = fptrunc <2 x float> %799 to <2 x bfloat>, !dbg !66
+  %801 = insertelement <2 x i32> poison, i32 %622, i64 0, !dbg !38
+  %802 = insertelement <2 x i32> %801, i32 %619, i64 1, !dbg !38
+  %803 = bitcast <2 x i32> %802 to <2 x float>, !dbg !38
+  %804 = fpext <2 x bfloat> %621 to <2 x float>, !dbg !61
+  %805 = fmul <2 x float> %803, %804, !dbg !62
+  %806 = insertelement <2 x i32> poison, i32 %695, i64 0, !dbg !54
+  %807 = insertelement <2 x i32> %806, i32 %696, i64 1, !dbg !54
+  %808 = bitcast <2 x i32> %807 to <2 x float>, !dbg !54
+  %809 = fpext <2 x bfloat> %723 to <2 x float>, !dbg !63
+  %810 = fmul <2 x float> %808, %809, !dbg !64
+  %811 = select <2 x i1> %798, <2 x float> %805, <2 x float> %810, !dbg !65
+  %812 = fptrunc <2 x float> %811 to <2 x bfloat>, !dbg !66
+  %813 = insertelement <2 x i32> poison, i32 %618, i64 0, !dbg !38
+  %814 = insertelement <2 x i32> %813, i32 %615, i64 1, !dbg !38
+  %815 = bitcast <2 x i32> %814 to <2 x float>, !dbg !38
+  %816 = fpext <2 x bfloat> %617 to <2 x float>, !dbg !61
+  %817 = fmul <2 x float> %815, %816, !dbg !62
+  %818 = insertelement <2 x i32> poison, i32 %703, i64 0, !dbg !54
+  %819 = insertelement <2 x i32> %818, i32 %704, i64 1, !dbg !54
+  %820 = bitcast <2 x i32> %819 to <2 x float>, !dbg !54
+  %821 = fpext <2 x bfloat> %725 to <2 x float>, !dbg !63
+  %822 = fmul <2 x float> %820, %821, !dbg !64
+  %823 = select <2 x i1> %798, <2 x float> %817, <2 x float> %822, !dbg !65
+  %824 = fptrunc <2 x float> %823 to <2 x bfloat>, !dbg !66
+  %825 = insertelement <2 x i32> poison, i32 %614, i64 0, !dbg !38
+  %826 = insertelement <2 x i32> %825, i32 %611, i64 1, !dbg !38
+  %827 = bitcast <2 x i32> %826 to <2 x float>, !dbg !38
+  %828 = fpext <2 x bfloat> %613 to <2 x float>, !dbg !61
+  %829 = fmul <2 x float> %827, %828, !dbg !62
+  %830 = insertelement <2 x i32> poison, i32 %705, i64 0, !dbg !54
+  %831 = insertelement <2 x i32> %830, i32 %706, i64 1, !dbg !54
+  %832 = bitcast <2 x i32> %831 to <2 x float>, !dbg !54
+  %833 = fpext <2 x bfloat> %727 to <2 x float>, !dbg !63
+  %834 = fmul <2 x float> %832, %833, !dbg !64
+  %835 = select <2 x i1> %798, <2 x float> %829, <2 x float> %834, !dbg !65
+  %836 = fptrunc <2 x float> %835 to <2 x bfloat>, !dbg !66
+  %837 = bitcast <2 x bfloat> %750 to i32, !dbg !66
+  %838 = bitcast <2 x bfloat> %762 to i32, !dbg !66
+  %839 = bitcast <2 x bfloat> %774 to i32, !dbg !66
+  %840 = bitcast <2 x bfloat> %786 to i32, !dbg !66
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %837, i32 %838, i32 %839, i32 %840, ptr addrspace(1) %733, i1 %736) #6, !dbg !66
+  %841 = bitcast <2 x bfloat> %800 to i32, !dbg !66
+  %842 = bitcast <2 x bfloat> %812 to i32, !dbg !66
+  %843 = bitcast <2 x bfloat> %824 to i32, !dbg !66
+  %844 = bitcast <2 x bfloat> %836 to i32, !dbg !66
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %841, i32 %842, i32 %843, i32 %844, ptr addrspace(1) %735, i1 %736) #6, !dbg !66
+  ret void, !dbg !67
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #3
+
+; Function Attrs: nocallback nofree nounwind memory(argmem: read)
+declare { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) readonly captures(none)) #4
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #3
+
+attributes #0 = { nounwind "nvvm.reqntid"="256" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind }
+attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #4 = { nocallback nofree nounwind memory(argmem: read) }
+attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!5 = distinct !DISubprogram(name: "triton_poi_fused__fused_rms_norm_cat_view_2", linkageName: "triton_poi_fused__fused_rms_norm_cat_view_2", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 21, column: 29, scope: !5)
+!9 = !DILocation(line: 21, column: 48, scope: !5)
+!10 = !DILocation(line: 21, column: 69, scope: !5)
+!11 = !DILocation(line: 21, column: 53, scope: !5)
+!12 = !DILocation(line: 21, column: 34, scope: !5)
+!13 = !DILocation(line: 21, column: 75, scope: !5)
+!14 = !DILocation(line: 22, column: 44, scope: !5)
+!15 = !DILocation(line: 22, column: 23, scope: !5)
+!16 = !DILocation(line: 24, column: 28, scope: !5)
+!17 = !DILocation(line: 24, column: 33, scope: !5)
+!18 = !DILocation(line: 25, column: 44, scope: !5)
+!19 = !DILocation(line: 25, column: 23, scope: !5)
+!20 = !DILocation(line: 26, column: 21, scope: !5)
+!21 = !DILocation(line: 27, column: 19, scope: !5)
+!22 = !DILocation(line: 29, column: 19, scope: !5)
+!23 = !DILocation(line: 35, column: 18, scope: !5)
+!24 = !DILocation(line: 36, column: 39, scope: !5)
+!25 = !DILocation(line: 36, column: 35, scope: !5)
+!26 = !DILocation(line: 36, column: 51, scope: !5)
+!27 = !DILocation(line: 36, column: 44, scope: !5)
+!28 = !DILocation(line: 36, column: 30, scope: !5)
+!29 = !DILocation(line: 36, column: 64, scope: !5)
+!30 = !DILocation(line: 36, column: 72, scope: !5)
+!31 = !DILocation(line: 36, column: 57, scope: !5)
+!32 = !DILocation(line: 36, column: 123, scope: !5)
+!33 = !DILocation(line: 38, column: 30, scope: !5)
+!34 = !DILocation(line: 38, column: 80, scope: !5)
+!35 = !DILocation(line: 40, column: 19, scope: !5)
+!36 = !DILocation(line: 42, column: 19, scope: !5)
+!37 = !DILocation(line: 43, column: 28, scope: !5)
+!38 = !DILocation(line: 44, column: 19, scope: !5)
+!39 = !DILocation(line: 45, column: 31, scope: !5)
+!40 = !DILocation(line: 45, column: 71, scope: !5)
+!41 = !DILocation(line: 54, column: 45, scope: !5)
+!42 = !DILocation(line: 54, column: 31, scope: !5)
+!43 = !DILocation(line: 54, column: 83, scope: !5)
+!44 = !DILocation(line: 54, column: 67, scope: !5)
+!45 = !DILocation(line: 54, column: 134, scope: !5)
+!46 = !DILocation(line: 56, column: 56, scope: !5)
+!47 = !DILocation(line: 56, column: 52, scope: !5)
+!48 = !DILocation(line: 56, column: 31, scope: !5)
+!49 = !DILocation(line: 56, column: 90, scope: !5)
+!50 = !DILocation(line: 58, column: 21, scope: !5)
+!51 = !DILocation(line: 60, column: 20, scope: !5)
+!52 = !DILocation(line: 61, column: 28, scope: !5)
+!53 = !DILocation(line: 23, column: 21, scope: !5)
+!54 = !DILocation(line: 62, column: 20, scope: !5)
+!55 = !DILocation(line: 63, column: 31, scope: !5)
+!56 = !DILocation(line: 63, column: 71, scope: !5)
+!57 = !DILocation(line: 70, column: 34, scope: !5)
+!58 = !DILocation(line: 70, column: 30, scope: !5)
+!59 = !DILocation(line: 70, column: 25, scope: !5)
+!60 = !DILocation(line: 70, column: 54, scope: !5)
+!61 = !DILocation(line: 45, column: 137, scope: !5)
+!62 = !DILocation(line: 47, column: 20, scope: !5)
+!63 = !DILocation(line: 63, column: 138, scope: !5)
+!64 = !DILocation(line: 65, column: 20, scope: !5)
+!65 = !DILocation(line: 0, scope: !5)
+!66 = !DILocation(line: 70, column: 46, scope: !5)
+!67 = !DILocation(line: 70, column: 4, scope: !5)
diff --git a/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.ptx b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..5a427e450939597d2805f81b8a4a782ae1d609b0
--- /dev/null
+++ b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.ptx
@@ -0,0 +1,1038 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused__fused_rms_norm_cat_view_2 // -- Begin function triton_poi_fused__fused_rms_norm_cat_view_2
+.extern .shared .align 16 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
+                                        // @triton_poi_fused__fused_rms_norm_cat_view_2
+.visible .entry triton_poi_fused__fused_rms_norm_cat_view_2(
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_1,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_4,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_5,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_6,
+	.param .u32 triton_poi_fused__fused_rms_norm_cat_view_2_param_7,
+	.param .u32 triton_poi_fused__fused_rms_norm_cat_view_2_param_8,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_9,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_10
+)
+.reqntid 256
+{
+	.reg .pred 	%p<17>;
+	.reg .b16 	%rs<65>;
+	.reg .b32 	%r<452>;
+	.reg .b64 	%rd<35>;
+	.loc	1 18 0                          // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:18:0
+
+// %bb.0:                               // %__nv_rsqrtf.exit
+	ld.param.b64 	%rd27, [triton_poi_fused__fused_rms_norm_cat_view_2_param_0];
+	ld.param.b64 	%rd28, [triton_poi_fused__fused_rms_norm_cat_view_2_param_1];
+$L__tmp0:
+	.loc	1 21 29                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:29
+	mov.u32 	%r74, %ctaid.y;
+	ld.param.b64 	%rd29, [triton_poi_fused__fused_rms_norm_cat_view_2_param_2];
+	.loc	1 21 48                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:48
+	mov.u32 	%r75, %ctaid.z;
+	ld.param.b64 	%rd30, [triton_poi_fused__fused_rms_norm_cat_view_2_param_3];
+	.loc	1 21 69                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:69
+	mov.u32 	%r76, %nctaid.y;
+	ld.param.b64 	%rd31, [triton_poi_fused__fused_rms_norm_cat_view_2_param_4];
+	.loc	1 21 34                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:34
+	mad.lo.s32 	%r77, %r75, %r76, %r74;
+	ld.param.b64 	%rd32, [triton_poi_fused__fused_rms_norm_cat_view_2_param_5];
+	.loc	1 21 75                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:75
+	shl.b32 	%r78, %r77, 8;
+	ld.param.b64 	%rd33, [triton_poi_fused__fused_rms_norm_cat_view_2_param_6];
+	.loc	1 22 44                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:44
+	mov.u32 	%r79, %tid.x;
+	bfe.u32 	%r80, %r79, 1, 7;
+	shl.b32 	%r81, %r79, 2;
+	and.b32 	%r82, %r81, 252;
+	.loc	1 22 23                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:23
+	or.b32 	%r83, %r78, %r80;
+	or.b32 	%r84, %r83, 128;
+	or.b32 	%r85, %r78, %r82;
+	.loc	1 24 28                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:24:28
+	mov.u32 	%r86, %ctaid.x;
+	.loc	1 24 33                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:24:33
+	shl.b32 	%r87, %r86, 4;
+	.loc	1 25 44                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:25:44
+	and.b32 	%r88, %r79, 1;
+	neg.s32 	%r89, %r88;
+	shl.b32 	%r90, %r88, 3;
+	bfe.u32 	%r91, %r79, 6, 2;
+	.loc	1 25 23                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:25:23
+	or.b32 	%r92, %r90, %r87;
+	or.b32 	%r93, %r91, %r87;
+	.loc	1 26 21                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:26:21
+	setp.lt.s32 	%p8, %r92, 128;
+	setp.lt.s32 	%p9, %r93, 128;
+	.loc	1 27 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:27:19
+	bfe.s32 	%r94, %r77, 23, 1;
+	shr.u32 	%r95, %r94, 27;
+	add.s32 	%r96, %r83, %r95;
+	shr.u32 	%r97, %r96, 5;
+	add.s32 	%r98, %r84, %r95;
+	shr.u32 	%r99, %r98, 5;
+	.loc	1 29 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:29:19
+	and.b32 	%r100, %r96, 33554400;
+	sub.s32 	%r101, %r83, %r100;
+	.loc	1 35 18                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:35:18
+	setp.lt.s32 	%p10, %r83, 8192;
+	setp.lt.s32 	%p11, %r85, 8192;
+	.loc	1 36 39                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:39
+	shl.b32 	%r102, %r101, 7;
+	.loc	1 36 35                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:35
+	add.s32 	%r103, %r102, %r92;
+	.loc	1 36 44                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:44
+	mad.lo.s32 	%r104, %r97, 12288, %r103;
+	mad.lo.s32 	%r105, %r99, 12288, %r103;
+	.loc	1 36 30                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:30
+	mad.wide.s32 	%rd1, %r104, 2, %rd27;
+	mad.wide.s32 	%rd3, %r105, 2, %rd27;
+	.loc	1 36 64                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:64
+	and.pred 	%p1, %p8, %p10;
+	and.pred 	%p3, %p9, %p11;
+	.loc	1 36 72                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:72
+	setp.lt.s32 	%p12, %r83, 8064;
+	and.pred 	%p2, %p8, %p12;
+	.loc	1 36 57                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:57
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r5, 0;
+	// begin inline asm
+	mov.u32 %r1, %r5;
+	mov.u32 %r2, %r5;
+	mov.u32 %r3, %r5;
+	mov.u32 %r4, %r5;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	prmt.b32 	%r106, %r1, %r3, 0x7632U;
+	prmt.b32 	%r107, %r2, %r4, 0x7632U;
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r6, %r5;
+	mov.u32 %r7, %r5;
+	mov.u32 %r8, %r5;
+	mov.u32 %r9, %r5;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd3 + 0 ], %rd4;
+	// end inline asm
+	prmt.b32 	%r108, %r6, %r8, 0x7632U;
+	prmt.b32 	%r109, %r7, %r9, 0x7632U;
+	.loc	1 36 123                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:123
+	and.b32 	%r110, %r79, 6;
+	and.b32 	%r111, %r79, 120;
+	shl.b32 	%r112, %r88, 2;
+	bfe.s32 	%r113, %r79, 7, 1;
+	and.b32 	%r114, %r113, 4100;
+	mul.lo.s32 	%r115, %r110, 528;
+	or.b32 	%r116, %r112, %r111;
+	xor.b32 	%r117, %r115, %r116;
+	xor.b32 	%r118, %r117, %r114;
+	mov.b32 	%r119, global_smem;
+	add.s32 	%r120, %r119, %r118;
+	prmt.b32 	%r121, %r1, %r3, 0x5410U;
+	st.shared.b32 	[%r120], %r121;
+	st.shared.b32 	[%r120+256], %r106;
+	prmt.b32 	%r122, %r2, %r4, 0x5410U;
+	st.shared.b32 	[%r120+512], %r122;
+	st.shared.b32 	[%r120+768], %r107;
+	prmt.b32 	%r123, %r6, %r8, 0x5410U;
+	st.shared.b32 	[%r120+128], %r123;
+	st.shared.b32 	[%r120+384], %r108;
+	prmt.b32 	%r124, %r7, %r9, 0x5410U;
+	st.shared.b32 	[%r120+640], %r124;
+	st.shared.b32 	[%r120+896], %r109;
+	bar.sync 	0;
+	shl.b32 	%r125, %r79, 3;
+	and.b32 	%r126, %r125, 120;
+	and.b32 	%r127, %r79, 224;
+	shl.b32 	%r128, %r127, 2;
+	bfe.s32 	%r129, %r79, 4, 1;
+	and.b32 	%r130, %r129, 4100;
+	or.b32 	%r131, %r130, %r128;
+	or.b32 	%r132, %r131, %r126;
+	add.s32 	%r133, %r119, %r132;
+	ld.shared.v2.b16 	{%rs1, %rs2}, [%r133];
+	xor.b32 	%r134, %r132, 32;
+	add.s32 	%r135, %r119, %r134;
+	ld.shared.v2.b16 	{%rs3, %rs4}, [%r135+1024];
+	xor.b32 	%r136, %r132, 64;
+	add.s32 	%r137, %r119, %r136;
+	ld.shared.v2.b16 	{%rs5, %rs6}, [%r137+2048];
+	xor.b32 	%r138, %r132, 96;
+	add.s32 	%r139, %r119, %r138;
+	ld.shared.v2.b16 	{%rs7, %rs8}, [%r139+3072];
+	xor.b32 	%r140, %r132, 4;
+	add.s32 	%r141, %r119, %r140;
+	ld.shared.v2.b16 	{%rs9, %rs10}, [%r141];
+	xor.b32 	%r142, %r132, 36;
+	add.s32 	%r143, %r119, %r142;
+	ld.shared.v2.b16 	{%rs11, %rs12}, [%r143+1024];
+	xor.b32 	%r144, %r132, 68;
+	add.s32 	%r145, %r119, %r144;
+	ld.shared.v2.b16 	{%rs13, %rs14}, [%r145+2048];
+	xor.b32 	%r146, %r132, 100;
+	add.s32 	%r147, %r119, %r146;
+	ld.shared.v2.b16 	{%rs15, %rs16}, [%r147+3072];
+	cvt.f32.bf16 	%r148, %rs9;
+	cvt.f32.bf16 	%r149, %rs11;
+	cvt.f32.bf16 	%r150, %rs1;
+	cvt.f32.bf16 	%r151, %rs3;
+	cvt.f32.bf16 	%r152, %rs13;
+	cvt.f32.bf16 	%r153, %rs15;
+	cvt.f32.bf16 	%r154, %rs5;
+	cvt.f32.bf16 	%r155, %rs7;
+	cvt.f32.bf16 	%r156, %rs10;
+	cvt.f32.bf16 	%r157, %rs12;
+	cvt.f32.bf16 	%r158, %rs2;
+	cvt.f32.bf16 	%r159, %rs4;
+	cvt.f32.bf16 	%r160, %rs14;
+	cvt.f32.bf16 	%r161, %rs16;
+	cvt.f32.bf16 	%r162, %rs6;
+	cvt.f32.bf16 	%r163, %rs8;
+	.loc	1 38 30                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:38:30
+	mad.wide.s32 	%rd5, %r85, 4, %rd28;
+	.loc	1 38 80                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:38:80
+	// begin inline asm
+	mov.u64 %rd6, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r10, %r5;
+	mov.u32 %r11, %r5;
+	mov.u32 %r12, %r5;
+	mov.u32 %r13, %r5;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd5 + 0 ], %rd6;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd7, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd7, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r14, %r5;
+	mov.u32 %r15, %r5;
+	mov.u32 %r16, %r5;
+	mov.u32 %r17, %r5;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd5 + 0 ], %rd7;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd8, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd8, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r18, %r5;
+	mov.u32 %r19, %r5;
+	mov.u32 %r20, %r5;
+	mov.u32 %r21, %r5;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd5 + 0 ], %rd8;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd9, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd9, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r22, %r5;
+	mov.u32 %r23, %r5;
+	mov.u32 %r24, %r5;
+	mov.u32 %r25, %r5;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r22, %r23, %r24, %r25 }, [ %rd5 + 0 ], %rd9;
+	// end inline asm
+	mov.b32 	%r164, 0f43000000;
+	.loc	1 40 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:40:19
+	div.full.f32 	%r165, %r10, %r164;
+	div.full.f32 	%r166, %r11, %r164;
+	div.full.f32 	%r167, %r12, %r164;
+	div.full.f32 	%r168, %r13, %r164;
+	div.full.f32 	%r169, %r14, %r164;
+	div.full.f32 	%r170, %r15, %r164;
+	div.full.f32 	%r171, %r16, %r164;
+	div.full.f32 	%r172, %r17, %r164;
+	div.full.f32 	%r173, %r18, %r164;
+	div.full.f32 	%r174, %r19, %r164;
+	div.full.f32 	%r175, %r20, %r164;
+	div.full.f32 	%r176, %r21, %r164;
+	div.full.f32 	%r177, %r22, %r164;
+	div.full.f32 	%r178, %r23, %r164;
+	div.full.f32 	%r179, %r24, %r164;
+	div.full.f32 	%r180, %r25, %r164;
+	.loc	1 42 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:42:19
+	add.f32 	%r181, %r165, 0f358637BD;
+	add.f32 	%r182, %r166, 0f358637BD;
+	add.f32 	%r183, %r167, 0f358637BD;
+	add.f32 	%r184, %r168, 0f358637BD;
+	add.f32 	%r185, %r169, 0f358637BD;
+	add.f32 	%r186, %r170, 0f358637BD;
+	add.f32 	%r187, %r171, 0f358637BD;
+	add.f32 	%r188, %r172, 0f358637BD;
+	add.f32 	%r189, %r173, 0f358637BD;
+	add.f32 	%r190, %r174, 0f358637BD;
+	add.f32 	%r191, %r175, 0f358637BD;
+	add.f32 	%r192, %r176, 0f358637BD;
+	add.f32 	%r193, %r177, 0f358637BD;
+	add.f32 	%r194, %r178, 0f358637BD;
+	add.f32 	%r195, %r179, 0f358637BD;
+	add.f32 	%r196, %r180, 0f358637BD;
+	.loc	1 43 28                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:43:28
+	rsqrt.approx.ftz.f32 	%r197, %r181;
+	rsqrt.approx.ftz.f32 	%r198, %r182;
+	rsqrt.approx.ftz.f32 	%r199, %r183;
+	rsqrt.approx.ftz.f32 	%r200, %r184;
+	rsqrt.approx.ftz.f32 	%r201, %r185;
+	rsqrt.approx.ftz.f32 	%r202, %r186;
+	rsqrt.approx.ftz.f32 	%r203, %r187;
+	rsqrt.approx.ftz.f32 	%r204, %r188;
+	rsqrt.approx.ftz.f32 	%r205, %r189;
+	rsqrt.approx.ftz.f32 	%r206, %r190;
+	rsqrt.approx.ftz.f32 	%r207, %r191;
+	rsqrt.approx.ftz.f32 	%r208, %r192;
+	rsqrt.approx.ftz.f32 	%r209, %r193;
+	rsqrt.approx.ftz.f32 	%r210, %r194;
+	rsqrt.approx.ftz.f32 	%r211, %r195;
+	rsqrt.approx.ftz.f32 	%r212, %r196;
+	.loc	1 44 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:44:19
+	mul.f32 	%r213, %r198, %r151;
+	mul.f32 	%r214, %r197, %r150;
+	mul.f32 	%r215, %r206, %r149;
+	mul.f32 	%r216, %r205, %r148;
+	mul.f32 	%r217, %r200, %r155;
+	mul.f32 	%r218, %r199, %r154;
+	mul.f32 	%r219, %r208, %r153;
+	mul.f32 	%r220, %r207, %r152;
+	mul.f32 	%r221, %r202, %r159;
+	mul.f32 	%r222, %r201, %r158;
+	mul.f32 	%r223, %r210, %r157;
+	mul.f32 	%r224, %r209, %r156;
+	mul.f32 	%r225, %r204, %r163;
+	mul.f32 	%r226, %r203, %r162;
+	mul.f32 	%r227, %r212, %r161;
+	mul.f32 	%r228, %r211, %r160;
+	bar.sync 	0;
+	shl.b32 	%r229, %r79, 4;
+	and.b32 	%r230, %r229, 4080;
+	add.s32 	%r231, %r119, %r230;
+	st.shared.v4.b32 	[%r231], {%r214, %r216, %r213, %r215};
+	xor.b32 	%r232, %r230, 64;
+	add.s32 	%r233, %r119, %r232;
+	st.shared.v4.b32 	[%r233+4096], {%r218, %r220, %r217, %r219};
+	bar.sync 	0;
+	shl.b32 	%r234, %r79, 7;
+	and.b32 	%r235, %r234, 3072;
+	shl.b32 	%r236, %r110, 3;
+	shl.b32 	%r237, %r127, 1;
+	and.b32 	%r238, %r89, 4160;
+	xor.b32 	%r239, %r238, %r237;
+	add.s32 	%r240, %r119, %r235;
+	add.s32 	%r241, %r240, %r236;
+	add.s32 	%r242, %r241, %r239;
+	ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r243, %r244, %r245, %r246}, [%r242];
+	ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r247, %r248, %r249, %r250}, [%r242+512];
+	bar.sync 	0;
+	st.shared.v4.b32 	[%r231], {%r222, %r224, %r221, %r223};
+	st.shared.v4.b32 	[%r233+4096], {%r226, %r228, %r225, %r227};
+	bar.sync 	0;
+	ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r251, %r252, %r253, %r254}, [%r242];
+	ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r255, %r256, %r257, %r258}, [%r242+512];
+	.loc	1 45 31                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:31
+	mul.wide.s32 	%rd34, %r92, 2;
+	add.s64 	%rd10, %rd29, %rd34;
+	.loc	1 45 71                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:71
+	// begin inline asm
+	mov.u64 %rd11, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd11, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r26, %r5;
+	mov.u32 %r27, %r5;
+	mov.u32 %r28, %r5;
+	mov.u32 %r29, %r5;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd10 + 0 ], %rd11;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd12, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd12, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r30, %r5;
+	mov.u32 %r31, %r5;
+	mov.u32 %r32, %r5;
+	mov.u32 %r33, %r5;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r30, %r31, %r32, %r33 }, [ %rd10 + 0 ], %rd12;
+	// end inline asm
+	.loc	1 54 45                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:45
+	add.s32 	%r259, %r104, -3145728;
+	add.s32 	%r260, %r105, -3145728;
+	.loc	1 54 31                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:31
+	mad.wide.s32 	%rd13, %r259, 2, %rd30;
+	mad.wide.s32 	%rd15, %r260, 2, %rd30;
+	.loc	1 54 83                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:83
+	add.s32 	%r261, %r78, -8192;
+	setp.lt.u32 	%p13, %r261, 65536;
+	and.pred 	%p4, %p8, %p13;
+	add.s32 	%r262, %r78, -8064;
+	setp.lt.u32 	%p14, %r262, 65664;
+	and.pred 	%p5, %p8, %p14;
+	and.pred 	%p6, %p9, %p13;
+	.loc	1 54 67                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:67
+	// begin inline asm
+	mov.u64 %rd14, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd14, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r34, %r5;
+	mov.u32 %r35, %r5;
+	mov.u32 %r36, %r5;
+	mov.u32 %r37, %r5;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd13 + 0 ], %rd14;
+	// end inline asm
+	prmt.b32 	%r263, %r34, %r36, 0x7632U;
+	prmt.b32 	%r264, %r35, %r37, 0x7632U;
+	// begin inline asm
+	mov.u64 %rd16, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd16, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r38, %r5;
+	mov.u32 %r39, %r5;
+	mov.u32 %r40, %r5;
+	mov.u32 %r41, %r5;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r38, %r39, %r40, %r41 }, [ %rd15 + 0 ], %rd16;
+	// end inline asm
+	prmt.b32 	%r265, %r38, %r40, 0x7632U;
+	prmt.b32 	%r266, %r39, %r41, 0x7632U;
+	.loc	1 54 134                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134
+	bar.sync 	0;
+	prmt.b32 	%r267, %r34, %r36, 0x5410U;
+	st.shared.b32 	[%r120], %r267;
+	st.shared.b32 	[%r120+256], %r263;
+	prmt.b32 	%r268, %r35, %r37, 0x5410U;
+	st.shared.b32 	[%r120+512], %r268;
+	st.shared.b32 	[%r120+768], %r264;
+	prmt.b32 	%r269, %r38, %r40, 0x5410U;
+	st.shared.b32 	[%r120+128], %r269;
+	st.shared.b32 	[%r120+384], %r265;
+	prmt.b32 	%r270, %r39, %r41, 0x5410U;
+	st.shared.b32 	[%r120+640], %r270;
+	st.shared.b32 	[%r120+896], %r266;
+	bar.sync 	0;
+	ld.shared.v2.b16 	{%rs17, %rs18}, [%r133];
+	ld.shared.v2.b16 	{%rs19, %rs20}, [%r135+1024];
+	ld.shared.v2.b16 	{%rs21, %rs22}, [%r137+2048];
+	ld.shared.v2.b16 	{%rs23, %rs24}, [%r139+3072];
+	ld.shared.v2.b16 	{%rs25, %rs26}, [%r141];
+	ld.shared.v2.b16 	{%rs27, %rs28}, [%r143+1024];
+	ld.shared.v2.b16 	{%rs29, %rs30}, [%r145+2048];
+	ld.shared.v2.b16 	{%rs31, %rs32}, [%r147+3072];
+	.loc	1 56 52                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:52
+	add.s32 	%r271, %r85, -8192;
+	.loc	1 56 31                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:31
+	mad.wide.s32 	%rd17, %r271, 4, %rd31;
+	.loc	1 56 90                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:90
+	// begin inline asm
+	mov.u64 %rd18, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd18, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r42, %r5;
+	mov.u32 %r43, %r5;
+	mov.u32 %r44, %r5;
+	mov.u32 %r45, %r5;
+	@%p6 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd17 + 0 ], %rd18;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd19, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd19, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r46, %r5;
+	mov.u32 %r47, %r5;
+	mov.u32 %r48, %r5;
+	mov.u32 %r49, %r5;
+	@%p6 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r46, %r47, %r48, %r49 }, [ %rd17 + 0 ], %rd19;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd20, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd20, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r50, %r5;
+	mov.u32 %r51, %r5;
+	mov.u32 %r52, %r5;
+	mov.u32 %r53, %r5;
+	@%p6 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r50, %r51, %r52, %r53 }, [ %rd17 + 0 ], %rd20;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd21, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd21, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r54, %r5;
+	mov.u32 %r55, %r5;
+	mov.u32 %r56, %r5;
+	mov.u32 %r57, %r5;
+	@%p6 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r54, %r55, %r56, %r57 }, [ %rd17 + 0 ], %rd21;
+	// end inline asm
+	.loc	1 58 21                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:58:21
+	div.full.f32 	%r272, %r42, %r164;
+	div.full.f32 	%r273, %r43, %r164;
+	div.full.f32 	%r274, %r44, %r164;
+	div.full.f32 	%r275, %r45, %r164;
+	div.full.f32 	%r276, %r46, %r164;
+	div.full.f32 	%r277, %r47, %r164;
+	div.full.f32 	%r278, %r48, %r164;
+	div.full.f32 	%r279, %r49, %r164;
+	div.full.f32 	%r280, %r50, %r164;
+	div.full.f32 	%r281, %r51, %r164;
+	div.full.f32 	%r282, %r52, %r164;
+	div.full.f32 	%r283, %r53, %r164;
+	div.full.f32 	%r284, %r54, %r164;
+	div.full.f32 	%r285, %r55, %r164;
+	div.full.f32 	%r286, %r56, %r164;
+	div.full.f32 	%r287, %r57, %r164;
+	.loc	1 60 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:60:20
+	add.f32 	%r288, %r272, 0f358637BD;
+	add.f32 	%r289, %r273, 0f358637BD;
+	add.f32 	%r290, %r274, 0f358637BD;
+	add.f32 	%r291, %r275, 0f358637BD;
+	add.f32 	%r292, %r276, 0f358637BD;
+	add.f32 	%r293, %r277, 0f358637BD;
+	add.f32 	%r294, %r278, 0f358637BD;
+	add.f32 	%r295, %r279, 0f358637BD;
+	add.f32 	%r296, %r280, 0f358637BD;
+	add.f32 	%r297, %r281, 0f358637BD;
+	add.f32 	%r298, %r282, 0f358637BD;
+	add.f32 	%r299, %r283, 0f358637BD;
+	add.f32 	%r300, %r284, 0f358637BD;
+	add.f32 	%r301, %r285, 0f358637BD;
+	add.f32 	%r302, %r286, 0f358637BD;
+	add.f32 	%r303, %r287, 0f358637BD;
+	.loc	1 61 28                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:61:28
+	rsqrt.approx.ftz.f32 	%r304, %r288;
+	rsqrt.approx.ftz.f32 	%r305, %r289;
+	rsqrt.approx.ftz.f32 	%r306, %r290;
+	rsqrt.approx.ftz.f32 	%r307, %r291;
+	rsqrt.approx.ftz.f32 	%r308, %r292;
+	rsqrt.approx.ftz.f32 	%r309, %r293;
+	rsqrt.approx.ftz.f32 	%r310, %r294;
+	rsqrt.approx.ftz.f32 	%r311, %r295;
+	rsqrt.approx.ftz.f32 	%r312, %r296;
+	rsqrt.approx.ftz.f32 	%r313, %r297;
+	rsqrt.approx.ftz.f32 	%r314, %r298;
+	rsqrt.approx.ftz.f32 	%r315, %r299;
+	rsqrt.approx.ftz.f32 	%r316, %r300;
+	rsqrt.approx.ftz.f32 	%r317, %r301;
+	rsqrt.approx.ftz.f32 	%r318, %r302;
+	rsqrt.approx.ftz.f32 	%r319, %r303;
+	.loc	1 23 21                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:23:21
+	setp.lt.s32 	%p15, %r83, 73728;
+	.loc	1 35 18                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:35:18
+	setp.lt.s32 	%p16, %r84, 8192;
+	.loc	1 54 134                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134
+	cvt.f32.bf16 	%r320, %rs25;
+	cvt.f32.bf16 	%r321, %rs27;
+	cvt.f32.bf16 	%r322, %rs17;
+	cvt.f32.bf16 	%r323, %rs19;
+	.loc	1 62 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:62:20
+	mul.f32 	%r324, %r305, %r323;
+	mul.f32 	%r325, %r304, %r322;
+	mul.f32 	%r326, %r313, %r321;
+	mul.f32 	%r327, %r312, %r320;
+	.loc	1 54 134                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134
+	cvt.f32.bf16 	%r328, %rs29;
+	cvt.f32.bf16 	%r329, %rs31;
+	cvt.f32.bf16 	%r330, %rs21;
+	cvt.f32.bf16 	%r331, %rs23;
+	.loc	1 62 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:62:20
+	mul.f32 	%r332, %r307, %r331;
+	mul.f32 	%r333, %r306, %r330;
+	mul.f32 	%r334, %r315, %r329;
+	mul.f32 	%r335, %r314, %r328;
+	.loc	1 54 134                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134
+	cvt.f32.bf16 	%r336, %rs26;
+	cvt.f32.bf16 	%r337, %rs28;
+	cvt.f32.bf16 	%r338, %rs18;
+	cvt.f32.bf16 	%r339, %rs20;
+	.loc	1 62 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:62:20
+	mul.f32 	%r340, %r309, %r339;
+	mul.f32 	%r341, %r308, %r338;
+	mul.f32 	%r342, %r317, %r337;
+	mul.f32 	%r343, %r316, %r336;
+	.loc	1 54 134                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134
+	cvt.f32.bf16 	%r344, %rs30;
+	cvt.f32.bf16 	%r345, %rs32;
+	cvt.f32.bf16 	%r346, %rs22;
+	cvt.f32.bf16 	%r347, %rs24;
+	.loc	1 62 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:62:20
+	mul.f32 	%r348, %r311, %r347;
+	mul.f32 	%r349, %r310, %r346;
+	mul.f32 	%r350, %r319, %r345;
+	mul.f32 	%r351, %r318, %r344;
+	bar.sync 	0;
+	st.shared.v4.b32 	[%r231], {%r325, %r327, %r324, %r326};
+	st.shared.v4.b32 	[%r233+4096], {%r333, %r335, %r332, %r334};
+	bar.sync 	0;
+	ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r352, %r353, %r354, %r355}, [%r242];
+	ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r356, %r357, %r358, %r359}, [%r242+512];
+	bar.sync 	0;
+	st.shared.v4.b32 	[%r231], {%r341, %r343, %r340, %r342};
+	st.shared.v4.b32 	[%r233+4096], {%r349, %r351, %r348, %r350};
+	bar.sync 	0;
+	ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r360, %r361, %r362, %r363}, [%r242];
+	ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r364, %r365, %r366, %r367}, [%r242+512];
+	.loc	1 63 31                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:31
+	add.s64 	%rd22, %rd32, %rd34;
+	.loc	1 63 71                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:71
+	// begin inline asm
+	mov.u64 %rd23, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd23, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r58, %r5;
+	mov.u32 %r59, %r5;
+	mov.u32 %r60, %r5;
+	mov.u32 %r61, %r5;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r58, %r59, %r60, %r61 }, [ %rd22 + 0 ], %rd23;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd24, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd24, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r62, %r5;
+	mov.u32 %r63, %r5;
+	mov.u32 %r64, %r5;
+	mov.u32 %r65, %r5;
+	@%p5 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r62, %r63, %r64, %r65 }, [ %rd22 + 0 ], %rd24;
+	// end inline asm
+	.loc	1 70 34                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:34
+	shl.b32 	%r368, %r83, 7;
+	shl.b32 	%r369, %r84, 7;
+	.loc	1 70 30                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:30
+	add.s32 	%r370, %r368, %r92;
+	add.s32 	%r371, %r369, %r92;
+	.loc	1 70 25                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:25
+	mad.wide.s32 	%rd25, %r370, 2, %rd33;
+	mad.wide.s32 	%rd26, %r371, 2, %rd33;
+	.loc	1 70 54                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:54
+	and.pred 	%p7, %p8, %p15;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs33, %rs34}, %r26;
+	cvt.f32.bf16 	%r372, %rs33;
+	cvt.f32.bf16 	%r373, %rs34;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r374, %r244, %r373;
+	mul.f32 	%r375, %r243, %r372;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs35, %rs36}, %r58;
+	cvt.f32.bf16 	%r376, %rs35;
+	cvt.f32.bf16 	%r377, %rs36;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r378, %r353, %r377;
+	mul.f32 	%r379, %r352, %r376;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r380, %r375, %r379, %p10;
+	selp.f32 	%r381, %r374, %r378, %p10;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r66, %r381, %r380;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs37, %rs38}, %r27;
+	cvt.f32.bf16 	%r382, %rs37;
+	cvt.f32.bf16 	%r383, %rs38;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r384, %r246, %r383;
+	mul.f32 	%r385, %r245, %r382;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs39, %rs40}, %r59;
+	cvt.f32.bf16 	%r386, %rs39;
+	cvt.f32.bf16 	%r387, %rs40;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r388, %r355, %r387;
+	mul.f32 	%r389, %r354, %r386;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r390, %r385, %r389, %p10;
+	selp.f32 	%r391, %r384, %r388, %p10;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r67, %r391, %r390;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs41, %rs42}, %r28;
+	cvt.f32.bf16 	%r392, %rs41;
+	cvt.f32.bf16 	%r393, %rs42;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r394, %r252, %r393;
+	mul.f32 	%r395, %r251, %r392;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs43, %rs44}, %r60;
+	cvt.f32.bf16 	%r396, %rs43;
+	cvt.f32.bf16 	%r397, %rs44;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r398, %r361, %r397;
+	mul.f32 	%r399, %r360, %r396;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r400, %r395, %r399, %p10;
+	selp.f32 	%r401, %r394, %r398, %p10;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r68, %r401, %r400;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs45, %rs46}, %r29;
+	cvt.f32.bf16 	%r402, %rs45;
+	cvt.f32.bf16 	%r403, %rs46;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r404, %r254, %r403;
+	mul.f32 	%r405, %r253, %r402;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs47, %rs48}, %r61;
+	cvt.f32.bf16 	%r406, %rs47;
+	cvt.f32.bf16 	%r407, %rs48;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r408, %r363, %r407;
+	mul.f32 	%r409, %r362, %r406;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r410, %r405, %r409, %p10;
+	selp.f32 	%r411, %r404, %r408, %p10;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r69, %r411, %r410;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs49, %rs50}, %r30;
+	cvt.f32.bf16 	%r412, %rs49;
+	cvt.f32.bf16 	%r413, %rs50;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r414, %r248, %r413;
+	mul.f32 	%r415, %r247, %r412;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs51, %rs52}, %r62;
+	cvt.f32.bf16 	%r416, %rs51;
+	cvt.f32.bf16 	%r417, %rs52;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r418, %r357, %r417;
+	mul.f32 	%r419, %r356, %r416;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r420, %r415, %r419, %p16;
+	selp.f32 	%r421, %r414, %r418, %p16;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r70, %r421, %r420;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs53, %rs54}, %r31;
+	cvt.f32.bf16 	%r422, %rs53;
+	cvt.f32.bf16 	%r423, %rs54;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r424, %r250, %r423;
+	mul.f32 	%r425, %r249, %r422;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs55, %rs56}, %r63;
+	cvt.f32.bf16 	%r426, %rs55;
+	cvt.f32.bf16 	%r427, %rs56;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r428, %r359, %r427;
+	mul.f32 	%r429, %r358, %r426;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r430, %r425, %r429, %p16;
+	selp.f32 	%r431, %r424, %r428, %p16;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r71, %r431, %r430;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs57, %rs58}, %r32;
+	cvt.f32.bf16 	%r432, %rs57;
+	cvt.f32.bf16 	%r433, %rs58;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r434, %r256, %r433;
+	mul.f32 	%r435, %r255, %r432;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs59, %rs60}, %r64;
+	cvt.f32.bf16 	%r436, %rs59;
+	cvt.f32.bf16 	%r437, %rs60;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r438, %r365, %r437;
+	mul.f32 	%r439, %r364, %r436;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r440, %r435, %r439, %p16;
+	selp.f32 	%r441, %r434, %r438, %p16;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r72, %r441, %r440;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs61, %rs62}, %r33;
+	cvt.f32.bf16 	%r442, %rs61;
+	cvt.f32.bf16 	%r443, %rs62;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r444, %r258, %r443;
+	mul.f32 	%r445, %r257, %r442;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs63, %rs64}, %r65;
+	cvt.f32.bf16 	%r446, %rs63;
+	cvt.f32.bf16 	%r447, %rs64;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r448, %r367, %r447;
+	mul.f32 	%r449, %r366, %r446;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r450, %r445, %r449, %p16;
+	selp.f32 	%r451, %r444, %r448, %p16;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r73, %r451, %r450;
+	// begin inline asm
+	@%p7 st.global.v4.b32 [ %rd25 + 0 ], { %r66, %r67, %r68, %r69 };
+	// end inline asm
+	// begin inline asm
+	@%p7 st.global.v4.b32 [ %rd26 + 0 ], { %r70, %r71, %r72, %r73 };
+	// end inline asm
+	.loc	1 70 4                          // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 50
+.b8 104
+.b8 105
+.b8 106
+.b8 51
+.b8 104
+.b8 109
+.b8 108
+.b8 111
+.b8 117
+.b8 109
+.b8 120
+.b8 100
+.b8 109
+.b8 104
+.b8 117
+.b8 101
+.b8 122
+.b8 115
+.b8 121
+.b8 104
+.b8 107
+.b8 109
+.b8 110
+.b8 113
+.b8 103
+.b8 110
+.b8 102
+.b8 97
+.b8 53
+.b8 105
+.b8 118
+.b8 114
+.b8 101
+.b8 50
+.b8 55
+.b8 117
+.b8 111
+.b8 115
+.b8 121
+.b8 109
+.b8 97
+.b8 109
+.b8 51
+.b8 100
+.b8 114
+.b8 55
+.b8 97
+.b8 53
+.b8 120
+.b8 98
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 50
+.b8 104
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.source b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.source
new file mode 100644
index 0000000000000000000000000000000000000000..86e111c99a00db61380325e65ed8aadb64550e9d
--- /dev/null
+++ b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.source
@@ -0,0 +1,415 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0)
+#loc99 = loc("in_ptr0"(#loc))
+#loc100 = loc("in_ptr1"(#loc))
+#loc101 = loc("in_ptr2"(#loc))
+#loc102 = loc("in_ptr3"(#loc))
+#loc103 = loc("in_ptr4"(#loc))
+#loc104 = loc("in_ptr5"(#loc))
+#loc105 = loc("out_ptr0"(#loc))
+#loc106 = loc("ynumel"(#loc))
+#loc107 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %ynumel_0 = arith.constant 73728 : i32 loc(#loc108)
+    %xnumel_1 = arith.constant 128 : i32 loc(#loc109)
+    %yoffset = tt.get_program_id y : i32 loc(#loc110)
+    %yoffset_2 = tt.get_program_id z : i32 loc(#loc111)
+    %yoffset_3 = tt.get_num_programs y : i32 loc(#loc112)
+    %yoffset_4 = arith.muli %yoffset_2, %yoffset_3 : i32 loc(#loc113)
+    %yoffset_5 = arith.addi %yoffset, %yoffset_4 : i32 loc(#loc114)
+    %yoffset_6 = arith.constant 256 : i32 loc(#loc115)
+    %yoffset_7 = arith.constant 256 : i32 loc(#loc115)
+    %yoffset_8 = arith.muli %yoffset_5, %yoffset_7 : i32 loc(#loc115)
+    %yindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc116)
+    %yindex_9 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<256xi32> -> tensor<256x1xi32> loc(#loc117)
+    %yindex_10 = tt.splat %yoffset_8 : i32 -> tensor<256x1xi32> loc(#loc118)
+    %yindex_11 = arith.addi %yindex_10, %yindex_9 : tensor<256x1xi32> loc(#loc118)
+    %ymask = arith.constant dense<73728> : tensor<256x1xi32> loc(#loc119)
+    %ymask_12 = arith.cmpi slt, %yindex_11, %ymask : tensor<256x1xi32> loc(#loc119)
+    %xoffset = tt.get_program_id x : i32 loc(#loc120)
+    %xoffset_13 = arith.constant 16 : i32 loc(#loc121)
+    %xoffset_14 = arith.constant 16 : i32 loc(#loc121)
+    %xoffset_15 = arith.muli %xoffset, %xoffset_14 : i32 loc(#loc121)
+    %xindex = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc122)
+    %xindex_16 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc123)
+    %xindex_17 = tt.splat %xoffset_15 : i32 -> tensor<1x16xi32> loc(#loc124)
+    %xindex_18 = arith.addi %xindex_17, %xindex_16 : tensor<1x16xi32> loc(#loc124)
+    %xmask = arith.constant dense<128> : tensor<1x16xi32> loc(#loc125)
+    %xmask_19 = arith.cmpi slt, %xindex_18, %xmask : tensor<1x16xi32> loc(#loc125)
+    %y1 = arith.constant 32 : i32 loc(#loc126)
+    %y1_20 = arith.constant 32 : i32 loc(#loc126)
+    %y1_21 = arith.constant dense<32> : tensor<256x1xi32> loc(#loc126)
+    %y1_22 = arith.divsi %yindex_11, %y1_21 : tensor<256x1xi32> loc(#loc126)
+    %y0 = arith.constant 32 : i32 loc(#loc127)
+    %y0_23 = arith.constant 32 : i32 loc(#loc127)
+    %y0_24 = arith.constant dense<32> : tensor<256x1xi32> loc(#loc127)
+    %y0_25 = arith.remsi %yindex_11, %y0_24 : tensor<256x1xi32> loc(#loc127)
+    %tmp1 = arith.constant 0 : i64 loc(#loc128)
+    %tmp1_26 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc128)
+    %tmp2 = arith.extsi %y1_22 : tensor<256x1xi32> to tensor<256x1xi64> loc(#loc129)
+    %tmp2_27 = arith.constant dense<0> : tensor<256x1xi64> loc(#loc129)
+    %tmp2_28 = arith.cmpi sge, %tmp2, %tmp2_27 : tensor<256x1xi64> loc(#loc129)
+    %tmp3 = arith.constant 256 : i64 loc(#loc130)
+    %tmp3_29 = arith.constant dense<256> : tensor<1x1xi64> loc(#loc130)
+    %tmp4 = arith.extsi %y1_22 : tensor<256x1xi32> to tensor<256x1xi64> loc(#loc131)
+    %tmp4_30 = arith.constant dense<256> : tensor<256x1xi64> loc(#loc131)
+    %tmp4_31 = arith.cmpi slt, %tmp4, %tmp4_30 : tensor<256x1xi64> loc(#loc131)
+    %tmp5 = arith.constant 128 : i32 loc(#loc132)
+    %tmp5_32 = arith.constant 128 : i32 loc(#loc132)
+    %tmp5_33 = arith.constant dense<128> : tensor<256x1xi32> loc(#loc132)
+    %tmp5_34 = arith.muli %tmp5_33, %y0_25 : tensor<256x1xi32> loc(#loc132)
+    %tmp5_35 = tt.broadcast %xindex_18 : tensor<1x16xi32> -> tensor<256x16xi32> loc(#loc133)
+    %tmp5_36 = tt.broadcast %tmp5_34 : tensor<256x1xi32> -> tensor<256x16xi32> loc(#loc133)
+    %tmp5_37 = arith.addi %tmp5_35, %tmp5_36 : tensor<256x16xi32> loc(#loc133)
+    %tmp5_38 = arith.constant 12288 : i32 loc(#loc134)
+    %tmp5_39 = arith.constant 12288 : i32 loc(#loc134)
+    %tmp5_40 = arith.constant dense<12288> : tensor<256x1xi32> loc(#loc134)
+    %tmp5_41 = arith.muli %tmp5_40, %y1_22 : tensor<256x1xi32> loc(#loc134)
+    %tmp5_42 = tt.broadcast %tmp5_41 : tensor<256x1xi32> -> tensor<256x16xi32> loc(#loc135)
+    %tmp5_43 = arith.addi %tmp5_37, %tmp5_42 : tensor<256x16xi32> loc(#loc135)
+    %tmp5_44 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<256x16x!tt.ptr<bf16>> loc(#loc136)
+    %tmp5_45 = tt.addptr %tmp5_44, %tmp5_43 : tensor<256x16x!tt.ptr<bf16>>, tensor<256x16xi32> loc(#loc136)
+    %tmp5_46 = tt.broadcast %tmp4_31 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc137)
+    %tmp5_47 = tt.broadcast %xmask_19 : tensor<1x16xi1> -> tensor<256x16xi1> loc(#loc137)
+    %tmp5_48 = arith.andi %tmp5_46, %tmp5_47 : tensor<256x16xi1> loc(#loc137)
+    %tmp5_49 = tt.broadcast %ymask_12 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc138)
+    %tmp5_50 = arith.andi %tmp5_48, %tmp5_49 : tensor<256x16xi1> loc(#loc138)
+    %tmp5_51 = arith.constant 0.000000e+00 : f32 loc(#loc139)
+    %tmp5_52 = arith.constant dense<0.000000e+00> : tensor<256x16xf32> loc(#loc139)
+    %tmp5_53 = arith.truncf %tmp5_52 : tensor<256x16xf32> to tensor<256x16xbf16> loc(#loc139)
+    %tmp5_54 = tt.load %tmp5_45, %tmp5_50, %tmp5_53 evictionPolicy = evict_last : tensor<256x16x!tt.ptr<bf16>> loc(#loc139)
+    %tmp5_55 = arith.extf %tmp5_54 : tensor<256x16xbf16> to tensor<256x16xf32> loc(#loc140)
+    %tmp7 = arith.constant 32 : i32 loc(#loc141)
+    %tmp7_56 = arith.constant 32 : i32 loc(#loc141)
+    %tmp7_57 = arith.constant dense<32> : tensor<256x1xi32> loc(#loc141)
+    %tmp7_58 = arith.muli %tmp7_57, %y1_22 : tensor<256x1xi32> loc(#loc141)
+    %tmp7_59 = arith.addi %y0_25, %tmp7_58 : tensor<256x1xi32> loc(#loc142)
+    %tmp7_60 = tt.broadcast %tmp7_59 : tensor<256x1xi32> -> tensor<256x16xi32> loc(#loc143)
+    %tmp7_61 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<256x16x!tt.ptr<f32>> loc(#loc144)
+    %tmp7_62 = tt.addptr %tmp7_61, %tmp7_60 : tensor<256x16x!tt.ptr<f32>>, tensor<256x16xi32> loc(#loc144)
+    %tmp7_63 = tt.broadcast %tmp4_31 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc145)
+    %tmp7_64 = tt.broadcast %xmask_19 : tensor<1x16xi1> -> tensor<256x16xi1> loc(#loc145)
+    %tmp7_65 = arith.andi %tmp7_63, %tmp7_64 : tensor<256x16xi1> loc(#loc145)
+    %tmp7_66 = tt.broadcast %ymask_12 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc146)
+    %tmp7_67 = arith.andi %tmp7_65, %tmp7_66 : tensor<256x16xi1> loc(#loc146)
+    %tmp7_68 = arith.constant 0.000000e+00 : f32 loc(#loc147)
+    %tmp7_69 = arith.constant dense<0.000000e+00> : tensor<256x16xf32> loc(#loc147)
+    %tmp7_70 = tt.load %tmp7_62, %tmp7_67, %tmp7_69 evictionPolicy = evict_last : tensor<256x16x!tt.ptr<f32>> loc(#loc147)
+    %tmp8 = arith.constant 1.280000e+02 : f32 loc(#loc148)
+    %tmp9 = arith.constant dense<1.280000e+02> : tensor<256x16xf32> loc(#loc149)
+    %tmp9_71 = arith.divf %tmp7_70, %tmp9 : tensor<256x16xf32> loc(#loc149)
+    %tmp10 = arith.constant 9.99999997E-7 : f32 loc(#loc150)
+    %tmp11 = arith.constant dense<9.99999997E-7> : tensor<256x16xf32> loc(#loc151)
+    %tmp11_72 = arith.addf %tmp9_71, %tmp11 : tensor<256x16xf32> loc(#loc151)
+    %tmp12 = tt.extern_elementwise %tmp11_72 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<256x16xf32>) -> tensor<256x16xf32> loc(#loc152)
+    %tmp13 = arith.mulf %tmp5_55, %tmp12 : tensor<256x16xf32> loc(#loc153)
+    %tmp14 = tt.broadcast %xindex_18 : tensor<1x16xi32> -> tensor<256x16xi32> loc(#loc154)
+    %tmp14_73 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<256x16x!tt.ptr<bf16>> loc(#loc155)
+    %tmp14_74 = tt.addptr %tmp14_73, %tmp14 : tensor<256x16x!tt.ptr<bf16>>, tensor<256x16xi32> loc(#loc155)
+    %tmp14_75 = tt.broadcast %tmp4_31 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc156)
+    %tmp14_76 = tt.broadcast %xmask_19 : tensor<1x16xi1> -> tensor<256x16xi1> loc(#loc156)
+    %tmp14_77 = arith.andi %tmp14_75, %tmp14_76 : tensor<256x16xi1> loc(#loc156)
+    %tmp14_78 = tt.broadcast %ymask_12 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc157)
+    %tmp14_79 = arith.andi %tmp14_77, %tmp14_78 : tensor<256x16xi1> loc(#loc157)
+    %tmp14_80 = arith.constant 0.000000e+00 : f32 loc(#loc158)
+    %tmp14_81 = arith.constant dense<0.000000e+00> : tensor<256x16xf32> loc(#loc158)
+    %tmp14_82 = arith.truncf %tmp14_81 : tensor<256x16xf32> to tensor<256x16xbf16> loc(#loc158)
+    %tmp14_83 = tt.load %tmp14_74, %tmp14_79, %tmp14_82 evictionPolicy = evict_last : tensor<256x16x!tt.ptr<bf16>> loc(#loc158)
+    %tmp14_84 = arith.extf %tmp14_83 : tensor<256x16xbf16> to tensor<256x16xf32> loc(#loc159)
+    %tmp16 = arith.mulf %tmp13, %tmp14_84 : tensor<256x16xf32> loc(#loc160)
+    %tmp18 = arith.constant 0.000000e+00 : f32 loc(#loc161)
+    %tmp18_85 = arith.constant dense<0.000000e+00> : tensor<256x16xf32> loc(#loc161)
+    %tmp19 = tt.broadcast %tmp4_31 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc162)
+    %tmp19_86 = arith.select %tmp19, %tmp16, %tmp18_85 : tensor<256x16xi1>, tensor<256x16xf32> loc(#loc162)
+    %tmp20 = arith.extsi %y1_22 : tensor<256x1xi32> to tensor<256x1xi64> loc(#loc163)
+    %tmp20_87 = arith.constant dense<256> : tensor<256x1xi64> loc(#loc163)
+    %tmp20_88 = arith.cmpi sge, %tmp20, %tmp20_87 : tensor<256x1xi64> loc(#loc163)
+    %tmp21 = arith.constant 2304 : i64 loc(#loc164)
+    %tmp21_89 = arith.constant dense<2304> : tensor<1x1xi64> loc(#loc164)
+    %tmp22 = arith.extsi %y1_22 : tensor<256x1xi32> to tensor<256x1xi64> loc(#loc165)
+    %tmp22_90 = arith.constant dense<2304> : tensor<256x1xi64> loc(#loc165)
+    %tmp22_91 = arith.cmpi slt, %tmp22, %tmp22_90 : tensor<256x1xi64> loc(#loc165)
+    %tmp23 = arith.constant 128 : i32 loc(#loc166)
+    %tmp23_92 = arith.constant 128 : i32 loc(#loc166)
+    %tmp23_93 = arith.constant dense<128> : tensor<256x1xi32> loc(#loc166)
+    %tmp23_94 = arith.muli %tmp23_93, %y0_25 : tensor<256x1xi32> loc(#loc166)
+    %tmp23_95 = tt.broadcast %xindex_18 : tensor<1x16xi32> -> tensor<256x16xi32> loc(#loc167)
+    %tmp23_96 = tt.broadcast %tmp23_94 : tensor<256x1xi32> -> tensor<256x16xi32> loc(#loc167)
+    %tmp23_97 = arith.addi %tmp23_95, %tmp23_96 : tensor<256x16xi32> loc(#loc167)
+    %tmp23_98 = arith.constant -256 : i32 loc(#loc168)
+    %tmp23_99 = arith.constant -256 : i32 loc(#loc168)
+    %tmp23_100 = arith.constant dense<-256> : tensor<256x1xi32> loc(#loc168)
+    %tmp23_101 = arith.addi %tmp23_100, %y1_22 : tensor<256x1xi32> loc(#loc168)
+    %tmp23_102 = arith.constant 12288 : i32 loc(#loc169)
+    %tmp23_103 = arith.constant 12288 : i32 loc(#loc169)
+    %tmp23_104 = arith.constant dense<12288> : tensor<256x1xi32> loc(#loc169)
+    %tmp23_105 = arith.muli %tmp23_104, %tmp23_101 : tensor<256x1xi32> loc(#loc169)
+    %tmp23_106 = tt.broadcast %tmp23_105 : tensor<256x1xi32> -> tensor<256x16xi32> loc(#loc170)
+    %tmp23_107 = arith.addi %tmp23_97, %tmp23_106 : tensor<256x16xi32> loc(#loc170)
+    %tmp23_108 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<256x16x!tt.ptr<bf16>> loc(#loc171)
+    %tmp23_109 = tt.addptr %tmp23_108, %tmp23_107 : tensor<256x16x!tt.ptr<bf16>>, tensor<256x16xi32> loc(#loc171)
+    %tmp23_110 = tt.broadcast %tmp20_88 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc172)
+    %tmp23_111 = tt.broadcast %xmask_19 : tensor<1x16xi1> -> tensor<256x16xi1> loc(#loc172)
+    %tmp23_112 = arith.andi %tmp23_110, %tmp23_111 : tensor<256x16xi1> loc(#loc172)
+    %tmp23_113 = tt.broadcast %ymask_12 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc173)
+    %tmp23_114 = arith.andi %tmp23_112, %tmp23_113 : tensor<256x16xi1> loc(#loc173)
+    %tmp23_115 = arith.constant 0.000000e+00 : f32 loc(#loc174)
+    %tmp23_116 = arith.constant dense<0.000000e+00> : tensor<256x16xf32> loc(#loc174)
+    %tmp23_117 = arith.truncf %tmp23_116 : tensor<256x16xf32> to tensor<256x16xbf16> loc(#loc174)
+    %tmp23_118 = tt.load %tmp23_109, %tmp23_114, %tmp23_117 evictionPolicy = evict_last : tensor<256x16x!tt.ptr<bf16>> loc(#loc174)
+    %tmp23_119 = arith.extf %tmp23_118 : tensor<256x16xbf16> to tensor<256x16xf32> loc(#loc175)
+    %tmp25 = arith.constant -256 : i32 loc(#loc176)
+    %tmp25_120 = arith.constant -256 : i32 loc(#loc176)
+    %tmp25_121 = arith.constant dense<-256> : tensor<256x1xi32> loc(#loc176)
+    %tmp25_122 = arith.addi %tmp25_121, %y1_22 : tensor<256x1xi32> loc(#loc176)
+    %tmp25_123 = arith.constant 32 : i32 loc(#loc177)
+    %tmp25_124 = arith.constant 32 : i32 loc(#loc177)
+    %tmp25_125 = arith.constant dense<32> : tensor<256x1xi32> loc(#loc177)
+    %tmp25_126 = arith.muli %tmp25_125, %tmp25_122 : tensor<256x1xi32> loc(#loc177)
+    %tmp25_127 = arith.addi %y0_25, %tmp25_126 : tensor<256x1xi32> loc(#loc178)
+    %tmp25_128 = tt.broadcast %tmp25_127 : tensor<256x1xi32> -> tensor<256x16xi32> loc(#loc179)
+    %tmp25_129 = tt.splat %in_ptr4 : !tt.ptr<f32> -> tensor<256x16x!tt.ptr<f32>> loc(#loc180)
+    %tmp25_130 = tt.addptr %tmp25_129, %tmp25_128 : tensor<256x16x!tt.ptr<f32>>, tensor<256x16xi32> loc(#loc180)
+    %tmp25_131 = tt.broadcast %tmp20_88 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc181)
+    %tmp25_132 = tt.broadcast %xmask_19 : tensor<1x16xi1> -> tensor<256x16xi1> loc(#loc181)
+    %tmp25_133 = arith.andi %tmp25_131, %tmp25_132 : tensor<256x16xi1> loc(#loc181)
+    %tmp25_134 = tt.broadcast %ymask_12 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc182)
+    %tmp25_135 = arith.andi %tmp25_133, %tmp25_134 : tensor<256x16xi1> loc(#loc182)
+    %tmp25_136 = arith.constant 0.000000e+00 : f32 loc(#loc183)
+    %tmp25_137 = arith.constant dense<0.000000e+00> : tensor<256x16xf32> loc(#loc183)
+    %tmp25_138 = tt.load %tmp25_130, %tmp25_135, %tmp25_137 evictionPolicy = evict_last : tensor<256x16x!tt.ptr<f32>> loc(#loc183)
+    %tmp26 = arith.constant 1.280000e+02 : f32 loc(#loc184)
+    %tmp27 = arith.constant dense<1.280000e+02> : tensor<256x16xf32> loc(#loc185)
+    %tmp27_139 = arith.divf %tmp25_138, %tmp27 : tensor<256x16xf32> loc(#loc185)
+    %tmp28 = arith.constant 9.99999997E-7 : f32 loc(#loc186)
+    %tmp29 = arith.constant dense<9.99999997E-7> : tensor<256x16xf32> loc(#loc187)
+    %tmp29_140 = arith.addf %tmp27_139, %tmp29 : tensor<256x16xf32> loc(#loc187)
+    %tmp30 = tt.extern_elementwise %tmp29_140 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<256x16xf32>) -> tensor<256x16xf32> loc(#loc188)
+    %tmp31 = arith.mulf %tmp23_119, %tmp30 : tensor<256x16xf32> loc(#loc189)
+    %tmp32 = tt.broadcast %xindex_18 : tensor<1x16xi32> -> tensor<256x16xi32> loc(#loc190)
+    %tmp32_141 = tt.splat %in_ptr5 : !tt.ptr<bf16> -> tensor<256x16x!tt.ptr<bf16>> loc(#loc191)
+    %tmp32_142 = tt.addptr %tmp32_141, %tmp32 : tensor<256x16x!tt.ptr<bf16>>, tensor<256x16xi32> loc(#loc191)
+    %tmp32_143 = tt.broadcast %tmp20_88 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc192)
+    %tmp32_144 = tt.broadcast %xmask_19 : tensor<1x16xi1> -> tensor<256x16xi1> loc(#loc192)
+    %tmp32_145 = arith.andi %tmp32_143, %tmp32_144 : tensor<256x16xi1> loc(#loc192)
+    %tmp32_146 = tt.broadcast %ymask_12 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc193)
+    %tmp32_147 = arith.andi %tmp32_145, %tmp32_146 : tensor<256x16xi1> loc(#loc193)
+    %tmp32_148 = arith.constant 0.000000e+00 : f32 loc(#loc194)
+    %tmp32_149 = arith.constant dense<0.000000e+00> : tensor<256x16xf32> loc(#loc194)
+    %tmp32_150 = arith.truncf %tmp32_149 : tensor<256x16xf32> to tensor<256x16xbf16> loc(#loc194)
+    %tmp32_151 = tt.load %tmp32_142, %tmp32_147, %tmp32_150 evictionPolicy = evict_last : tensor<256x16x!tt.ptr<bf16>> loc(#loc194)
+    %tmp32_152 = arith.extf %tmp32_151 : tensor<256x16xbf16> to tensor<256x16xf32> loc(#loc195)
+    %tmp34 = arith.mulf %tmp31, %tmp32_152 : tensor<256x16xf32> loc(#loc196)
+    %tmp36 = arith.constant 0.000000e+00 : f32 loc(#loc197)
+    %tmp36_153 = arith.constant dense<0.000000e+00> : tensor<256x16xf32> loc(#loc197)
+    %tmp37 = tt.broadcast %tmp20_88 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc198)
+    %tmp37_154 = arith.select %tmp37, %tmp34, %tmp36_153 : tensor<256x16xi1>, tensor<256x16xf32> loc(#loc198)
+    %tmp38 = tt.broadcast %tmp4_31 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc199)
+    %tmp38_155 = arith.select %tmp38, %tmp19_86, %tmp37_154 : tensor<256x16xi1>, tensor<256x16xf32> loc(#loc199)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc93)
+    %c128_i32_156 = arith.constant 128 : i32 loc(#loc93)
+    %cst = arith.constant dense<128> : tensor<256x1xi32> loc(#loc93)
+    %0 = arith.muli %cst, %yindex_11 : tensor<256x1xi32> loc(#loc93)
+    %1 = tt.broadcast %xindex_18 : tensor<1x16xi32> -> tensor<256x16xi32> loc(#loc94)
+    %2 = tt.broadcast %0 : tensor<256x1xi32> -> tensor<256x16xi32> loc(#loc94)
+    %3 = arith.addi %1, %2 : tensor<256x16xi32> loc(#loc94)
+    %4 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<256x16x!tt.ptr<bf16>> loc(#loc95)
+    %5 = tt.addptr %4, %3 : tensor<256x16x!tt.ptr<bf16>>, tensor<256x16xi32> loc(#loc95)
+    %6 = tt.broadcast %xmask_19 : tensor<1x16xi1> -> tensor<256x16xi1> loc(#loc96)
+    %7 = tt.broadcast %ymask_12 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc96)
+    %8 = arith.andi %6, %7 : tensor<256x16xi1> loc(#loc96)
+    %9 = arith.truncf %tmp38_155 : tensor<256x16xf32> to tensor<256x16xbf16> loc(#loc97)
+    tt.store %5, %9, %8 : tensor<256x16x!tt.ptr<bf16>> loc(#loc97)
+    tt.return loc(#loc98)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":20:13)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:36)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:36)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":32:30)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":33:19)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":34:32)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:60)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:87)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:95)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":39:11)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":41:12)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:51)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:78)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:86)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":49:38)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":52:34)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":53:19)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:40)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:36)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:65)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:70)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:98)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:106)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":57:12)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":59:12)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:51)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:79)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:87)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":67:38)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4)
+#loc108 = loc("ynumel"(#loc1))
+#loc109 = loc("xnumel"(#loc2))
+#loc110 = loc("yoffset"(#loc3))
+#loc111 = loc("yoffset"(#loc4))
+#loc112 = loc("yoffset"(#loc5))
+#loc113 = loc("yoffset"(#loc6))
+#loc114 = loc("yoffset"(#loc7))
+#loc115 = loc("yoffset"(#loc8))
+#loc116 = loc("yindex"(#loc9))
+#loc117 = loc("yindex"(#loc10))
+#loc118 = loc("yindex"(#loc11))
+#loc119 = loc("ymask"(#loc12))
+#loc120 = loc("xoffset"(#loc13))
+#loc121 = loc("xoffset"(#loc14))
+#loc122 = loc("xindex"(#loc15))
+#loc123 = loc("xindex"(#loc16))
+#loc124 = loc("xindex"(#loc17))
+#loc125 = loc("xmask"(#loc18))
+#loc126 = loc("y1"(#loc19))
+#loc127 = loc("y0"(#loc20))
+#loc128 = loc("tmp1"(#loc21))
+#loc129 = loc("tmp2"(#loc22))
+#loc130 = loc("tmp3"(#loc23))
+#loc131 = loc("tmp4"(#loc24))
+#loc132 = loc("tmp5"(#loc25))
+#loc133 = loc("tmp5"(#loc26))
+#loc134 = loc("tmp5"(#loc27))
+#loc135 = loc("tmp5"(#loc28))
+#loc136 = loc("tmp5"(#loc29))
+#loc137 = loc("tmp5"(#loc30))
+#loc138 = loc("tmp5"(#loc31))
+#loc139 = loc("tmp5"(#loc32))
+#loc140 = loc("tmp5"(#loc33))
+#loc141 = loc("tmp7"(#loc34))
+#loc142 = loc("tmp7"(#loc35))
+#loc143 = loc("tmp7"(#loc36))
+#loc144 = loc("tmp7"(#loc37))
+#loc145 = loc("tmp7"(#loc38))
+#loc146 = loc("tmp7"(#loc39))
+#loc147 = loc("tmp7"(#loc40))
+#loc148 = loc("tmp8"(#loc41))
+#loc149 = loc("tmp9"(#loc42))
+#loc150 = loc("tmp10"(#loc43))
+#loc151 = loc("tmp11"(#loc44))
+#loc152 = loc("tmp12"(#loc45))
+#loc153 = loc("tmp13"(#loc46))
+#loc154 = loc("tmp14"(#loc47))
+#loc155 = loc("tmp14"(#loc48))
+#loc156 = loc("tmp14"(#loc49))
+#loc157 = loc("tmp14"(#loc50))
+#loc158 = loc("tmp14"(#loc51))
+#loc159 = loc("tmp14"(#loc52))
+#loc160 = loc("tmp16"(#loc53))
+#loc161 = loc("tmp18"(#loc54))
+#loc162 = loc("tmp19"(#loc55))
+#loc163 = loc("tmp20"(#loc56))
+#loc164 = loc("tmp21"(#loc57))
+#loc165 = loc("tmp22"(#loc58))
+#loc166 = loc("tmp23"(#loc59))
+#loc167 = loc("tmp23"(#loc60))
+#loc168 = loc("tmp23"(#loc61))
+#loc169 = loc("tmp23"(#loc62))
+#loc170 = loc("tmp23"(#loc63))
+#loc171 = loc("tmp23"(#loc64))
+#loc172 = loc("tmp23"(#loc65))
+#loc173 = loc("tmp23"(#loc66))
+#loc174 = loc("tmp23"(#loc67))
+#loc175 = loc("tmp23"(#loc68))
+#loc176 = loc("tmp25"(#loc69))
+#loc177 = loc("tmp25"(#loc70))
+#loc178 = loc("tmp25"(#loc71))
+#loc179 = loc("tmp25"(#loc72))
+#loc180 = loc("tmp25"(#loc73))
+#loc181 = loc("tmp25"(#loc74))
+#loc182 = loc("tmp25"(#loc75))
+#loc183 = loc("tmp25"(#loc76))
+#loc184 = loc("tmp26"(#loc77))
+#loc185 = loc("tmp27"(#loc78))
+#loc186 = loc("tmp28"(#loc79))
+#loc187 = loc("tmp29"(#loc80))
+#loc188 = loc("tmp30"(#loc81))
+#loc189 = loc("tmp31"(#loc82))
+#loc190 = loc("tmp32"(#loc83))
+#loc191 = loc("tmp32"(#loc84))
+#loc192 = loc("tmp32"(#loc85))
+#loc193 = loc("tmp32"(#loc86))
+#loc194 = loc("tmp32"(#loc87))
+#loc195 = loc("tmp32"(#loc88))
+#loc196 = loc("tmp34"(#loc89))
+#loc197 = loc("tmp36"(#loc90))
+#loc198 = loc("tmp37"(#loc91))
+#loc199 = loc("tmp38"(#loc92))
diff --git a/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..c644293bb8f8b37f41fda54506ed01135819c69d
--- /dev/null
+++ b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir
@@ -0,0 +1,288 @@
+#blocked = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 4], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [16, 2], warpsPerCTA = [8, 1], order = [1, 0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0)
+#loc70 = loc("in_ptr0"(#loc))
+#loc71 = loc("in_ptr1"(#loc))
+#loc72 = loc("in_ptr2"(#loc))
+#loc73 = loc("in_ptr3"(#loc))
+#loc74 = loc("in_ptr4"(#loc))
+#loc75 = loc("in_ptr5"(#loc))
+#loc76 = loc("out_ptr0"(#loc))
+#loc77 = loc("ynumel"(#loc))
+#loc78 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<-256> : tensor<256x1xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<-256> : tensor<256x1xi32, #blocked1> loc(#loc1)
+    %cst_1 = arith.constant dense<12288> : tensor<256x1xi32, #blocked1> loc(#loc1)
+    %cst_2 = arith.constant dense<128> : tensor<256x1xi32, #blocked1> loc(#loc1)
+    %cst_3 = arith.constant dense<256> : tensor<256x1xi64, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<256> : tensor<256x1xi64, #blocked1> loc(#loc1)
+    %cst_5 = arith.constant dense<32> : tensor<256x1xi32, #blocked> loc(#loc1)
+    %cst_6 = arith.constant dense<32> : tensor<256x1xi32, #blocked1> loc(#loc1)
+    %cst_7 = arith.constant dense<128> : tensor<1x16xi32, #blocked> loc(#loc1)
+    %cst_8 = arith.constant dense<128> : tensor<1x16xi32, #blocked1> loc(#loc1)
+    %cst_9 = arith.constant dense<73728> : tensor<256x1xi32, #blocked> loc(#loc1)
+    %cst_10 = arith.constant dense<73728> : tensor<256x1xi32, #blocked1> loc(#loc1)
+    %c256_i32 = arith.constant 256 : i32 loc(#loc1)
+    %cst_11 = arith.constant dense<0.000000e+00> : tensor<256x16xbf16, #blocked1> loc(#loc1)
+    %cst_12 = arith.constant dense<0.000000e+00> : tensor<256x16xf32, #blocked> loc(#loc1)
+    %c16_i32 = arith.constant 16 : i32 loc(#loc1)
+    %cst_13 = arith.constant dense<9.99999997E-7> : tensor<256x16xf32, #blocked> loc(#loc1)
+    %cst_14 = arith.constant dense<1.280000e+02> : tensor<256x16xf32, #blocked> loc(#loc1)
+    %cst_15 = arith.constant dense<0.000000e+00> : tensor<256x16xf32, #blocked1> loc(#loc1)
+    %yoffset = tt.get_program_id y : i32 loc(#loc79)
+    %yoffset_16 = tt.get_program_id z : i32 loc(#loc80)
+    %yoffset_17 = tt.get_num_programs y : i32 loc(#loc81)
+    %yoffset_18 = arith.muli %yoffset_16, %yoffset_17 : i32 loc(#loc82)
+    %yoffset_19 = arith.addi %yoffset, %yoffset_18 : i32 loc(#loc83)
+    %yoffset_20 = arith.muli %yoffset_19, %c256_i32 : i32 loc(#loc84)
+    %yindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc85)
+    %yindex_21 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc85)
+    %yindex_22 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<256xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<256x1xi32, #blocked1> loc(#loc85)
+    %yindex_23 = tt.expand_dims %yindex_21 {axis = 1 : i32} : tensor<256xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<256x1xi32, #blocked> loc(#loc85)
+    %yindex_24 = tt.splat %yoffset_20 : i32 -> tensor<256x1xi32, #blocked1> loc(#loc86)
+    %yindex_25 = tt.splat %yoffset_20 : i32 -> tensor<256x1xi32, #blocked> loc(#loc86)
+    %yindex_26 = arith.addi %yindex_24, %yindex_22 : tensor<256x1xi32, #blocked1> loc(#loc86)
+    %yindex_27 = arith.addi %yindex_25, %yindex_23 : tensor<256x1xi32, #blocked> loc(#loc86)
+    %ymask = arith.cmpi slt, %yindex_26, %cst_10 : tensor<256x1xi32, #blocked1> loc(#loc87)
+    %ymask_28 = arith.cmpi slt, %yindex_27, %cst_9 : tensor<256x1xi32, #blocked> loc(#loc87)
+    %xoffset = tt.get_program_id x : i32 loc(#loc88)
+    %xoffset_29 = arith.muli %xoffset, %c16_i32 : i32 loc(#loc89)
+    %xindex = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc90)
+    %xindex_30 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc90)
+    %xindex_31 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x16xi32, #blocked1> loc(#loc90)
+    %xindex_32 = tt.expand_dims %xindex_30 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc90)
+    %xindex_33 = tt.splat %xoffset_29 : i32 -> tensor<1x16xi32, #blocked1> loc(#loc91)
+    %xindex_34 = tt.splat %xoffset_29 : i32 -> tensor<1x16xi32, #blocked> loc(#loc91)
+    %xindex_35 = arith.addi %xindex_33, %xindex_31 : tensor<1x16xi32, #blocked1> loc(#loc91)
+    %xindex_36 = arith.addi %xindex_34, %xindex_32 : tensor<1x16xi32, #blocked> loc(#loc91)
+    %xmask = arith.cmpi slt, %xindex_35, %cst_8 : tensor<1x16xi32, #blocked1> loc(#loc92)
+    %xmask_37 = arith.cmpi slt, %xindex_36, %cst_7 : tensor<1x16xi32, #blocked> loc(#loc92)
+    %y1 = arith.divsi %yindex_26, %cst_6 : tensor<256x1xi32, #blocked1> loc(#loc93)
+    %y1_38 = arith.divsi %yindex_27, %cst_5 : tensor<256x1xi32, #blocked> loc(#loc93)
+    %y0 = arith.remsi %yindex_26, %cst_6 : tensor<256x1xi32, #blocked1> loc(#loc94)
+    %y0_39 = arith.remsi %yindex_27, %cst_5 : tensor<256x1xi32, #blocked> loc(#loc94)
+    %tmp4 = arith.extsi %y1 : tensor<256x1xi32, #blocked1> to tensor<256x1xi64, #blocked1> loc(#loc95)
+    %tmp4_40 = arith.extsi %y1_38 : tensor<256x1xi32, #blocked> to tensor<256x1xi64, #blocked> loc(#loc95)
+    %tmp4_41 = arith.cmpi slt, %tmp4, %cst_4 : tensor<256x1xi64, #blocked1> loc(#loc95)
+    %tmp4_42 = arith.cmpi slt, %tmp4_40, %cst_3 : tensor<256x1xi64, #blocked> loc(#loc95)
+    %tmp5 = arith.muli %y0, %cst_2 : tensor<256x1xi32, #blocked1> loc(#loc96)
+    %tmp5_43 = tt.broadcast %xindex_35 : tensor<1x16xi32, #blocked1> -> tensor<256x16xi32, #blocked1> loc(#loc97)
+    %tmp5_44 = tt.broadcast %tmp5 : tensor<256x1xi32, #blocked1> -> tensor<256x16xi32, #blocked1> loc(#loc97)
+    %tmp5_45 = arith.addi %tmp5_43, %tmp5_44 : tensor<256x16xi32, #blocked1> loc(#loc97)
+    %tmp5_46 = arith.muli %y1, %cst_1 : tensor<256x1xi32, #blocked1> loc(#loc98)
+    %tmp5_47 = tt.broadcast %tmp5_46 : tensor<256x1xi32, #blocked1> -> tensor<256x16xi32, #blocked1> loc(#loc99)
+    %tmp5_48 = arith.addi %tmp5_45, %tmp5_47 : tensor<256x16xi32, #blocked1> loc(#loc99)
+    %tmp5_49 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<256x16x!tt.ptr<bf16>, #blocked1> loc(#loc100)
+    %tmp5_50 = tt.addptr %tmp5_49, %tmp5_48 : tensor<256x16x!tt.ptr<bf16>, #blocked1>, tensor<256x16xi32, #blocked1> loc(#loc100)
+    %tmp5_51 = tt.broadcast %tmp4_41 : tensor<256x1xi1, #blocked1> -> tensor<256x16xi1, #blocked1> loc(#loc101)
+    %tmp5_52 = tt.broadcast %tmp4_42 : tensor<256x1xi1, #blocked> -> tensor<256x16xi1, #blocked> loc(#loc101)
+    %tmp5_53 = tt.broadcast %xmask : tensor<1x16xi1, #blocked1> -> tensor<256x16xi1, #blocked1> loc(#loc101)
+    %tmp5_54 = tt.broadcast %xmask_37 : tensor<1x16xi1, #blocked> -> tensor<256x16xi1, #blocked> loc(#loc101)
+    %tmp5_55 = arith.andi %tmp5_51, %tmp5_53 : tensor<256x16xi1, #blocked1> loc(#loc101)
+    %tmp5_56 = arith.andi %tmp5_52, %tmp5_54 : tensor<256x16xi1, #blocked> loc(#loc101)
+    %tmp5_57 = tt.broadcast %ymask : tensor<256x1xi1, #blocked1> -> tensor<256x16xi1, #blocked1> loc(#loc102)
+    %tmp5_58 = tt.broadcast %ymask_28 : tensor<256x1xi1, #blocked> -> tensor<256x16xi1, #blocked> loc(#loc102)
+    %tmp5_59 = arith.andi %tmp5_55, %tmp5_57 : tensor<256x16xi1, #blocked1> loc(#loc102)
+    %tmp5_60 = arith.andi %tmp5_56, %tmp5_58 : tensor<256x16xi1, #blocked> loc(#loc102)
+    %tmp5_61 = tt.load %tmp5_50, %tmp5_59, %cst_11 evictionPolicy = evict_last : tensor<256x16x!tt.ptr<bf16>, #blocked1> loc(#loc103)
+    %tmp5_62 = ttg.convert_layout %tmp5_61 : tensor<256x16xbf16, #blocked1> -> tensor<256x16xbf16, #blocked> loc(#loc104)
+    %tmp5_63 = arith.extf %tmp5_62 : tensor<256x16xbf16, #blocked> to tensor<256x16xf32, #blocked> loc(#loc104)
+    %tmp7 = arith.muli %y1_38, %cst_5 : tensor<256x1xi32, #blocked> loc(#loc105)
+    %tmp7_64 = arith.addi %y0_39, %tmp7 : tensor<256x1xi32, #blocked> loc(#loc106)
+    %tmp7_65 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<256x1x!tt.ptr<f32>, #blocked> loc(#loc107)
+    %tmp7_66 = tt.addptr %tmp7_65, %tmp7_64 : tensor<256x1x!tt.ptr<f32>, #blocked>, tensor<256x1xi32, #blocked> loc(#loc107)
+    %tmp7_67 = tt.broadcast %tmp7_66 : tensor<256x1x!tt.ptr<f32>, #blocked> -> tensor<256x16x!tt.ptr<f32>, #blocked> loc(#loc107)
+    %tmp7_68 = tt.load %tmp7_67, %tmp5_60, %cst_12 evictionPolicy = evict_last : tensor<256x16x!tt.ptr<f32>, #blocked> loc(#loc108)
+    %tmp9 = arith.divf %tmp7_68, %cst_14 : tensor<256x16xf32, #blocked> loc(#loc109)
+    %tmp11 = arith.addf %tmp9, %cst_13 : tensor<256x16xf32, #blocked> loc(#loc110)
+    %tmp12 = tt.extern_elementwise %tmp11 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<256x16xf32, #blocked>) -> tensor<256x16xf32, #blocked> loc(#loc111)
+    %tmp13 = arith.mulf %tmp5_63, %tmp12 : tensor<256x16xf32, #blocked> loc(#loc112)
+    %tmp13_69 = ttg.convert_layout %tmp13 : tensor<256x16xf32, #blocked> -> tensor<256x16xf32, #blocked1> loc(#loc112)
+    %tmp14 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x16x!tt.ptr<bf16>, #blocked1> loc(#loc113)
+    %tmp14_70 = tt.addptr %tmp14, %xindex_35 : tensor<1x16x!tt.ptr<bf16>, #blocked1>, tensor<1x16xi32, #blocked1> loc(#loc113)
+    %tmp14_71 = tt.broadcast %tmp14_70 : tensor<1x16x!tt.ptr<bf16>, #blocked1> -> tensor<256x16x!tt.ptr<bf16>, #blocked1> loc(#loc113)
+    %tmp14_72 = tt.load %tmp14_71, %tmp5_59, %cst_11 evictionPolicy = evict_last : tensor<256x16x!tt.ptr<bf16>, #blocked1> loc(#loc114)
+    %tmp14_73 = arith.extf %tmp14_72 : tensor<256x16xbf16, #blocked1> to tensor<256x16xf32, #blocked1> loc(#loc115)
+    %tmp16 = arith.mulf %tmp13_69, %tmp14_73 : tensor<256x16xf32, #blocked1> loc(#loc116)
+    %tmp20 = arith.cmpi sge, %tmp4, %cst_4 : tensor<256x1xi64, #blocked1> loc(#loc117)
+    %tmp20_74 = arith.cmpi sge, %tmp4_40, %cst_3 : tensor<256x1xi64, #blocked> loc(#loc117)
+    %tmp23 = arith.addi %y1, %cst_0 : tensor<256x1xi32, #blocked1> loc(#loc118)
+    %tmp23_75 = arith.addi %y1_38, %cst : tensor<256x1xi32, #blocked> loc(#loc118)
+    %tmp23_76 = arith.muli %tmp23, %cst_1 : tensor<256x1xi32, #blocked1> loc(#loc119)
+    %tmp23_77 = tt.broadcast %tmp23_76 : tensor<256x1xi32, #blocked1> -> tensor<256x16xi32, #blocked1> loc(#loc120)
+    %tmp23_78 = arith.addi %tmp5_45, %tmp23_77 : tensor<256x16xi32, #blocked1> loc(#loc120)
+    %tmp23_79 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<256x16x!tt.ptr<bf16>, #blocked1> loc(#loc121)
+    %tmp23_80 = tt.addptr %tmp23_79, %tmp23_78 : tensor<256x16x!tt.ptr<bf16>, #blocked1>, tensor<256x16xi32, #blocked1> loc(#loc121)
+    %tmp23_81 = tt.broadcast %tmp20 : tensor<256x1xi1, #blocked1> -> tensor<256x16xi1, #blocked1> loc(#loc122)
+    %tmp23_82 = tt.broadcast %tmp20_74 : tensor<256x1xi1, #blocked> -> tensor<256x16xi1, #blocked> loc(#loc122)
+    %tmp23_83 = arith.andi %tmp23_81, %tmp5_53 : tensor<256x16xi1, #blocked1> loc(#loc122)
+    %tmp23_84 = arith.andi %tmp23_82, %tmp5_54 : tensor<256x16xi1, #blocked> loc(#loc122)
+    %tmp23_85 = arith.andi %tmp23_83, %tmp5_57 : tensor<256x16xi1, #blocked1> loc(#loc123)
+    %tmp23_86 = arith.andi %tmp23_84, %tmp5_58 : tensor<256x16xi1, #blocked> loc(#loc123)
+    %tmp23_87 = tt.load %tmp23_80, %tmp23_85, %cst_11 evictionPolicy = evict_last : tensor<256x16x!tt.ptr<bf16>, #blocked1> loc(#loc124)
+    %tmp23_88 = ttg.convert_layout %tmp23_87 : tensor<256x16xbf16, #blocked1> -> tensor<256x16xbf16, #blocked> loc(#loc125)
+    %tmp23_89 = arith.extf %tmp23_88 : tensor<256x16xbf16, #blocked> to tensor<256x16xf32, #blocked> loc(#loc125)
+    %tmp25 = arith.muli %tmp23_75, %cst_5 : tensor<256x1xi32, #blocked> loc(#loc126)
+    %tmp25_90 = arith.addi %y0_39, %tmp25 : tensor<256x1xi32, #blocked> loc(#loc127)
+    %tmp25_91 = tt.splat %in_ptr4 : !tt.ptr<f32> -> tensor<256x1x!tt.ptr<f32>, #blocked> loc(#loc128)
+    %tmp25_92 = tt.addptr %tmp25_91, %tmp25_90 : tensor<256x1x!tt.ptr<f32>, #blocked>, tensor<256x1xi32, #blocked> loc(#loc128)
+    %tmp25_93 = tt.broadcast %tmp25_92 : tensor<256x1x!tt.ptr<f32>, #blocked> -> tensor<256x16x!tt.ptr<f32>, #blocked> loc(#loc128)
+    %tmp25_94 = tt.load %tmp25_93, %tmp23_86, %cst_12 evictionPolicy = evict_last : tensor<256x16x!tt.ptr<f32>, #blocked> loc(#loc129)
+    %tmp27 = arith.divf %tmp25_94, %cst_14 : tensor<256x16xf32, #blocked> loc(#loc130)
+    %tmp29 = arith.addf %tmp27, %cst_13 : tensor<256x16xf32, #blocked> loc(#loc131)
+    %tmp30 = tt.extern_elementwise %tmp29 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<256x16xf32, #blocked>) -> tensor<256x16xf32, #blocked> loc(#loc132)
+    %tmp31 = arith.mulf %tmp23_89, %tmp30 : tensor<256x16xf32, #blocked> loc(#loc133)
+    %tmp31_95 = ttg.convert_layout %tmp31 : tensor<256x16xf32, #blocked> -> tensor<256x16xf32, #blocked1> loc(#loc133)
+    %tmp32 = tt.splat %in_ptr5 : !tt.ptr<bf16> -> tensor<1x16x!tt.ptr<bf16>, #blocked1> loc(#loc134)
+    %tmp32_96 = tt.addptr %tmp32, %xindex_35 : tensor<1x16x!tt.ptr<bf16>, #blocked1>, tensor<1x16xi32, #blocked1> loc(#loc134)
+    %tmp32_97 = tt.broadcast %tmp32_96 : tensor<1x16x!tt.ptr<bf16>, #blocked1> -> tensor<256x16x!tt.ptr<bf16>, #blocked1> loc(#loc134)
+    %tmp32_98 = tt.load %tmp32_97, %tmp23_85, %cst_11 evictionPolicy = evict_last : tensor<256x16x!tt.ptr<bf16>, #blocked1> loc(#loc135)
+    %tmp32_99 = arith.extf %tmp32_98 : tensor<256x16xbf16, #blocked1> to tensor<256x16xf32, #blocked1> loc(#loc136)
+    %tmp34 = arith.mulf %tmp31_95, %tmp32_99 : tensor<256x16xf32, #blocked1> loc(#loc137)
+    %tmp37 = arith.select %tmp23_81, %tmp34, %cst_15 : tensor<256x16xi1, #blocked1>, tensor<256x16xf32, #blocked1> loc(#loc138)
+    %tmp38 = arith.select %tmp5_51, %tmp16, %tmp37 : tensor<256x16xi1, #blocked1>, tensor<256x16xf32, #blocked1> loc(#loc141)
+    %0 = arith.muli %yindex_26, %cst_2 : tensor<256x1xi32, #blocked1> loc(#loc64)
+    %1 = tt.broadcast %0 : tensor<256x1xi32, #blocked1> -> tensor<256x16xi32, #blocked1> loc(#loc65)
+    %2 = arith.addi %tmp5_43, %1 : tensor<256x16xi32, #blocked1> loc(#loc65)
+    %3 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<256x16x!tt.ptr<bf16>, #blocked1> loc(#loc66)
+    %4 = tt.addptr %3, %2 : tensor<256x16x!tt.ptr<bf16>, #blocked1>, tensor<256x16xi32, #blocked1> loc(#loc66)
+    %5 = arith.andi %tmp5_53, %tmp5_57 : tensor<256x16xi1, #blocked1> loc(#loc67)
+    %6 = arith.truncf %tmp38 : tensor<256x16xf32, #blocked1> to tensor<256x16xbf16, #blocked1> loc(#loc68)
+    tt.store %4, %6, %5 : tensor<256x16x!tt.ptr<bf16>, #blocked1> loc(#loc68)
+    tt.return loc(#loc69)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4)
+#loc79 = loc("yoffset"(#loc2))
+#loc80 = loc("yoffset"(#loc3))
+#loc81 = loc("yoffset"(#loc4))
+#loc82 = loc("yoffset"(#loc5))
+#loc83 = loc("yoffset"(#loc6))
+#loc84 = loc("yoffset"(#loc7))
+#loc85 = loc("yindex"(#loc8))
+#loc86 = loc("yindex"(#loc9))
+#loc87 = loc("ymask"(#loc10))
+#loc88 = loc("xoffset"(#loc11))
+#loc89 = loc("xoffset"(#loc12))
+#loc90 = loc("xindex"(#loc13))
+#loc91 = loc("xindex"(#loc14))
+#loc92 = loc("xmask"(#loc15))
+#loc93 = loc("y1"(#loc16))
+#loc94 = loc("y0"(#loc17))
+#loc95 = loc("tmp4"(#loc18))
+#loc96 = loc("tmp5"(#loc19))
+#loc97 = loc("tmp5"(#loc20))
+#loc98 = loc("tmp5"(#loc21))
+#loc99 = loc("tmp5"(#loc22))
+#loc100 = loc("tmp5"(#loc23))
+#loc101 = loc("tmp5"(#loc24))
+#loc102 = loc("tmp5"(#loc25))
+#loc103 = loc("tmp5"(#loc26))
+#loc104 = loc("tmp5"(#loc27))
+#loc105 = loc("tmp7"(#loc28))
+#loc106 = loc("tmp7"(#loc29))
+#loc107 = loc("tmp7"(#loc30))
+#loc108 = loc("tmp7"(#loc31))
+#loc109 = loc("tmp9"(#loc32))
+#loc110 = loc("tmp11"(#loc33))
+#loc111 = loc("tmp12"(#loc34))
+#loc112 = loc("tmp13"(#loc35))
+#loc113 = loc("tmp14"(#loc36))
+#loc114 = loc("tmp14"(#loc37))
+#loc115 = loc("tmp14"(#loc38))
+#loc116 = loc("tmp16"(#loc39))
+#loc117 = loc("tmp20"(#loc40))
+#loc118 = loc("tmp23"(#loc41))
+#loc119 = loc("tmp23"(#loc42))
+#loc120 = loc("tmp23"(#loc43))
+#loc121 = loc("tmp23"(#loc44))
+#loc122 = loc("tmp23"(#loc45))
+#loc123 = loc("tmp23"(#loc46))
+#loc124 = loc("tmp23"(#loc47))
+#loc125 = loc("tmp23"(#loc48))
+#loc126 = loc("tmp25"(#loc49))
+#loc127 = loc("tmp25"(#loc50))
+#loc128 = loc("tmp25"(#loc51))
+#loc129 = loc("tmp25"(#loc52))
+#loc130 = loc("tmp27"(#loc53))
+#loc131 = loc("tmp29"(#loc54))
+#loc132 = loc("tmp30"(#loc55))
+#loc133 = loc("tmp31"(#loc56))
+#loc134 = loc("tmp32"(#loc57))
+#loc135 = loc("tmp32"(#loc58))
+#loc136 = loc("tmp32"(#loc59))
+#loc137 = loc("tmp34"(#loc60))
+#loc138 = loc("tmp37"(#loc61))
+#loc139 = loc("tmp38"(#loc62))
+#loc140 = loc("tmp19"(#loc63))
+#loc141 = loc(fused[#loc139, #loc140])
diff --git a/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.ttir b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..c52775319a63d328fe709f5d56118eb499b143d7
--- /dev/null
+++ b/triton/V55R5JX27SH3ZP2ZK3XX5KCHKG3ODHZ7KRAVANZA47FHWYLN35WA/triton_poi_fused__fused_rms_norm_cat_view_2.ttir
@@ -0,0 +1,256 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0)
+#loc72 = loc("in_ptr0"(#loc))
+#loc73 = loc("in_ptr1"(#loc))
+#loc74 = loc("in_ptr2"(#loc))
+#loc75 = loc("in_ptr3"(#loc))
+#loc76 = loc("in_ptr4"(#loc))
+#loc77 = loc("in_ptr5"(#loc))
+#loc78 = loc("out_ptr0"(#loc))
+#loc79 = loc("ynumel"(#loc))
+#loc80 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<256x16xbf16> loc(#loc1)
+    %cst_0 = arith.constant dense<-256> : tensor<256x1xi32> loc(#loc1)
+    %cst_1 = arith.constant dense<9.99999997E-7> : tensor<256x16xf32> loc(#loc1)
+    %cst_2 = arith.constant dense<1.280000e+02> : tensor<256x16xf32> loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<256x16xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<12288> : tensor<256x1xi32> loc(#loc1)
+    %cst_5 = arith.constant dense<128> : tensor<256x1xi32> loc(#loc1)
+    %cst_6 = arith.constant dense<256> : tensor<256x1xi64> loc(#loc1)
+    %cst_7 = arith.constant dense<32> : tensor<256x1xi32> loc(#loc1)
+    %xmask = arith.constant dense<128> : tensor<1x16xi32> loc(#loc81)
+    %c16_i32 = arith.constant 16 : i32 loc(#loc1)
+    %ymask = arith.constant dense<73728> : tensor<256x1xi32> loc(#loc82)
+    %c256_i32 = arith.constant 256 : i32 loc(#loc1)
+    %yoffset = tt.get_program_id y : i32 loc(#loc83)
+    %yoffset_8 = tt.get_program_id z : i32 loc(#loc84)
+    %yoffset_9 = tt.get_num_programs y : i32 loc(#loc85)
+    %yoffset_10 = arith.muli %yoffset_8, %yoffset_9 : i32 loc(#loc86)
+    %yoffset_11 = arith.addi %yoffset, %yoffset_10 : i32 loc(#loc87)
+    %yoffset_12 = arith.muli %yoffset_11, %c256_i32 : i32 loc(#loc88)
+    %yindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc89)
+    %yindex_13 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<256xi32> -> tensor<256x1xi32> loc(#loc90)
+    %yindex_14 = tt.splat %yoffset_12 : i32 -> tensor<256x1xi32> loc(#loc91)
+    %yindex_15 = arith.addi %yindex_14, %yindex_13 : tensor<256x1xi32> loc(#loc91)
+    %ymask_16 = arith.cmpi slt, %yindex_15, %ymask : tensor<256x1xi32> loc(#loc82)
+    %xoffset = tt.get_program_id x : i32 loc(#loc92)
+    %xoffset_17 = arith.muli %xoffset, %c16_i32 : i32 loc(#loc93)
+    %xindex = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc94)
+    %xindex_18 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc95)
+    %xindex_19 = tt.splat %xoffset_17 : i32 -> tensor<1x16xi32> loc(#loc96)
+    %xindex_20 = arith.addi %xindex_19, %xindex_18 : tensor<1x16xi32> loc(#loc96)
+    %xmask_21 = arith.cmpi slt, %xindex_20, %xmask : tensor<1x16xi32> loc(#loc81)
+    %y1 = arith.divsi %yindex_15, %cst_7 : tensor<256x1xi32> loc(#loc97)
+    %y0 = arith.remsi %yindex_15, %cst_7 : tensor<256x1xi32> loc(#loc98)
+    %tmp4 = arith.extsi %y1 : tensor<256x1xi32> to tensor<256x1xi64> loc(#loc99)
+    %tmp4_22 = arith.cmpi slt, %tmp4, %cst_6 : tensor<256x1xi64> loc(#loc99)
+    %tmp5 = arith.muli %y0, %cst_5 : tensor<256x1xi32> loc(#loc100)
+    %tmp5_23 = tt.broadcast %xindex_20 : tensor<1x16xi32> -> tensor<256x16xi32> loc(#loc101)
+    %tmp5_24 = tt.broadcast %tmp5 : tensor<256x1xi32> -> tensor<256x16xi32> loc(#loc101)
+    %tmp5_25 = arith.addi %tmp5_23, %tmp5_24 : tensor<256x16xi32> loc(#loc101)
+    %tmp5_26 = arith.muli %y1, %cst_4 : tensor<256x1xi32> loc(#loc102)
+    %tmp5_27 = tt.broadcast %tmp5_26 : tensor<256x1xi32> -> tensor<256x16xi32> loc(#loc103)
+    %tmp5_28 = arith.addi %tmp5_25, %tmp5_27 : tensor<256x16xi32> loc(#loc103)
+    %tmp5_29 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<256x16x!tt.ptr<bf16>> loc(#loc104)
+    %tmp5_30 = tt.addptr %tmp5_29, %tmp5_28 : tensor<256x16x!tt.ptr<bf16>>, tensor<256x16xi32> loc(#loc104)
+    %tmp5_31 = tt.broadcast %tmp4_22 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc105)
+    %tmp5_32 = tt.broadcast %xmask_21 : tensor<1x16xi1> -> tensor<256x16xi1> loc(#loc105)
+    %tmp5_33 = arith.andi %tmp5_31, %tmp5_32 : tensor<256x16xi1> loc(#loc105)
+    %tmp5_34 = tt.broadcast %ymask_16 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc106)
+    %tmp5_35 = arith.andi %tmp5_33, %tmp5_34 : tensor<256x16xi1> loc(#loc106)
+    %tmp5_36 = tt.load %tmp5_30, %tmp5_35, %cst evictionPolicy = evict_last : tensor<256x16x!tt.ptr<bf16>> loc(#loc107)
+    %tmp5_37 = arith.extf %tmp5_36 : tensor<256x16xbf16> to tensor<256x16xf32> loc(#loc108)
+    %tmp7 = arith.muli %y1, %cst_7 : tensor<256x1xi32> loc(#loc109)
+    %tmp7_38 = arith.addi %y0, %tmp7 : tensor<256x1xi32> loc(#loc110)
+    %tmp7_39 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<256x1x!tt.ptr<f32>> loc(#loc111)
+    %tmp7_40 = tt.addptr %tmp7_39, %tmp7_38 : tensor<256x1x!tt.ptr<f32>>, tensor<256x1xi32> loc(#loc111)
+    %tmp7_41 = tt.broadcast %tmp7_40 : tensor<256x1x!tt.ptr<f32>> -> tensor<256x16x!tt.ptr<f32>> loc(#loc111)
+    %tmp7_42 = tt.load %tmp7_41, %tmp5_35, %cst_3 evictionPolicy = evict_last : tensor<256x16x!tt.ptr<f32>> loc(#loc112)
+    %tmp9 = arith.divf %tmp7_42, %cst_2 : tensor<256x16xf32> loc(#loc113)
+    %tmp11 = arith.addf %tmp9, %cst_1 : tensor<256x16xf32> loc(#loc114)
+    %tmp12 = tt.extern_elementwise %tmp11 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<256x16xf32>) -> tensor<256x16xf32> loc(#loc115)
+    %tmp13 = arith.mulf %tmp5_37, %tmp12 : tensor<256x16xf32> loc(#loc116)
+    %tmp14 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x16x!tt.ptr<bf16>> loc(#loc117)
+    %tmp14_43 = tt.addptr %tmp14, %xindex_20 : tensor<1x16x!tt.ptr<bf16>>, tensor<1x16xi32> loc(#loc117)
+    %tmp14_44 = tt.broadcast %tmp14_43 : tensor<1x16x!tt.ptr<bf16>> -> tensor<256x16x!tt.ptr<bf16>> loc(#loc117)
+    %tmp14_45 = tt.load %tmp14_44, %tmp5_35, %cst evictionPolicy = evict_last : tensor<256x16x!tt.ptr<bf16>> loc(#loc118)
+    %tmp14_46 = arith.extf %tmp14_45 : tensor<256x16xbf16> to tensor<256x16xf32> loc(#loc119)
+    %tmp16 = arith.mulf %tmp13, %tmp14_46 : tensor<256x16xf32> loc(#loc120)
+    %tmp19 = arith.select %tmp5_31, %tmp16, %cst_3 : tensor<256x16xi1>, tensor<256x16xf32> loc(#loc121)
+    %tmp20 = arith.cmpi sge, %tmp4, %cst_6 : tensor<256x1xi64> loc(#loc122)
+    %tmp23 = arith.addi %y1, %cst_0 : tensor<256x1xi32> loc(#loc123)
+    %tmp23_47 = arith.muli %tmp23, %cst_4 : tensor<256x1xi32> loc(#loc124)
+    %tmp23_48 = tt.broadcast %tmp23_47 : tensor<256x1xi32> -> tensor<256x16xi32> loc(#loc125)
+    %tmp23_49 = arith.addi %tmp5_25, %tmp23_48 : tensor<256x16xi32> loc(#loc125)
+    %tmp23_50 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<256x16x!tt.ptr<bf16>> loc(#loc126)
+    %tmp23_51 = tt.addptr %tmp23_50, %tmp23_49 : tensor<256x16x!tt.ptr<bf16>>, tensor<256x16xi32> loc(#loc126)
+    %tmp23_52 = tt.broadcast %tmp20 : tensor<256x1xi1> -> tensor<256x16xi1> loc(#loc127)
+    %tmp23_53 = arith.andi %tmp23_52, %tmp5_32 : tensor<256x16xi1> loc(#loc127)
+    %tmp23_54 = arith.andi %tmp23_53, %tmp5_34 : tensor<256x16xi1> loc(#loc128)
+    %tmp23_55 = tt.load %tmp23_51, %tmp23_54, %cst evictionPolicy = evict_last : tensor<256x16x!tt.ptr<bf16>> loc(#loc129)
+    %tmp23_56 = arith.extf %tmp23_55 : tensor<256x16xbf16> to tensor<256x16xf32> loc(#loc130)
+    %tmp25 = arith.muli %tmp23, %cst_7 : tensor<256x1xi32> loc(#loc131)
+    %tmp25_57 = arith.addi %y0, %tmp25 : tensor<256x1xi32> loc(#loc132)
+    %tmp25_58 = tt.splat %in_ptr4 : !tt.ptr<f32> -> tensor<256x1x!tt.ptr<f32>> loc(#loc133)
+    %tmp25_59 = tt.addptr %tmp25_58, %tmp25_57 : tensor<256x1x!tt.ptr<f32>>, tensor<256x1xi32> loc(#loc133)
+    %tmp25_60 = tt.broadcast %tmp25_59 : tensor<256x1x!tt.ptr<f32>> -> tensor<256x16x!tt.ptr<f32>> loc(#loc133)
+    %tmp25_61 = tt.load %tmp25_60, %tmp23_54, %cst_3 evictionPolicy = evict_last : tensor<256x16x!tt.ptr<f32>> loc(#loc134)
+    %tmp27 = arith.divf %tmp25_61, %cst_2 : tensor<256x16xf32> loc(#loc135)
+    %tmp29 = arith.addf %tmp27, %cst_1 : tensor<256x16xf32> loc(#loc136)
+    %tmp30 = tt.extern_elementwise %tmp29 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<256x16xf32>) -> tensor<256x16xf32> loc(#loc137)
+    %tmp31 = arith.mulf %tmp23_56, %tmp30 : tensor<256x16xf32> loc(#loc138)
+    %tmp32 = tt.splat %in_ptr5 : !tt.ptr<bf16> -> tensor<1x16x!tt.ptr<bf16>> loc(#loc139)
+    %tmp32_62 = tt.addptr %tmp32, %xindex_20 : tensor<1x16x!tt.ptr<bf16>>, tensor<1x16xi32> loc(#loc139)
+    %tmp32_63 = tt.broadcast %tmp32_62 : tensor<1x16x!tt.ptr<bf16>> -> tensor<256x16x!tt.ptr<bf16>> loc(#loc139)
+    %tmp32_64 = tt.load %tmp32_63, %tmp23_54, %cst evictionPolicy = evict_last : tensor<256x16x!tt.ptr<bf16>> loc(#loc140)
+    %tmp32_65 = arith.extf %tmp32_64 : tensor<256x16xbf16> to tensor<256x16xf32> loc(#loc141)
+    %tmp34 = arith.mulf %tmp31, %tmp32_65 : tensor<256x16xf32> loc(#loc142)
+    %tmp37 = arith.select %tmp23_52, %tmp34, %cst_3 : tensor<256x16xi1>, tensor<256x16xf32> loc(#loc143)
+    %tmp38 = arith.select %tmp5_31, %tmp19, %tmp37 : tensor<256x16xi1>, tensor<256x16xf32> loc(#loc144)
+    %0 = arith.muli %yindex_15, %cst_5 : tensor<256x1xi32> loc(#loc66)
+    %1 = tt.broadcast %0 : tensor<256x1xi32> -> tensor<256x16xi32> loc(#loc67)
+    %2 = arith.addi %tmp5_23, %1 : tensor<256x16xi32> loc(#loc67)
+    %3 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<256x16x!tt.ptr<bf16>> loc(#loc68)
+    %4 = tt.addptr %3, %2 : tensor<256x16x!tt.ptr<bf16>>, tensor<256x16xi32> loc(#loc68)
+    %5 = arith.andi %tmp5_32, %tmp5_34 : tensor<256x16xi1> loc(#loc69)
+    %6 = arith.truncf %tmp38 : tensor<256x16xf32> to tensor<256x16xbf16> loc(#loc70)
+    tt.store %4, %6, %5 : tensor<256x16x!tt.ptr<bf16>> loc(#loc70)
+    tt.return loc(#loc71)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:36)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:36)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4)
+#loc81 = loc("xmask"(#loc2))
+#loc82 = loc("ymask"(#loc3))
+#loc83 = loc("yoffset"(#loc4))
+#loc84 = loc("yoffset"(#loc5))
+#loc85 = loc("yoffset"(#loc6))
+#loc86 = loc("yoffset"(#loc7))
+#loc87 = loc("yoffset"(#loc8))
+#loc88 = loc("yoffset"(#loc9))
+#loc89 = loc("yindex"(#loc10))
+#loc90 = loc("yindex"(#loc11))
+#loc91 = loc("yindex"(#loc12))
+#loc92 = loc("xoffset"(#loc13))
+#loc93 = loc("xoffset"(#loc14))
+#loc94 = loc("xindex"(#loc15))
+#loc95 = loc("xindex"(#loc16))
+#loc96 = loc("xindex"(#loc17))
+#loc97 = loc("y1"(#loc18))
+#loc98 = loc("y0"(#loc19))
+#loc99 = loc("tmp4"(#loc20))
+#loc100 = loc("tmp5"(#loc21))
+#loc101 = loc("tmp5"(#loc22))
+#loc102 = loc("tmp5"(#loc23))
+#loc103 = loc("tmp5"(#loc24))
+#loc104 = loc("tmp5"(#loc25))
+#loc105 = loc("tmp5"(#loc26))
+#loc106 = loc("tmp5"(#loc27))
+#loc107 = loc("tmp5"(#loc28))
+#loc108 = loc("tmp5"(#loc29))
+#loc109 = loc("tmp7"(#loc30))
+#loc110 = loc("tmp7"(#loc31))
+#loc111 = loc("tmp7"(#loc32))
+#loc112 = loc("tmp7"(#loc33))
+#loc113 = loc("tmp9"(#loc34))
+#loc114 = loc("tmp11"(#loc35))
+#loc115 = loc("tmp12"(#loc36))
+#loc116 = loc("tmp13"(#loc37))
+#loc117 = loc("tmp14"(#loc38))
+#loc118 = loc("tmp14"(#loc39))
+#loc119 = loc("tmp14"(#loc40))
+#loc120 = loc("tmp16"(#loc41))
+#loc121 = loc("tmp19"(#loc42))
+#loc122 = loc("tmp20"(#loc43))
+#loc123 = loc("tmp23"(#loc44))
+#loc124 = loc("tmp23"(#loc45))
+#loc125 = loc("tmp23"(#loc46))
+#loc126 = loc("tmp23"(#loc47))
+#loc127 = loc("tmp23"(#loc48))
+#loc128 = loc("tmp23"(#loc49))
+#loc129 = loc("tmp23"(#loc50))
+#loc130 = loc("tmp23"(#loc51))
+#loc131 = loc("tmp25"(#loc52))
+#loc132 = loc("tmp25"(#loc53))
+#loc133 = loc("tmp25"(#loc54))
+#loc134 = loc("tmp25"(#loc55))
+#loc135 = loc("tmp27"(#loc56))
+#loc136 = loc("tmp29"(#loc57))
+#loc137 = loc("tmp30"(#loc58))
+#loc138 = loc("tmp31"(#loc59))
+#loc139 = loc("tmp32"(#loc60))
+#loc140 = loc("tmp32"(#loc61))
+#loc141 = loc("tmp32"(#loc62))
+#loc142 = loc("tmp34"(#loc63))
+#loc143 = loc("tmp37"(#loc64))
+#loc144 = loc("tmp38"(#loc65))
diff --git a/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..e22017651a3036bf0f04930d296f26ec643ab728
--- /dev/null
+++ b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/__grp__triton_poi_fused__fused_rms_norm_cat_view_2.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused__fused_rms_norm_cat_view_2.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.source", "triton_poi_fused__fused_rms_norm_cat_view_2.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttir", "triton_poi_fused__fused_rms_norm_cat_view_2.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir", "triton_poi_fused__fused_rms_norm_cat_view_2.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.llir", "triton_poi_fused__fused_rms_norm_cat_view_2.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.ptx", "triton_poi_fused__fused_rms_norm_cat_view_2.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.cubin", "triton_poi_fused__fused_rms_norm_cat_view_2.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.json"}}
\ No newline at end of file
diff --git a/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.cubin b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..95e3feb2b7bdda770a4870af710475a3948cc107
Binary files /dev/null and b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.cubin differ
diff --git a/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.json b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..07e0d71041b3544e9f92b9122350fd767b89530a
--- /dev/null
+++ b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.json
@@ -0,0 +1 @@
+{"hash": "aa70639f48f1d7d1f0b2054f16f663ad1882039e7134792fe6b6f1f467575fe7", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 16384, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__fused_rms_norm_cat_view_2"}
\ No newline at end of file
diff --git a/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.llir b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.llir
new file mode 100644
index 0000000000000000000000000000000000000000..caaaf465ea07ddd0e7ac483af4ab4cab285b2271
--- /dev/null
+++ b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.llir
@@ -0,0 +1,1346 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external addrspace(3) global [0 x i8], align 16
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused__fused_rms_norm_cat_view_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 {
+  %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !dbg !8
+  %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z(), !dbg !9
+  %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y(), !dbg !10
+  %15 = mul nuw i32 %13, %14, !dbg !11
+  %16 = add nuw i32 %15, %12, !dbg !12
+  %17 = shl i32 %16, 5, !dbg !13
+  %18 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !14
+  %19 = lshr i32 %18, 4, !dbg !14
+  %20 = and i32 %19, 15, !dbg !14
+  %21 = and i32 %18, 7, !dbg !14
+  %22 = shl nuw nsw i32 %21, 2, !dbg !14
+  %23 = or disjoint i32 %17, %20, !dbg !15
+  %24 = or disjoint i32 %23, 16, !dbg !15
+  %25 = or disjoint i32 %17, %22, !dbg !15
+  %26 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !16
+  %27 = shl i32 %26, 7, !dbg !17
+  %28 = and i32 %18, 15, !dbg !18
+  %29 = shl nuw nsw i32 %28, 3, !dbg !18
+  %30 = lshr i32 %18, 3, !dbg !18
+  %31 = and i32 %30, 31, !dbg !18
+  %32 = or disjoint i32 %29, %27, !dbg !19
+  %33 = or disjoint i32 %31, %27, !dbg !19
+  %34 = icmp slt i32 %32, 128, !dbg !20
+  %35 = icmp slt i32 %33, 128, !dbg !20
+  %36 = sdiv i32 %23, 32, !dbg !21
+  %37 = sdiv i32 %25, 32, !dbg !21
+  %38 = mul i32 %36, 32, !dbg !22
+  %.decomposed = sub i32 %23, %38, !dbg !22
+  %39 = srem i32 %24, 32, !dbg !22
+  %40 = mul i32 %37, 32, !dbg !22
+  %.decomposed109 = sub i32 %25, %40, !dbg !22
+  %41 = icmp slt i32 %23, 8192, !dbg !23
+  %42 = icmp slt i32 %25, 8192, !dbg !23
+  %43 = shl nsw i32 %.decomposed, 7, !dbg !24
+  %44 = shl nsw i32 %39, 7, !dbg !24
+  %45 = add i32 %43, %32, !dbg !25
+  %46 = add i32 %44, %32, !dbg !25
+  %47 = mul i32 %36, 12288, !dbg !26
+  %48 = add i32 %45, %47, !dbg !27
+  %49 = add i32 %46, %47, !dbg !27
+  %50 = sext i32 %48 to i64, !dbg !28
+  %51 = getelementptr bfloat, ptr addrspace(1) %0, i64 %50, !dbg !28
+  %52 = sext i32 %49 to i64, !dbg !28
+  %53 = getelementptr bfloat, ptr addrspace(1) %0, i64 %52, !dbg !28
+  %54 = and i1 %34, %41, !dbg !29
+  %55 = and i1 %35, %42, !dbg !29
+  %56 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !30
+  %57 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %51, i64 %56, i1 %54) #6, !dbg !30
+  %58 = extractvalue { i32, i32, i32, i32 } %57, 0, !dbg !30
+  %59 = extractvalue { i32, i32, i32, i32 } %57, 1, !dbg !30
+  %60 = extractvalue { i32, i32, i32, i32 } %57, 2, !dbg !30
+  %61 = extractvalue { i32, i32, i32, i32 } %57, 3, !dbg !30
+  %62 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !30
+  %63 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %53, i64 %62, i1 %54) #6, !dbg !30
+  %64 = extractvalue { i32, i32, i32, i32 } %63, 0, !dbg !30
+  %65 = extractvalue { i32, i32, i32, i32 } %63, 1, !dbg !30
+  %66 = extractvalue { i32, i32, i32, i32 } %63, 2, !dbg !30
+  %67 = extractvalue { i32, i32, i32, i32 } %63, 3, !dbg !30
+  %68 = insertelement <2 x i32> poison, i32 %58, i64 0, !dbg !30
+  %69 = insertelement <2 x i32> %68, i32 %64, i64 1, !dbg !30
+  %70 = lshr <2 x i32> %69, splat (i32 16), !dbg !30
+  %71 = trunc nuw <2 x i32> %70 to <2 x i16>, !dbg !30
+  %72 = insertelement <2 x i32> poison, i32 %59, i64 0, !dbg !30
+  %73 = insertelement <2 x i32> %72, i32 %65, i64 1, !dbg !30
+  %74 = lshr <2 x i32> %73, splat (i32 16), !dbg !30
+  %75 = trunc nuw <2 x i32> %74 to <2 x i16>, !dbg !30
+  %76 = insertelement <2 x i32> poison, i32 %60, i64 0, !dbg !30
+  %77 = insertelement <2 x i32> %76, i32 %66, i64 1, !dbg !30
+  %78 = lshr <2 x i32> %77, splat (i32 16), !dbg !30
+  %79 = trunc nuw <2 x i32> %78 to <2 x i16>, !dbg !30
+  %80 = insertelement <2 x i32> poison, i32 %61, i64 0, !dbg !30
+  %81 = insertelement <2 x i32> %80, i32 %67, i64 1, !dbg !30
+  %82 = lshr <2 x i32> %81, splat (i32 16), !dbg !30
+  %83 = trunc nuw <2 x i32> %82 to <2 x i16>, !dbg !30
+  %84 = and i32 %18, 192, !dbg !31
+  %85 = shl nuw nsw i32 %84, 5, !dbg !31
+  %86 = shl nuw nsw i32 %21, 4, !dbg !31
+  %87 = lshr exact i32 %84, 1, !dbg !31
+  %88 = shl nuw nsw i32 %18, 6, !dbg !31
+  %89 = and i32 %88, 512, !dbg !31
+  %90 = and i32 %18, 16, !dbg !31
+  %91 = icmp eq i32 %90, 0, !dbg !31
+  %92 = select i1 %91, i32 0, i32 1040, !dbg !31
+  %93 = and i32 %18, 32, !dbg !31
+  %94 = shl nuw nsw i32 %93, 2, !dbg !31
+  %95 = or disjoint i32 %85, %86, !dbg !31
+  %96 = or disjoint i32 %92, %87, !dbg !31
+  %97 = xor i32 %96, %95, !dbg !31
+  %98 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %89, !dbg !31
+  %99 = getelementptr inbounds nuw i8, ptr addrspace(3) %98, i32 %97, !dbg !31
+  %100 = getelementptr inbounds nuw i8, ptr addrspace(3) %99, i32 %94, !dbg !31
+  %101 = trunc i32 %58 to i16, !dbg !31
+  %102 = trunc i32 %64 to i16, !dbg !31
+  %103 = trunc i32 %59 to i16, !dbg !31
+  %104 = trunc i32 %65 to i16, !dbg !31
+  %105 = insertelement <2 x i16> poison, i16 %101, i64 0, !dbg !31
+  %106 = insertelement <2 x i16> %105, i16 %102, i64 1, !dbg !31
+  %107 = bitcast <2 x i16> %106 to i32, !dbg !31
+  %108 = bitcast <2 x i16> %71 to i32, !dbg !31
+  %109 = insertelement <2 x i16> poison, i16 %103, i64 0, !dbg !31
+  %110 = insertelement <2 x i16> %109, i16 %104, i64 1, !dbg !31
+  %111 = bitcast <2 x i16> %110 to i32, !dbg !31
+  %112 = bitcast <2 x i16> %75 to i32, !dbg !31
+  %113 = insertelement <4 x i32> poison, i32 %107, i64 0, !dbg !31
+  %114 = insertelement <4 x i32> %113, i32 %108, i64 1, !dbg !31
+  %115 = insertelement <4 x i32> %114, i32 %111, i64 2, !dbg !31
+  %116 = insertelement <4 x i32> %115, i32 %112, i64 3, !dbg !31
+  store <4 x i32> %116, ptr addrspace(3) %100, align 16, !dbg !31
+  %117 = getelementptr inbounds nuw i8, ptr addrspace(3) %100, i32 256, !dbg !31
+  %118 = trunc i32 %60 to i16, !dbg !31
+  %119 = trunc i32 %66 to i16, !dbg !31
+  %120 = trunc i32 %61 to i16, !dbg !31
+  %121 = trunc i32 %67 to i16, !dbg !31
+  %122 = insertelement <2 x i16> poison, i16 %118, i64 0, !dbg !31
+  %123 = insertelement <2 x i16> %122, i16 %119, i64 1, !dbg !31
+  %124 = bitcast <2 x i16> %123 to i32, !dbg !31
+  %125 = bitcast <2 x i16> %79 to i32, !dbg !31
+  %126 = insertelement <2 x i16> poison, i16 %120, i64 0, !dbg !31
+  %127 = insertelement <2 x i16> %126, i16 %121, i64 1, !dbg !31
+  %128 = bitcast <2 x i16> %127 to i32, !dbg !31
+  %129 = bitcast <2 x i16> %83 to i32, !dbg !31
+  %130 = insertelement <4 x i32> poison, i32 %124, i64 0, !dbg !31
+  %131 = insertelement <4 x i32> %130, i32 %125, i64 1, !dbg !31
+  %132 = insertelement <4 x i32> %131, i32 %128, i64 2, !dbg !31
+  %133 = insertelement <4 x i32> %132, i32 %129, i64 3, !dbg !31
+  store <4 x i32> %133, ptr addrspace(3) %117, align 16, !dbg !31
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !31
+  %134 = shl nuw nsw i32 %21, 10, !dbg !31
+  %135 = shl nuw nsw i32 %28, 4, !dbg !31
+  %136 = lshr exact i32 %84, 2, !dbg !31
+  %137 = shl nuw nsw i32 %90, 2, !dbg !31
+  %138 = shl nuw nsw i32 %93, 3, !dbg !31
+  %139 = xor i32 %135, %136, !dbg !31
+  %140 = xor i32 %139, %137, !dbg !31
+  %141 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %134, !dbg !31
+  %142 = getelementptr inbounds nuw i8, ptr addrspace(3) %141, i32 %140, !dbg !31
+  %143 = getelementptr inbounds nuw i8, ptr addrspace(3) %142, i32 %138, !dbg !31
+  %144 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.trans.b16.p3(ptr addrspace(3) %143), !dbg !31
+  %145 = extractvalue { i32, i32, i32, i32 } %144, 0, !dbg !31
+  %146 = bitcast i32 %145 to <2 x bfloat>, !dbg !31
+  %147 = extractelement <2 x bfloat> %146, i64 0, !dbg !31
+  %148 = extractelement <2 x bfloat> %146, i64 1, !dbg !31
+  %149 = extractvalue { i32, i32, i32, i32 } %144, 1, !dbg !31
+  %150 = bitcast i32 %149 to <2 x bfloat>, !dbg !31
+  %151 = extractelement <2 x bfloat> %150, i64 0, !dbg !31
+  %152 = extractelement <2 x bfloat> %150, i64 1, !dbg !31
+  %153 = extractvalue { i32, i32, i32, i32 } %144, 2, !dbg !31
+  %154 = bitcast i32 %153 to <2 x bfloat>, !dbg !31
+  %155 = extractelement <2 x bfloat> %154, i64 0, !dbg !31
+  %156 = extractelement <2 x bfloat> %154, i64 1, !dbg !31
+  %157 = extractvalue { i32, i32, i32, i32 } %144, 3, !dbg !31
+  %158 = bitcast i32 %157 to <2 x bfloat>, !dbg !31
+  %159 = extractelement <2 x bfloat> %158, i64 0, !dbg !31
+  %160 = extractelement <2 x bfloat> %158, i64 1, !dbg !31
+  %161 = getelementptr inbounds nuw i8, ptr addrspace(3) %143, i32 512, !dbg !31
+  %162 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.trans.b16.p3(ptr addrspace(3) nonnull %161), !dbg !31
+  %163 = extractvalue { i32, i32, i32, i32 } %162, 0, !dbg !31
+  %164 = bitcast i32 %163 to <2 x bfloat>, !dbg !31
+  %165 = extractelement <2 x bfloat> %164, i64 0, !dbg !31
+  %166 = extractelement <2 x bfloat> %164, i64 1, !dbg !31
+  %167 = extractvalue { i32, i32, i32, i32 } %162, 1, !dbg !31
+  %168 = bitcast i32 %167 to <2 x bfloat>, !dbg !31
+  %169 = extractelement <2 x bfloat> %168, i64 0, !dbg !31
+  %170 = extractelement <2 x bfloat> %168, i64 1, !dbg !31
+  %171 = extractvalue { i32, i32, i32, i32 } %162, 2, !dbg !31
+  %172 = bitcast i32 %171 to <2 x bfloat>, !dbg !31
+  %173 = extractelement <2 x bfloat> %172, i64 0, !dbg !31
+  %174 = extractelement <2 x bfloat> %172, i64 1, !dbg !31
+  %175 = extractvalue { i32, i32, i32, i32 } %162, 3, !dbg !31
+  %176 = bitcast i32 %175 to <2 x bfloat>, !dbg !31
+  %177 = extractelement <2 x bfloat> %176, i64 0, !dbg !31
+  %178 = extractelement <2 x bfloat> %176, i64 1, !dbg !31
+  %179 = fpext bfloat %147 to float, !dbg !31
+  %180 = fpext bfloat %148 to float, !dbg !31
+  %181 = fpext bfloat %151 to float, !dbg !31
+  %182 = fpext bfloat %152 to float, !dbg !31
+  %183 = fpext bfloat %155 to float, !dbg !31
+  %184 = fpext bfloat %156 to float, !dbg !31
+  %185 = fpext bfloat %159 to float, !dbg !31
+  %186 = fpext bfloat %160 to float, !dbg !31
+  %187 = fpext bfloat %165 to float, !dbg !31
+  %188 = fpext bfloat %166 to float, !dbg !31
+  %189 = fpext bfloat %169 to float, !dbg !31
+  %190 = fpext bfloat %170 to float, !dbg !31
+  %191 = fpext bfloat %173 to float, !dbg !31
+  %192 = fpext bfloat %174 to float, !dbg !31
+  %193 = fpext bfloat %177 to float, !dbg !31
+  %194 = fpext bfloat %178 to float, !dbg !31
+  %195 = sext i32 %25 to i64, !dbg !32
+  %196 = getelementptr float, ptr addrspace(1) %1, i64 %195, !dbg !32
+  %197 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !33
+  %198 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %196, i64 %197, i1 %55) #6, !dbg !33
+  %199 = extractvalue { i32, i32, i32, i32 } %198, 0, !dbg !33
+  %200 = extractvalue { i32, i32, i32, i32 } %198, 1, !dbg !33
+  %201 = extractvalue { i32, i32, i32, i32 } %198, 2, !dbg !33
+  %202 = extractvalue { i32, i32, i32, i32 } %198, 3, !dbg !33
+  %203 = bitcast i32 %199 to float, !dbg !33
+  %204 = bitcast i32 %200 to float, !dbg !33
+  %205 = bitcast i32 %201 to float, !dbg !33
+  %206 = bitcast i32 %202 to float, !dbg !33
+  %207 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !33
+  %208 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %196, i64 %207, i1 %55) #6, !dbg !33
+  %209 = extractvalue { i32, i32, i32, i32 } %208, 0, !dbg !33
+  %210 = extractvalue { i32, i32, i32, i32 } %208, 1, !dbg !33
+  %211 = extractvalue { i32, i32, i32, i32 } %208, 2, !dbg !33
+  %212 = extractvalue { i32, i32, i32, i32 } %208, 3, !dbg !33
+  %213 = bitcast i32 %209 to float, !dbg !33
+  %214 = bitcast i32 %210 to float, !dbg !33
+  %215 = bitcast i32 %211 to float, !dbg !33
+  %216 = bitcast i32 %212 to float, !dbg !33
+  %217 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !33
+  %218 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %196, i64 %217, i1 %55) #6, !dbg !33
+  %219 = extractvalue { i32, i32, i32, i32 } %218, 0, !dbg !33
+  %220 = extractvalue { i32, i32, i32, i32 } %218, 1, !dbg !33
+  %221 = extractvalue { i32, i32, i32, i32 } %218, 2, !dbg !33
+  %222 = extractvalue { i32, i32, i32, i32 } %218, 3, !dbg !33
+  %223 = bitcast i32 %219 to float, !dbg !33
+  %224 = bitcast i32 %220 to float, !dbg !33
+  %225 = bitcast i32 %221 to float, !dbg !33
+  %226 = bitcast i32 %222 to float, !dbg !33
+  %227 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !33
+  %228 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %196, i64 %227, i1 %55) #6, !dbg !33
+  %229 = extractvalue { i32, i32, i32, i32 } %228, 0, !dbg !33
+  %230 = extractvalue { i32, i32, i32, i32 } %228, 1, !dbg !33
+  %231 = extractvalue { i32, i32, i32, i32 } %228, 2, !dbg !33
+  %232 = extractvalue { i32, i32, i32, i32 } %228, 3, !dbg !33
+  %233 = bitcast i32 %229 to float, !dbg !33
+  %234 = bitcast i32 %230 to float, !dbg !33
+  %235 = bitcast i32 %231 to float, !dbg !33
+  %236 = bitcast i32 %232 to float, !dbg !33
+  %237 = tail call float @llvm.nvvm.div.full(float %203, float 1.280000e+02), !dbg !34
+  %238 = tail call float @llvm.nvvm.div.full(float %204, float 1.280000e+02), !dbg !34
+  %239 = tail call float @llvm.nvvm.div.full(float %205, float 1.280000e+02), !dbg !34
+  %240 = tail call float @llvm.nvvm.div.full(float %206, float 1.280000e+02), !dbg !34
+  %241 = tail call float @llvm.nvvm.div.full(float %213, float 1.280000e+02), !dbg !34
+  %242 = tail call float @llvm.nvvm.div.full(float %214, float 1.280000e+02), !dbg !34
+  %243 = tail call float @llvm.nvvm.div.full(float %215, float 1.280000e+02), !dbg !34
+  %244 = tail call float @llvm.nvvm.div.full(float %216, float 1.280000e+02), !dbg !34
+  %245 = tail call float @llvm.nvvm.div.full(float %223, float 1.280000e+02), !dbg !34
+  %246 = tail call float @llvm.nvvm.div.full(float %224, float 1.280000e+02), !dbg !34
+  %247 = tail call float @llvm.nvvm.div.full(float %225, float 1.280000e+02), !dbg !34
+  %248 = tail call float @llvm.nvvm.div.full(float %226, float 1.280000e+02), !dbg !34
+  %249 = tail call float @llvm.nvvm.div.full(float %233, float 1.280000e+02), !dbg !34
+  %250 = tail call float @llvm.nvvm.div.full(float %234, float 1.280000e+02), !dbg !34
+  %251 = tail call float @llvm.nvvm.div.full(float %235, float 1.280000e+02), !dbg !34
+  %252 = tail call float @llvm.nvvm.div.full(float %236, float 1.280000e+02), !dbg !34
+  %253 = fadd float %237, 0x3EB0C6F7A0000000, !dbg !35
+  %254 = fadd float %238, 0x3EB0C6F7A0000000, !dbg !35
+  %255 = fadd float %239, 0x3EB0C6F7A0000000, !dbg !35
+  %256 = fadd float %240, 0x3EB0C6F7A0000000, !dbg !35
+  %257 = fadd float %241, 0x3EB0C6F7A0000000, !dbg !35
+  %258 = fadd float %242, 0x3EB0C6F7A0000000, !dbg !35
+  %259 = fadd float %243, 0x3EB0C6F7A0000000, !dbg !35
+  %260 = fadd float %244, 0x3EB0C6F7A0000000, !dbg !35
+  %261 = fadd float %245, 0x3EB0C6F7A0000000, !dbg !35
+  %262 = fadd float %246, 0x3EB0C6F7A0000000, !dbg !35
+  %263 = fadd float %247, 0x3EB0C6F7A0000000, !dbg !35
+  %264 = fadd float %248, 0x3EB0C6F7A0000000, !dbg !35
+  %265 = fadd float %249, 0x3EB0C6F7A0000000, !dbg !35
+  %266 = fadd float %250, 0x3EB0C6F7A0000000, !dbg !35
+  %267 = fadd float %251, 0x3EB0C6F7A0000000, !dbg !35
+  %268 = fadd float %252, 0x3EB0C6F7A0000000, !dbg !35
+  %269 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36
+  %.not.i = icmp eq i32 %269, 0, !dbg !36
+  br i1 %.not.i, label %272, label %270, !dbg !36
+
+270:                                              ; preds = %11
+  %271 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %253), !dbg !36
+  br label %__nv_rsqrtf.exit, !dbg !36
+
+272:                                              ; preds = %11
+  %273 = tail call float @llvm.nvvm.rsqrt.approx.f(float %253), !dbg !36
+  br label %__nv_rsqrtf.exit, !dbg !36
+
+__nv_rsqrtf.exit:                                 ; preds = %270, %272
+  %.0.i = phi float [ %271, %270 ], [ %273, %272 ], !dbg !36
+  %274 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36
+  %.not.i16 = icmp eq i32 %274, 0, !dbg !36
+  br i1 %.not.i16, label %277, label %275, !dbg !36
+
+275:                                              ; preds = %__nv_rsqrtf.exit
+  %276 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %254), !dbg !36
+  br label %__nv_rsqrtf.exit18, !dbg !36
+
+277:                                              ; preds = %__nv_rsqrtf.exit
+  %278 = tail call float @llvm.nvvm.rsqrt.approx.f(float %254), !dbg !36
+  br label %__nv_rsqrtf.exit18, !dbg !36
+
+__nv_rsqrtf.exit18:                               ; preds = %275, %277
+  %.0.i17 = phi float [ %276, %275 ], [ %278, %277 ], !dbg !36
+  %279 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36
+  %.not.i19 = icmp eq i32 %279, 0, !dbg !36
+  br i1 %.not.i19, label %282, label %280, !dbg !36
+
+280:                                              ; preds = %__nv_rsqrtf.exit18
+  %281 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %255), !dbg !36
+  br label %__nv_rsqrtf.exit21, !dbg !36
+
+282:                                              ; preds = %__nv_rsqrtf.exit18
+  %283 = tail call float @llvm.nvvm.rsqrt.approx.f(float %255), !dbg !36
+  br label %__nv_rsqrtf.exit21, !dbg !36
+
+__nv_rsqrtf.exit21:                               ; preds = %280, %282
+  %.0.i20 = phi float [ %281, %280 ], [ %283, %282 ], !dbg !36
+  %284 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36
+  %.not.i22 = icmp eq i32 %284, 0, !dbg !36
+  br i1 %.not.i22, label %287, label %285, !dbg !36
+
+285:                                              ; preds = %__nv_rsqrtf.exit21
+  %286 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %256), !dbg !36
+  br label %__nv_rsqrtf.exit24, !dbg !36
+
+287:                                              ; preds = %__nv_rsqrtf.exit21
+  %288 = tail call float @llvm.nvvm.rsqrt.approx.f(float %256), !dbg !36
+  br label %__nv_rsqrtf.exit24, !dbg !36
+
+__nv_rsqrtf.exit24:                               ; preds = %285, %287
+  %.0.i23 = phi float [ %286, %285 ], [ %288, %287 ], !dbg !36
+  %289 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36
+  %.not.i25 = icmp eq i32 %289, 0, !dbg !36
+  br i1 %.not.i25, label %292, label %290, !dbg !36
+
+290:                                              ; preds = %__nv_rsqrtf.exit24
+  %291 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %257), !dbg !36
+  br label %__nv_rsqrtf.exit27, !dbg !36
+
+292:                                              ; preds = %__nv_rsqrtf.exit24
+  %293 = tail call float @llvm.nvvm.rsqrt.approx.f(float %257), !dbg !36
+  br label %__nv_rsqrtf.exit27, !dbg !36
+
+__nv_rsqrtf.exit27:                               ; preds = %290, %292
+  %.0.i26 = phi float [ %291, %290 ], [ %293, %292 ], !dbg !36
+  %294 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36
+  %.not.i28 = icmp eq i32 %294, 0, !dbg !36
+  br i1 %.not.i28, label %297, label %295, !dbg !36
+
+295:                                              ; preds = %__nv_rsqrtf.exit27
+  %296 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %258), !dbg !36
+  br label %__nv_rsqrtf.exit30, !dbg !36
+
+297:                                              ; preds = %__nv_rsqrtf.exit27
+  %298 = tail call float @llvm.nvvm.rsqrt.approx.f(float %258), !dbg !36
+  br label %__nv_rsqrtf.exit30, !dbg !36
+
+__nv_rsqrtf.exit30:                               ; preds = %295, %297
+  %.0.i29 = phi float [ %296, %295 ], [ %298, %297 ], !dbg !36
+  %299 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36
+  %.not.i31 = icmp eq i32 %299, 0, !dbg !36
+  br i1 %.not.i31, label %302, label %300, !dbg !36
+
+300:                                              ; preds = %__nv_rsqrtf.exit30
+  %301 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %259), !dbg !36
+  br label %__nv_rsqrtf.exit33, !dbg !36
+
+302:                                              ; preds = %__nv_rsqrtf.exit30
+  %303 = tail call float @llvm.nvvm.rsqrt.approx.f(float %259), !dbg !36
+  br label %__nv_rsqrtf.exit33, !dbg !36
+
+__nv_rsqrtf.exit33:                               ; preds = %300, %302
+  %.0.i32 = phi float [ %301, %300 ], [ %303, %302 ], !dbg !36
+  %304 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36
+  %.not.i34 = icmp eq i32 %304, 0, !dbg !36
+  br i1 %.not.i34, label %307, label %305, !dbg !36
+
+305:                                              ; preds = %__nv_rsqrtf.exit33
+  %306 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %260), !dbg !36
+  br label %__nv_rsqrtf.exit36, !dbg !36
+
+307:                                              ; preds = %__nv_rsqrtf.exit33
+  %308 = tail call float @llvm.nvvm.rsqrt.approx.f(float %260), !dbg !36
+  br label %__nv_rsqrtf.exit36, !dbg !36
+
+__nv_rsqrtf.exit36:                               ; preds = %305, %307
+  %.0.i35 = phi float [ %306, %305 ], [ %308, %307 ], !dbg !36
+  %309 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36
+  %.not.i37 = icmp eq i32 %309, 0, !dbg !36
+  br i1 %.not.i37, label %312, label %310, !dbg !36
+
+310:                                              ; preds = %__nv_rsqrtf.exit36
+  %311 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %261), !dbg !36
+  br label %__nv_rsqrtf.exit39, !dbg !36
+
+312:                                              ; preds = %__nv_rsqrtf.exit36
+  %313 = tail call float @llvm.nvvm.rsqrt.approx.f(float %261), !dbg !36
+  br label %__nv_rsqrtf.exit39, !dbg !36
+
+__nv_rsqrtf.exit39:                               ; preds = %310, %312
+  %.0.i38 = phi float [ %311, %310 ], [ %313, %312 ], !dbg !36
+  %314 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36
+  %.not.i40 = icmp eq i32 %314, 0, !dbg !36
+  br i1 %.not.i40, label %317, label %315, !dbg !36
+
+315:                                              ; preds = %__nv_rsqrtf.exit39
+  %316 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %262), !dbg !36
+  br label %__nv_rsqrtf.exit42, !dbg !36
+
+317:                                              ; preds = %__nv_rsqrtf.exit39
+  %318 = tail call float @llvm.nvvm.rsqrt.approx.f(float %262), !dbg !36
+  br label %__nv_rsqrtf.exit42, !dbg !36
+
+__nv_rsqrtf.exit42:                               ; preds = %315, %317
+  %.0.i41 = phi float [ %316, %315 ], [ %318, %317 ], !dbg !36
+  %319 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36
+  %.not.i43 = icmp eq i32 %319, 0, !dbg !36
+  br i1 %.not.i43, label %322, label %320, !dbg !36
+
+320:                                              ; preds = %__nv_rsqrtf.exit42
+  %321 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %263), !dbg !36
+  br label %__nv_rsqrtf.exit45, !dbg !36
+
+322:                                              ; preds = %__nv_rsqrtf.exit42
+  %323 = tail call float @llvm.nvvm.rsqrt.approx.f(float %263), !dbg !36
+  br label %__nv_rsqrtf.exit45, !dbg !36
+
+__nv_rsqrtf.exit45:                               ; preds = %320, %322
+  %.0.i44 = phi float [ %321, %320 ], [ %323, %322 ], !dbg !36
+  %324 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36
+  %.not.i46 = icmp eq i32 %324, 0, !dbg !36
+  br i1 %.not.i46, label %327, label %325, !dbg !36
+
+325:                                              ; preds = %__nv_rsqrtf.exit45
+  %326 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %264), !dbg !36
+  br label %__nv_rsqrtf.exit48, !dbg !36
+
+327:                                              ; preds = %__nv_rsqrtf.exit45
+  %328 = tail call float @llvm.nvvm.rsqrt.approx.f(float %264), !dbg !36
+  br label %__nv_rsqrtf.exit48, !dbg !36
+
+__nv_rsqrtf.exit48:                               ; preds = %325, %327
+  %.0.i47 = phi float [ %326, %325 ], [ %328, %327 ], !dbg !36
+  %329 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36
+  %.not.i49 = icmp eq i32 %329, 0, !dbg !36
+  br i1 %.not.i49, label %332, label %330, !dbg !36
+
+330:                                              ; preds = %__nv_rsqrtf.exit48
+  %331 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %265), !dbg !36
+  br label %__nv_rsqrtf.exit51, !dbg !36
+
+332:                                              ; preds = %__nv_rsqrtf.exit48
+  %333 = tail call float @llvm.nvvm.rsqrt.approx.f(float %265), !dbg !36
+  br label %__nv_rsqrtf.exit51, !dbg !36
+
+__nv_rsqrtf.exit51:                               ; preds = %330, %332
+  %.0.i50 = phi float [ %331, %330 ], [ %333, %332 ], !dbg !36
+  %334 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36
+  %.not.i52 = icmp eq i32 %334, 0, !dbg !36
+  br i1 %.not.i52, label %337, label %335, !dbg !36
+
+335:                                              ; preds = %__nv_rsqrtf.exit51
+  %336 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %266), !dbg !36
+  br label %__nv_rsqrtf.exit54, !dbg !36
+
+337:                                              ; preds = %__nv_rsqrtf.exit51
+  %338 = tail call float @llvm.nvvm.rsqrt.approx.f(float %266), !dbg !36
+  br label %__nv_rsqrtf.exit54, !dbg !36
+
+__nv_rsqrtf.exit54:                               ; preds = %335, %337
+  %.0.i53 = phi float [ %336, %335 ], [ %338, %337 ], !dbg !36
+  %339 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36
+  %.not.i55 = icmp eq i32 %339, 0, !dbg !36
+  br i1 %.not.i55, label %342, label %340, !dbg !36
+
+340:                                              ; preds = %__nv_rsqrtf.exit54
+  %341 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %267), !dbg !36
+  br label %__nv_rsqrtf.exit57, !dbg !36
+
+342:                                              ; preds = %__nv_rsqrtf.exit54
+  %343 = tail call float @llvm.nvvm.rsqrt.approx.f(float %267), !dbg !36
+  br label %__nv_rsqrtf.exit57, !dbg !36
+
+__nv_rsqrtf.exit57:                               ; preds = %340, %342
+  %.0.i56 = phi float [ %341, %340 ], [ %343, %342 ], !dbg !36
+  %344 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36
+  %.not.i58 = icmp eq i32 %344, 0, !dbg !36
+  br i1 %.not.i58, label %347, label %345, !dbg !36
+
+345:                                              ; preds = %__nv_rsqrtf.exit57
+  %346 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %268), !dbg !36
+  br label %__nv_rsqrtf.exit60, !dbg !36
+
+347:                                              ; preds = %__nv_rsqrtf.exit57
+  %348 = tail call float @llvm.nvvm.rsqrt.approx.f(float %268), !dbg !36
+  br label %__nv_rsqrtf.exit60, !dbg !36
+
+__nv_rsqrtf.exit60:                               ; preds = %345, %347
+  %.0.i59 = phi float [ %346, %345 ], [ %348, %347 ], !dbg !36
+  %349 = fmul float %.0.i, %179, !dbg !37
+  %350 = fmul float %.0.i17, %180, !dbg !37
+  %351 = fmul float %.0.i20, %181, !dbg !37
+  %352 = fmul float %.0.i23, %182, !dbg !37
+  %353 = fmul float %.0.i26, %183, !dbg !37
+  %354 = fmul float %.0.i29, %184, !dbg !37
+  %355 = fmul float %.0.i32, %185, !dbg !37
+  %356 = fmul float %.0.i35, %186, !dbg !37
+  %357 = fmul float %.0.i38, %187, !dbg !37
+  %358 = fmul float %.0.i41, %188, !dbg !37
+  %359 = fmul float %.0.i44, %189, !dbg !37
+  %360 = fmul float %.0.i47, %190, !dbg !37
+  %361 = fmul float %.0.i50, %191, !dbg !37
+  %362 = fmul float %.0.i53, %192, !dbg !37
+  %363 = fmul float %.0.i56, %193, !dbg !37
+  %364 = fmul float %.0.i59, %194, !dbg !37
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37
+  %365 = shl nuw nsw i32 %18, 9, !dbg !37
+  %366 = and i32 %365, 15360, !dbg !37
+  %367 = lshr i32 %18, 1, !dbg !37
+  %368 = and i32 %367, 108, !dbg !37
+  %369 = or disjoint i32 %366, %86, !dbg !37
+  %370 = xor i32 %369, %368, !dbg !37
+  %371 = or disjoint i32 %370, %138, !dbg !37
+  %372 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %371, !dbg !37
+  store float %349, ptr addrspace(3) %372, align 4, !dbg !37
+  %373 = getelementptr inbounds nuw i8, ptr addrspace(3) %372, i32 128, !dbg !37
+  store float %351, ptr addrspace(3) %373, align 4, !dbg !37
+  %374 = xor i32 %371, 528, !dbg !37
+  %375 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %374, !dbg !37
+  store float %350, ptr addrspace(3) %375, align 4, !dbg !37
+  %376 = getelementptr inbounds nuw i8, ptr addrspace(3) %375, i32 128, !dbg !37
+  store float %352, ptr addrspace(3) %376, align 4, !dbg !37
+  %377 = xor i32 %371, 4, !dbg !37
+  %378 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %377, !dbg !37
+  store float %353, ptr addrspace(3) %378, align 4, !dbg !37
+  %379 = getelementptr inbounds nuw i8, ptr addrspace(3) %378, i32 128, !dbg !37
+  store float %355, ptr addrspace(3) %379, align 4, !dbg !37
+  %380 = xor i32 %371, 532, !dbg !37
+  %381 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %380, !dbg !37
+  store float %354, ptr addrspace(3) %381, align 4, !dbg !37
+  %382 = getelementptr inbounds nuw i8, ptr addrspace(3) %381, i32 128, !dbg !37
+  store float %356, ptr addrspace(3) %382, align 4, !dbg !37
+  %383 = xor i32 %371, 8, !dbg !37
+  %384 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %383, !dbg !37
+  store float %357, ptr addrspace(3) %384, align 4, !dbg !37
+  %385 = getelementptr inbounds nuw i8, ptr addrspace(3) %384, i32 128, !dbg !37
+  store float %359, ptr addrspace(3) %385, align 4, !dbg !37
+  %386 = xor i32 %371, 536, !dbg !37
+  %387 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %386, !dbg !37
+  store float %358, ptr addrspace(3) %387, align 4, !dbg !37
+  %388 = getelementptr inbounds nuw i8, ptr addrspace(3) %387, i32 128, !dbg !37
+  store float %360, ptr addrspace(3) %388, align 4, !dbg !37
+  %389 = xor i32 %371, 12, !dbg !37
+  %390 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %389, !dbg !37
+  store float %361, ptr addrspace(3) %390, align 4, !dbg !37
+  %391 = getelementptr inbounds nuw i8, ptr addrspace(3) %390, i32 128, !dbg !37
+  store float %363, ptr addrspace(3) %391, align 4, !dbg !37
+  %392 = xor i32 %371, 540, !dbg !37
+  %393 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %392, !dbg !37
+  store float %362, ptr addrspace(3) %393, align 4, !dbg !37
+  %394 = getelementptr inbounds nuw i8, ptr addrspace(3) %393, i32 128, !dbg !37
+  store float %364, ptr addrspace(3) %394, align 4, !dbg !37
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37
+  %395 = shl nuw nsw i32 %18, 5, !dbg !37
+  %396 = and i32 %395, 608, !dbg !37
+  %397 = and i32 %18, 28, !dbg !37
+  %398 = lshr i32 %18, 2, !dbg !37
+  %399 = and i32 %398, 16, !dbg !37
+  %400 = and i32 %18, 128, !dbg !37
+  %401 = icmp eq i32 %400, 0, !dbg !37
+  %402 = select i1 %401, i32 0, i32 1056, !dbg !37
+  %403 = or disjoint i32 %396, %397, !dbg !37
+  %404 = or disjoint i32 %402, %399, !dbg !37
+  %405 = xor i32 %404, %403, !dbg !37
+  %406 = or disjoint i32 %405, %94, !dbg !37
+  %407 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %406, !dbg !37
+  %408 = load float, ptr addrspace(3) %407, align 4, !dbg !37
+  %409 = getelementptr inbounds nuw i8, ptr addrspace(3) %407, i32 256, !dbg !37
+  %410 = load float, ptr addrspace(3) %409, align 4, !dbg !37
+  %411 = xor i32 %406, 4100, !dbg !37
+  %412 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %411, !dbg !37
+  %413 = load float, ptr addrspace(3) %412, align 4, !dbg !37
+  %414 = getelementptr inbounds nuw i8, ptr addrspace(3) %412, i32 256, !dbg !37
+  %415 = load float, ptr addrspace(3) %414, align 4, !dbg !37
+  %416 = xor i32 %406, 8200, !dbg !37
+  %417 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %416, !dbg !37
+  %418 = load float, ptr addrspace(3) %417, align 4, !dbg !37
+  %419 = getelementptr inbounds nuw i8, ptr addrspace(3) %417, i32 256, !dbg !37
+  %420 = load float, ptr addrspace(3) %419, align 4, !dbg !37
+  %421 = xor i32 %406, 12300, !dbg !37
+  %422 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %421, !dbg !37
+  %423 = load float, ptr addrspace(3) %422, align 4, !dbg !37
+  %424 = getelementptr inbounds nuw i8, ptr addrspace(3) %422, i32 256, !dbg !37
+  %425 = load float, ptr addrspace(3) %424, align 4, !dbg !37
+  %426 = xor i32 %406, 2112, !dbg !37
+  %427 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %426, !dbg !37
+  %428 = load float, ptr addrspace(3) %427, align 4, !dbg !37
+  %429 = getelementptr inbounds nuw i8, ptr addrspace(3) %427, i32 256, !dbg !37
+  %430 = load float, ptr addrspace(3) %429, align 4, !dbg !37
+  %431 = xor i32 %406, 6212, !dbg !37
+  %432 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %431, !dbg !37
+  %433 = load float, ptr addrspace(3) %432, align 4, !dbg !37
+  %434 = getelementptr inbounds nuw i8, ptr addrspace(3) %432, i32 256, !dbg !37
+  %435 = load float, ptr addrspace(3) %434, align 4, !dbg !37
+  %436 = xor i32 %406, 10312, !dbg !37
+  %437 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %436, !dbg !37
+  %438 = load float, ptr addrspace(3) %437, align 4, !dbg !37
+  %439 = getelementptr inbounds nuw i8, ptr addrspace(3) %437, i32 256, !dbg !37
+  %440 = load float, ptr addrspace(3) %439, align 4, !dbg !37
+  %441 = xor i32 %406, 14412, !dbg !37
+  %442 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %441, !dbg !37
+  %443 = load float, ptr addrspace(3) %442, align 4, !dbg !37
+  %444 = getelementptr inbounds nuw i8, ptr addrspace(3) %442, i32 256, !dbg !37
+  %445 = load float, ptr addrspace(3) %444, align 4, !dbg !37
+  %446 = sext i32 %32 to i64, !dbg !38
+  %447 = getelementptr bfloat, ptr addrspace(1) %2, i64 %446, !dbg !38
+  %448 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !39
+  %449 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %447, i64 %448, i1 %54) #6, !dbg !39
+  %450 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !39
+  %451 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %447, i64 %450, i1 %54) #6, !dbg !39
+  %452 = add i32 %47, -3145728, !dbg !40
+  %453 = add i32 %45, %452, !dbg !41
+  %454 = add i32 %46, %452, !dbg !41
+  %455 = sext i32 %453 to i64, !dbg !42
+  %456 = getelementptr bfloat, ptr addrspace(1) %3, i64 %455, !dbg !42
+  %457 = sext i32 %454 to i64, !dbg !42
+  %458 = getelementptr bfloat, ptr addrspace(1) %3, i64 %457, !dbg !42
+  %459 = add i32 %17, -8192, !dbg !43
+  %460 = icmp ult i32 %459, 65536, !dbg !43
+  %461 = and i1 %34, %460, !dbg !43
+  %462 = and i1 %35, %460, !dbg !43
+  %463 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !44
+  %464 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %456, i64 %463, i1 %461) #6, !dbg !44
+  %465 = extractvalue { i32, i32, i32, i32 } %464, 0, !dbg !44
+  %466 = extractvalue { i32, i32, i32, i32 } %464, 1, !dbg !44
+  %467 = extractvalue { i32, i32, i32, i32 } %464, 2, !dbg !44
+  %468 = extractvalue { i32, i32, i32, i32 } %464, 3, !dbg !44
+  %469 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !44
+  %470 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %458, i64 %469, i1 %461) #6, !dbg !44
+  %471 = extractvalue { i32, i32, i32, i32 } %470, 0, !dbg !44
+  %472 = extractvalue { i32, i32, i32, i32 } %470, 1, !dbg !44
+  %473 = extractvalue { i32, i32, i32, i32 } %470, 2, !dbg !44
+  %474 = extractvalue { i32, i32, i32, i32 } %470, 3, !dbg !44
+  %475 = insertelement <2 x i32> poison, i32 %465, i64 0, !dbg !44
+  %476 = insertelement <2 x i32> %475, i32 %471, i64 1, !dbg !44
+  %477 = lshr <2 x i32> %476, splat (i32 16), !dbg !44
+  %478 = trunc nuw <2 x i32> %477 to <2 x i16>, !dbg !44
+  %479 = insertelement <2 x i32> poison, i32 %466, i64 0, !dbg !44
+  %480 = insertelement <2 x i32> %479, i32 %472, i64 1, !dbg !44
+  %481 = lshr <2 x i32> %480, splat (i32 16), !dbg !44
+  %482 = trunc nuw <2 x i32> %481 to <2 x i16>, !dbg !44
+  %483 = insertelement <2 x i32> poison, i32 %467, i64 0, !dbg !44
+  %484 = insertelement <2 x i32> %483, i32 %473, i64 1, !dbg !44
+  %485 = lshr <2 x i32> %484, splat (i32 16), !dbg !44
+  %486 = trunc nuw <2 x i32> %485 to <2 x i16>, !dbg !44
+  %487 = insertelement <2 x i32> poison, i32 %468, i64 0, !dbg !44
+  %488 = insertelement <2 x i32> %487, i32 %474, i64 1, !dbg !44
+  %489 = lshr <2 x i32> %488, splat (i32 16), !dbg !44
+  %490 = trunc nuw <2 x i32> %489 to <2 x i16>, !dbg !44
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45
+  %491 = trunc i32 %465 to i16, !dbg !45
+  %492 = trunc i32 %471 to i16, !dbg !45
+  %493 = trunc i32 %466 to i16, !dbg !45
+  %494 = trunc i32 %472 to i16, !dbg !45
+  %495 = insertelement <2 x i16> poison, i16 %491, i64 0, !dbg !45
+  %496 = insertelement <2 x i16> %495, i16 %492, i64 1, !dbg !45
+  %497 = bitcast <2 x i16> %496 to i32, !dbg !45
+  %498 = bitcast <2 x i16> %478 to i32, !dbg !45
+  %499 = insertelement <2 x i16> poison, i16 %493, i64 0, !dbg !45
+  %500 = insertelement <2 x i16> %499, i16 %494, i64 1, !dbg !45
+  %501 = bitcast <2 x i16> %500 to i32, !dbg !45
+  %502 = bitcast <2 x i16> %482 to i32, !dbg !45
+  %503 = insertelement <4 x i32> poison, i32 %497, i64 0, !dbg !45
+  %504 = insertelement <4 x i32> %503, i32 %498, i64 1, !dbg !45
+  %505 = insertelement <4 x i32> %504, i32 %501, i64 2, !dbg !45
+  %506 = insertelement <4 x i32> %505, i32 %502, i64 3, !dbg !45
+  store <4 x i32> %506, ptr addrspace(3) %100, align 16, !dbg !45
+  %507 = trunc i32 %467 to i16, !dbg !45
+  %508 = trunc i32 %473 to i16, !dbg !45
+  %509 = trunc i32 %468 to i16, !dbg !45
+  %510 = trunc i32 %474 to i16, !dbg !45
+  %511 = insertelement <2 x i16> poison, i16 %507, i64 0, !dbg !45
+  %512 = insertelement <2 x i16> %511, i16 %508, i64 1, !dbg !45
+  %513 = bitcast <2 x i16> %512 to i32, !dbg !45
+  %514 = bitcast <2 x i16> %486 to i32, !dbg !45
+  %515 = insertelement <2 x i16> poison, i16 %509, i64 0, !dbg !45
+  %516 = insertelement <2 x i16> %515, i16 %510, i64 1, !dbg !45
+  %517 = bitcast <2 x i16> %516 to i32, !dbg !45
+  %518 = bitcast <2 x i16> %490 to i32, !dbg !45
+  %519 = insertelement <4 x i32> poison, i32 %513, i64 0, !dbg !45
+  %520 = insertelement <4 x i32> %519, i32 %514, i64 1, !dbg !45
+  %521 = insertelement <4 x i32> %520, i32 %517, i64 2, !dbg !45
+  %522 = insertelement <4 x i32> %521, i32 %518, i64 3, !dbg !45
+  store <4 x i32> %522, ptr addrspace(3) %117, align 16, !dbg !45
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45
+  %523 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.trans.b16.p3(ptr addrspace(3) %143), !dbg !45
+  %524 = tail call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.trans.b16.p3(ptr addrspace(3) nonnull %161), !dbg !45
+  %525 = shl nsw i32 %37, 5, !dbg !46
+  %526 = add nsw i32 %.decomposed109, -8192, !dbg !46
+  %527 = add i32 %526, %525, !dbg !47
+  %528 = sext i32 %527 to i64, !dbg !48
+  %529 = getelementptr float, ptr addrspace(1) %4, i64 %528, !dbg !48
+  %530 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !49
+  %531 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %529, i64 %530, i1 %462) #6, !dbg !49
+  %532 = extractvalue { i32, i32, i32, i32 } %531, 0, !dbg !49
+  %533 = extractvalue { i32, i32, i32, i32 } %531, 1, !dbg !49
+  %534 = extractvalue { i32, i32, i32, i32 } %531, 2, !dbg !49
+  %535 = extractvalue { i32, i32, i32, i32 } %531, 3, !dbg !49
+  %536 = bitcast i32 %532 to float, !dbg !49
+  %537 = bitcast i32 %533 to float, !dbg !49
+  %538 = bitcast i32 %534 to float, !dbg !49
+  %539 = bitcast i32 %535 to float, !dbg !49
+  %540 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !49
+  %541 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %529, i64 %540, i1 %462) #6, !dbg !49
+  %542 = extractvalue { i32, i32, i32, i32 } %541, 0, !dbg !49
+  %543 = extractvalue { i32, i32, i32, i32 } %541, 1, !dbg !49
+  %544 = extractvalue { i32, i32, i32, i32 } %541, 2, !dbg !49
+  %545 = extractvalue { i32, i32, i32, i32 } %541, 3, !dbg !49
+  %546 = bitcast i32 %542 to float, !dbg !49
+  %547 = bitcast i32 %543 to float, !dbg !49
+  %548 = bitcast i32 %544 to float, !dbg !49
+  %549 = bitcast i32 %545 to float, !dbg !49
+  %550 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !49
+  %551 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %529, i64 %550, i1 %462) #6, !dbg !49
+  %552 = extractvalue { i32, i32, i32, i32 } %551, 0, !dbg !49
+  %553 = extractvalue { i32, i32, i32, i32 } %551, 1, !dbg !49
+  %554 = extractvalue { i32, i32, i32, i32 } %551, 2, !dbg !49
+  %555 = extractvalue { i32, i32, i32, i32 } %551, 3, !dbg !49
+  %556 = bitcast i32 %552 to float, !dbg !49
+  %557 = bitcast i32 %553 to float, !dbg !49
+  %558 = bitcast i32 %554 to float, !dbg !49
+  %559 = bitcast i32 %555 to float, !dbg !49
+  %560 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !49
+  %561 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %529, i64 %560, i1 %462) #6, !dbg !49
+  %562 = extractvalue { i32, i32, i32, i32 } %561, 0, !dbg !49
+  %563 = extractvalue { i32, i32, i32, i32 } %561, 1, !dbg !49
+  %564 = extractvalue { i32, i32, i32, i32 } %561, 2, !dbg !49
+  %565 = extractvalue { i32, i32, i32, i32 } %561, 3, !dbg !49
+  %566 = bitcast i32 %562 to float, !dbg !49
+  %567 = bitcast i32 %563 to float, !dbg !49
+  %568 = bitcast i32 %564 to float, !dbg !49
+  %569 = bitcast i32 %565 to float, !dbg !49
+  %570 = tail call float @llvm.nvvm.div.full(float %536, float 1.280000e+02), !dbg !50
+  %571 = tail call float @llvm.nvvm.div.full(float %537, float 1.280000e+02), !dbg !50
+  %572 = tail call float @llvm.nvvm.div.full(float %538, float 1.280000e+02), !dbg !50
+  %573 = tail call float @llvm.nvvm.div.full(float %539, float 1.280000e+02), !dbg !50
+  %574 = tail call float @llvm.nvvm.div.full(float %546, float 1.280000e+02), !dbg !50
+  %575 = tail call float @llvm.nvvm.div.full(float %547, float 1.280000e+02), !dbg !50
+  %576 = tail call float @llvm.nvvm.div.full(float %548, float 1.280000e+02), !dbg !50
+  %577 = tail call float @llvm.nvvm.div.full(float %549, float 1.280000e+02), !dbg !50
+  %578 = tail call float @llvm.nvvm.div.full(float %556, float 1.280000e+02), !dbg !50
+  %579 = tail call float @llvm.nvvm.div.full(float %557, float 1.280000e+02), !dbg !50
+  %580 = tail call float @llvm.nvvm.div.full(float %558, float 1.280000e+02), !dbg !50
+  %581 = tail call float @llvm.nvvm.div.full(float %559, float 1.280000e+02), !dbg !50
+  %582 = tail call float @llvm.nvvm.div.full(float %566, float 1.280000e+02), !dbg !50
+  %583 = tail call float @llvm.nvvm.div.full(float %567, float 1.280000e+02), !dbg !50
+  %584 = tail call float @llvm.nvvm.div.full(float %568, float 1.280000e+02), !dbg !50
+  %585 = tail call float @llvm.nvvm.div.full(float %569, float 1.280000e+02), !dbg !50
+  %586 = fadd float %570, 0x3EB0C6F7A0000000, !dbg !51
+  %587 = fadd float %571, 0x3EB0C6F7A0000000, !dbg !51
+  %588 = fadd float %572, 0x3EB0C6F7A0000000, !dbg !51
+  %589 = fadd float %573, 0x3EB0C6F7A0000000, !dbg !51
+  %590 = fadd float %574, 0x3EB0C6F7A0000000, !dbg !51
+  %591 = fadd float %575, 0x3EB0C6F7A0000000, !dbg !51
+  %592 = fadd float %576, 0x3EB0C6F7A0000000, !dbg !51
+  %593 = fadd float %577, 0x3EB0C6F7A0000000, !dbg !51
+  %594 = fadd float %578, 0x3EB0C6F7A0000000, !dbg !51
+  %595 = fadd float %579, 0x3EB0C6F7A0000000, !dbg !51
+  %596 = fadd float %580, 0x3EB0C6F7A0000000, !dbg !51
+  %597 = fadd float %581, 0x3EB0C6F7A0000000, !dbg !51
+  %598 = fadd float %582, 0x3EB0C6F7A0000000, !dbg !51
+  %599 = fadd float %583, 0x3EB0C6F7A0000000, !dbg !51
+  %600 = fadd float %584, 0x3EB0C6F7A0000000, !dbg !51
+  %601 = fadd float %585, 0x3EB0C6F7A0000000, !dbg !51
+  %602 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i61 = icmp eq i32 %602, 0, !dbg !52
+  br i1 %.not.i61, label %605, label %603, !dbg !52
+
+603:                                              ; preds = %__nv_rsqrtf.exit60
+  %604 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %586), !dbg !52
+  br label %__nv_rsqrtf.exit63, !dbg !52
+
+605:                                              ; preds = %__nv_rsqrtf.exit60
+  %606 = tail call float @llvm.nvvm.rsqrt.approx.f(float %586), !dbg !52
+  br label %__nv_rsqrtf.exit63, !dbg !52
+
+__nv_rsqrtf.exit63:                               ; preds = %603, %605
+  %.0.i62 = phi float [ %604, %603 ], [ %606, %605 ], !dbg !52
+  %607 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i64 = icmp eq i32 %607, 0, !dbg !52
+  br i1 %.not.i64, label %610, label %608, !dbg !52
+
+608:                                              ; preds = %__nv_rsqrtf.exit63
+  %609 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %587), !dbg !52
+  br label %__nv_rsqrtf.exit66, !dbg !52
+
+610:                                              ; preds = %__nv_rsqrtf.exit63
+  %611 = tail call float @llvm.nvvm.rsqrt.approx.f(float %587), !dbg !52
+  br label %__nv_rsqrtf.exit66, !dbg !52
+
+__nv_rsqrtf.exit66:                               ; preds = %608, %610
+  %.0.i65 = phi float [ %609, %608 ], [ %611, %610 ], !dbg !52
+  %612 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i67 = icmp eq i32 %612, 0, !dbg !52
+  br i1 %.not.i67, label %615, label %613, !dbg !52
+
+613:                                              ; preds = %__nv_rsqrtf.exit66
+  %614 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %588), !dbg !52
+  br label %__nv_rsqrtf.exit69, !dbg !52
+
+615:                                              ; preds = %__nv_rsqrtf.exit66
+  %616 = tail call float @llvm.nvvm.rsqrt.approx.f(float %588), !dbg !52
+  br label %__nv_rsqrtf.exit69, !dbg !52
+
+__nv_rsqrtf.exit69:                               ; preds = %613, %615
+  %.0.i68 = phi float [ %614, %613 ], [ %616, %615 ], !dbg !52
+  %617 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i70 = icmp eq i32 %617, 0, !dbg !52
+  br i1 %.not.i70, label %620, label %618, !dbg !52
+
+618:                                              ; preds = %__nv_rsqrtf.exit69
+  %619 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %589), !dbg !52
+  br label %__nv_rsqrtf.exit72, !dbg !52
+
+620:                                              ; preds = %__nv_rsqrtf.exit69
+  %621 = tail call float @llvm.nvvm.rsqrt.approx.f(float %589), !dbg !52
+  br label %__nv_rsqrtf.exit72, !dbg !52
+
+__nv_rsqrtf.exit72:                               ; preds = %618, %620
+  %.0.i71 = phi float [ %619, %618 ], [ %621, %620 ], !dbg !52
+  %622 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i73 = icmp eq i32 %622, 0, !dbg !52
+  br i1 %.not.i73, label %625, label %623, !dbg !52
+
+623:                                              ; preds = %__nv_rsqrtf.exit72
+  %624 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %590), !dbg !52
+  br label %__nv_rsqrtf.exit75, !dbg !52
+
+625:                                              ; preds = %__nv_rsqrtf.exit72
+  %626 = tail call float @llvm.nvvm.rsqrt.approx.f(float %590), !dbg !52
+  br label %__nv_rsqrtf.exit75, !dbg !52
+
+__nv_rsqrtf.exit75:                               ; preds = %623, %625
+  %.0.i74 = phi float [ %624, %623 ], [ %626, %625 ], !dbg !52
+  %627 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i76 = icmp eq i32 %627, 0, !dbg !52
+  br i1 %.not.i76, label %630, label %628, !dbg !52
+
+628:                                              ; preds = %__nv_rsqrtf.exit75
+  %629 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %591), !dbg !52
+  br label %__nv_rsqrtf.exit78, !dbg !52
+
+630:                                              ; preds = %__nv_rsqrtf.exit75
+  %631 = tail call float @llvm.nvvm.rsqrt.approx.f(float %591), !dbg !52
+  br label %__nv_rsqrtf.exit78, !dbg !52
+
+__nv_rsqrtf.exit78:                               ; preds = %628, %630
+  %.0.i77 = phi float [ %629, %628 ], [ %631, %630 ], !dbg !52
+  %632 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i79 = icmp eq i32 %632, 0, !dbg !52
+  br i1 %.not.i79, label %635, label %633, !dbg !52
+
+633:                                              ; preds = %__nv_rsqrtf.exit78
+  %634 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %592), !dbg !52
+  br label %__nv_rsqrtf.exit81, !dbg !52
+
+635:                                              ; preds = %__nv_rsqrtf.exit78
+  %636 = tail call float @llvm.nvvm.rsqrt.approx.f(float %592), !dbg !52
+  br label %__nv_rsqrtf.exit81, !dbg !52
+
+__nv_rsqrtf.exit81:                               ; preds = %633, %635
+  %.0.i80 = phi float [ %634, %633 ], [ %636, %635 ], !dbg !52
+  %637 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i82 = icmp eq i32 %637, 0, !dbg !52
+  br i1 %.not.i82, label %640, label %638, !dbg !52
+
+638:                                              ; preds = %__nv_rsqrtf.exit81
+  %639 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %593), !dbg !52
+  br label %__nv_rsqrtf.exit84, !dbg !52
+
+640:                                              ; preds = %__nv_rsqrtf.exit81
+  %641 = tail call float @llvm.nvvm.rsqrt.approx.f(float %593), !dbg !52
+  br label %__nv_rsqrtf.exit84, !dbg !52
+
+__nv_rsqrtf.exit84:                               ; preds = %638, %640
+  %.0.i83 = phi float [ %639, %638 ], [ %641, %640 ], !dbg !52
+  %642 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i85 = icmp eq i32 %642, 0, !dbg !52
+  br i1 %.not.i85, label %645, label %643, !dbg !52
+
+643:                                              ; preds = %__nv_rsqrtf.exit84
+  %644 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %594), !dbg !52
+  br label %__nv_rsqrtf.exit87, !dbg !52
+
+645:                                              ; preds = %__nv_rsqrtf.exit84
+  %646 = tail call float @llvm.nvvm.rsqrt.approx.f(float %594), !dbg !52
+  br label %__nv_rsqrtf.exit87, !dbg !52
+
+__nv_rsqrtf.exit87:                               ; preds = %643, %645
+  %.0.i86 = phi float [ %644, %643 ], [ %646, %645 ], !dbg !52
+  %647 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i88 = icmp eq i32 %647, 0, !dbg !52
+  br i1 %.not.i88, label %650, label %648, !dbg !52
+
+648:                                              ; preds = %__nv_rsqrtf.exit87
+  %649 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %595), !dbg !52
+  br label %__nv_rsqrtf.exit90, !dbg !52
+
+650:                                              ; preds = %__nv_rsqrtf.exit87
+  %651 = tail call float @llvm.nvvm.rsqrt.approx.f(float %595), !dbg !52
+  br label %__nv_rsqrtf.exit90, !dbg !52
+
+__nv_rsqrtf.exit90:                               ; preds = %648, %650
+  %.0.i89 = phi float [ %649, %648 ], [ %651, %650 ], !dbg !52
+  %652 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i91 = icmp eq i32 %652, 0, !dbg !52
+  br i1 %.not.i91, label %655, label %653, !dbg !52
+
+653:                                              ; preds = %__nv_rsqrtf.exit90
+  %654 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %596), !dbg !52
+  br label %__nv_rsqrtf.exit93, !dbg !52
+
+655:                                              ; preds = %__nv_rsqrtf.exit90
+  %656 = tail call float @llvm.nvvm.rsqrt.approx.f(float %596), !dbg !52
+  br label %__nv_rsqrtf.exit93, !dbg !52
+
+__nv_rsqrtf.exit93:                               ; preds = %653, %655
+  %.0.i92 = phi float [ %654, %653 ], [ %656, %655 ], !dbg !52
+  %657 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i94 = icmp eq i32 %657, 0, !dbg !52
+  br i1 %.not.i94, label %660, label %658, !dbg !52
+
+658:                                              ; preds = %__nv_rsqrtf.exit93
+  %659 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %597), !dbg !52
+  br label %__nv_rsqrtf.exit96, !dbg !52
+
+660:                                              ; preds = %__nv_rsqrtf.exit93
+  %661 = tail call float @llvm.nvvm.rsqrt.approx.f(float %597), !dbg !52
+  br label %__nv_rsqrtf.exit96, !dbg !52
+
+__nv_rsqrtf.exit96:                               ; preds = %658, %660
+  %.0.i95 = phi float [ %659, %658 ], [ %661, %660 ], !dbg !52
+  %662 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i97 = icmp eq i32 %662, 0, !dbg !52
+  br i1 %.not.i97, label %665, label %663, !dbg !52
+
+663:                                              ; preds = %__nv_rsqrtf.exit96
+  %664 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %598), !dbg !52
+  br label %__nv_rsqrtf.exit99, !dbg !52
+
+665:                                              ; preds = %__nv_rsqrtf.exit96
+  %666 = tail call float @llvm.nvvm.rsqrt.approx.f(float %598), !dbg !52
+  br label %__nv_rsqrtf.exit99, !dbg !52
+
+__nv_rsqrtf.exit99:                               ; preds = %663, %665
+  %.0.i98 = phi float [ %664, %663 ], [ %666, %665 ], !dbg !52
+  %667 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i100 = icmp eq i32 %667, 0, !dbg !52
+  br i1 %.not.i100, label %670, label %668, !dbg !52
+
+668:                                              ; preds = %__nv_rsqrtf.exit99
+  %669 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %599), !dbg !52
+  br label %__nv_rsqrtf.exit102, !dbg !52
+
+670:                                              ; preds = %__nv_rsqrtf.exit99
+  %671 = tail call float @llvm.nvvm.rsqrt.approx.f(float %599), !dbg !52
+  br label %__nv_rsqrtf.exit102, !dbg !52
+
+__nv_rsqrtf.exit102:                              ; preds = %668, %670
+  %.0.i101 = phi float [ %669, %668 ], [ %671, %670 ], !dbg !52
+  %672 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i103 = icmp eq i32 %672, 0, !dbg !52
+  br i1 %.not.i103, label %675, label %673, !dbg !52
+
+673:                                              ; preds = %__nv_rsqrtf.exit102
+  %674 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %600), !dbg !52
+  br label %__nv_rsqrtf.exit105, !dbg !52
+
+675:                                              ; preds = %__nv_rsqrtf.exit102
+  %676 = tail call float @llvm.nvvm.rsqrt.approx.f(float %600), !dbg !52
+  br label %__nv_rsqrtf.exit105, !dbg !52
+
+__nv_rsqrtf.exit105:                              ; preds = %673, %675
+  %.0.i104 = phi float [ %674, %673 ], [ %676, %675 ], !dbg !52
+  %677 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !52
+  %.not.i106 = icmp eq i32 %677, 0, !dbg !52
+  br i1 %.not.i106, label %680, label %678, !dbg !52
+
+678:                                              ; preds = %__nv_rsqrtf.exit105
+  %679 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %601), !dbg !52
+  br label %__nv_rsqrtf.exit108, !dbg !52
+
+680:                                              ; preds = %__nv_rsqrtf.exit105
+  %681 = tail call float @llvm.nvvm.rsqrt.approx.f(float %601), !dbg !52
+  br label %__nv_rsqrtf.exit108, !dbg !52
+
+__nv_rsqrtf.exit108:                              ; preds = %678, %680
+  %.0.i107 = phi float [ %679, %678 ], [ %681, %680 ], !dbg !52
+  %682 = extractvalue { i32, i32, i32, i32 } %524, 3, !dbg !45
+  %683 = bitcast i32 %682 to <2 x bfloat>, !dbg !45
+  %684 = extractelement <2 x bfloat> %683, i64 1, !dbg !45
+  %685 = fpext bfloat %684 to float, !dbg !45
+  %686 = extractelement <2 x bfloat> %683, i64 0, !dbg !45
+  %687 = fpext bfloat %686 to float, !dbg !45
+  %688 = extractvalue { i32, i32, i32, i32 } %524, 2, !dbg !45
+  %689 = bitcast i32 %688 to <2 x bfloat>, !dbg !45
+  %690 = extractelement <2 x bfloat> %689, i64 1, !dbg !45
+  %691 = fpext bfloat %690 to float, !dbg !45
+  %692 = extractelement <2 x bfloat> %689, i64 0, !dbg !45
+  %693 = fpext bfloat %692 to float, !dbg !45
+  %694 = extractvalue { i32, i32, i32, i32 } %524, 1, !dbg !45
+  %695 = bitcast i32 %694 to <2 x bfloat>, !dbg !45
+  %696 = extractelement <2 x bfloat> %695, i64 1, !dbg !45
+  %697 = fpext bfloat %696 to float, !dbg !45
+  %698 = extractelement <2 x bfloat> %695, i64 0, !dbg !45
+  %699 = fpext bfloat %698 to float, !dbg !45
+  %700 = extractvalue { i32, i32, i32, i32 } %524, 0, !dbg !45
+  %701 = bitcast i32 %700 to <2 x bfloat>, !dbg !45
+  %702 = extractelement <2 x bfloat> %701, i64 1, !dbg !45
+  %703 = fpext bfloat %702 to float, !dbg !45
+  %704 = extractelement <2 x bfloat> %701, i64 0, !dbg !45
+  %705 = fpext bfloat %704 to float, !dbg !45
+  %706 = extractvalue { i32, i32, i32, i32 } %523, 3, !dbg !45
+  %707 = bitcast i32 %706 to <2 x bfloat>, !dbg !45
+  %708 = extractelement <2 x bfloat> %707, i64 1, !dbg !45
+  %709 = fpext bfloat %708 to float, !dbg !45
+  %710 = extractelement <2 x bfloat> %707, i64 0, !dbg !45
+  %711 = fpext bfloat %710 to float, !dbg !45
+  %712 = extractvalue { i32, i32, i32, i32 } %523, 2, !dbg !45
+  %713 = bitcast i32 %712 to <2 x bfloat>, !dbg !45
+  %714 = extractelement <2 x bfloat> %713, i64 1, !dbg !45
+  %715 = fpext bfloat %714 to float, !dbg !45
+  %716 = extractelement <2 x bfloat> %713, i64 0, !dbg !45
+  %717 = fpext bfloat %716 to float, !dbg !45
+  %718 = extractvalue { i32, i32, i32, i32 } %523, 1, !dbg !45
+  %719 = bitcast i32 %718 to <2 x bfloat>, !dbg !45
+  %720 = extractelement <2 x bfloat> %719, i64 1, !dbg !45
+  %721 = fpext bfloat %720 to float, !dbg !45
+  %722 = extractelement <2 x bfloat> %719, i64 0, !dbg !45
+  %723 = fpext bfloat %722 to float, !dbg !45
+  %724 = extractvalue { i32, i32, i32, i32 } %523, 0, !dbg !45
+  %725 = bitcast i32 %724 to <2 x bfloat>, !dbg !45
+  %726 = extractelement <2 x bfloat> %725, i64 1, !dbg !45
+  %727 = fpext bfloat %726 to float, !dbg !45
+  %728 = extractelement <2 x bfloat> %725, i64 0, !dbg !45
+  %729 = fpext bfloat %728 to float, !dbg !45
+  %730 = extractvalue { i32, i32, i32, i32 } %451, 3, !dbg !39
+  %731 = bitcast i32 %730 to <2 x bfloat>, !dbg !39
+  %732 = extractvalue { i32, i32, i32, i32 } %451, 2, !dbg !39
+  %733 = bitcast i32 %732 to <2 x bfloat>, !dbg !39
+  %734 = extractvalue { i32, i32, i32, i32 } %451, 1, !dbg !39
+  %735 = bitcast i32 %734 to <2 x bfloat>, !dbg !39
+  %736 = extractvalue { i32, i32, i32, i32 } %451, 0, !dbg !39
+  %737 = bitcast i32 %736 to <2 x bfloat>, !dbg !39
+  %738 = extractvalue { i32, i32, i32, i32 } %449, 3, !dbg !39
+  %739 = bitcast i32 %738 to <2 x bfloat>, !dbg !39
+  %740 = extractvalue { i32, i32, i32, i32 } %449, 2, !dbg !39
+  %741 = bitcast i32 %740 to <2 x bfloat>, !dbg !39
+  %742 = extractvalue { i32, i32, i32, i32 } %449, 1, !dbg !39
+  %743 = bitcast i32 %742 to <2 x bfloat>, !dbg !39
+  %744 = extractvalue { i32, i32, i32, i32 } %449, 0, !dbg !39
+  %745 = bitcast i32 %744 to <2 x bfloat>, !dbg !39
+  %746 = icmp slt i32 %23, 73728, !dbg !53
+  %747 = fmul float %.0.i62, %729, !dbg !54
+  %748 = fmul float %.0.i65, %727, !dbg !54
+  %749 = fmul float %.0.i68, %723, !dbg !54
+  %750 = fmul float %.0.i71, %721, !dbg !54
+  %751 = fmul float %.0.i74, %717, !dbg !54
+  %752 = fmul float %.0.i77, %715, !dbg !54
+  %753 = fmul float %.0.i80, %711, !dbg !54
+  %754 = fmul float %.0.i83, %709, !dbg !54
+  %755 = fmul float %.0.i86, %705, !dbg !54
+  %756 = fmul float %.0.i89, %703, !dbg !54
+  %757 = fmul float %.0.i92, %699, !dbg !54
+  %758 = fmul float %.0.i95, %697, !dbg !54
+  %759 = fmul float %.0.i98, %693, !dbg !54
+  %760 = fmul float %.0.i101, %691, !dbg !54
+  %761 = fmul float %.0.i104, %687, !dbg !54
+  %762 = fmul float %.0.i107, %685, !dbg !54
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54
+  store float %747, ptr addrspace(3) %372, align 4, !dbg !54
+  store float %749, ptr addrspace(3) %373, align 4, !dbg !54
+  store float %748, ptr addrspace(3) %375, align 4, !dbg !54
+  store float %750, ptr addrspace(3) %376, align 4, !dbg !54
+  store float %751, ptr addrspace(3) %378, align 4, !dbg !54
+  store float %753, ptr addrspace(3) %379, align 4, !dbg !54
+  store float %752, ptr addrspace(3) %381, align 4, !dbg !54
+  store float %754, ptr addrspace(3) %382, align 4, !dbg !54
+  store float %755, ptr addrspace(3) %384, align 4, !dbg !54
+  store float %757, ptr addrspace(3) %385, align 4, !dbg !54
+  store float %756, ptr addrspace(3) %387, align 4, !dbg !54
+  store float %758, ptr addrspace(3) %388, align 4, !dbg !54
+  store float %759, ptr addrspace(3) %390, align 4, !dbg !54
+  store float %761, ptr addrspace(3) %391, align 4, !dbg !54
+  store float %760, ptr addrspace(3) %393, align 4, !dbg !54
+  store float %762, ptr addrspace(3) %394, align 4, !dbg !54
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54
+  %763 = load float, ptr addrspace(3) %407, align 4, !dbg !54
+  %764 = load float, ptr addrspace(3) %409, align 4, !dbg !54
+  %765 = load float, ptr addrspace(3) %412, align 4, !dbg !54
+  %766 = load float, ptr addrspace(3) %414, align 4, !dbg !54
+  %767 = load float, ptr addrspace(3) %417, align 4, !dbg !54
+  %768 = load float, ptr addrspace(3) %419, align 4, !dbg !54
+  %769 = load float, ptr addrspace(3) %422, align 4, !dbg !54
+  %770 = load float, ptr addrspace(3) %424, align 4, !dbg !54
+  %771 = load float, ptr addrspace(3) %427, align 4, !dbg !54
+  %772 = load float, ptr addrspace(3) %429, align 4, !dbg !54
+  %773 = load float, ptr addrspace(3) %432, align 4, !dbg !54
+  %774 = load float, ptr addrspace(3) %434, align 4, !dbg !54
+  %775 = load float, ptr addrspace(3) %437, align 4, !dbg !54
+  %776 = load float, ptr addrspace(3) %439, align 4, !dbg !54
+  %777 = load float, ptr addrspace(3) %442, align 4, !dbg !54
+  %778 = load float, ptr addrspace(3) %444, align 4, !dbg !54
+  %779 = getelementptr bfloat, ptr addrspace(1) %5, i64 %446, !dbg !55
+  %780 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !56
+  %781 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %779, i64 %780, i1 %461) #6, !dbg !56
+  %782 = extractvalue { i32, i32, i32, i32 } %781, 0, !dbg !56
+  %783 = bitcast i32 %782 to <2 x bfloat>, !dbg !56
+  %784 = extractvalue { i32, i32, i32, i32 } %781, 1, !dbg !56
+  %785 = bitcast i32 %784 to <2 x bfloat>, !dbg !56
+  %786 = extractvalue { i32, i32, i32, i32 } %781, 2, !dbg !56
+  %787 = bitcast i32 %786 to <2 x bfloat>, !dbg !56
+  %788 = extractvalue { i32, i32, i32, i32 } %781, 3, !dbg !56
+  %789 = bitcast i32 %788 to <2 x bfloat>, !dbg !56
+  %790 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !56
+  %791 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %779, i64 %790, i1 %461) #6, !dbg !56
+  %792 = extractvalue { i32, i32, i32, i32 } %791, 0, !dbg !56
+  %793 = bitcast i32 %792 to <2 x bfloat>, !dbg !56
+  %794 = extractvalue { i32, i32, i32, i32 } %791, 1, !dbg !56
+  %795 = bitcast i32 %794 to <2 x bfloat>, !dbg !56
+  %796 = extractvalue { i32, i32, i32, i32 } %791, 2, !dbg !56
+  %797 = bitcast i32 %796 to <2 x bfloat>, !dbg !56
+  %798 = extractvalue { i32, i32, i32, i32 } %791, 3, !dbg !56
+  %799 = bitcast i32 %798 to <2 x bfloat>, !dbg !56
+  %800 = shl i32 %23, 7, !dbg !57
+  %801 = shl i32 %24, 7, !dbg !57
+  %802 = add i32 %800, %32, !dbg !58
+  %803 = add i32 %801, %32, !dbg !58
+  %804 = sext i32 %802 to i64, !dbg !59
+  %805 = getelementptr bfloat, ptr addrspace(1) %6, i64 %804, !dbg !59
+  %806 = sext i32 %803 to i64, !dbg !59
+  %807 = getelementptr bfloat, ptr addrspace(1) %6, i64 %806, !dbg !59
+  %808 = and i1 %34, %746, !dbg !60
+  %809 = fpext <2 x bfloat> %745 to <2 x float>, !dbg !61
+  %810 = insertelement <2 x float> poison, float %408, i64 0, !dbg !62
+  %811 = insertelement <2 x float> %810, float %413, i64 1, !dbg !62
+  %812 = fmul <2 x float> %811, %809, !dbg !62
+  %813 = fpext <2 x bfloat> %783 to <2 x float>, !dbg !63
+  %814 = insertelement <2 x float> poison, float %763, i64 0, !dbg !64
+  %815 = insertelement <2 x float> %814, float %765, i64 1, !dbg !64
+  %816 = fmul <2 x float> %815, %813, !dbg !64
+  %817 = insertelement <2 x i1> poison, i1 %41, i64 0, !dbg !65
+  %818 = shufflevector <2 x i1> %817, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !65
+  %819 = select <2 x i1> %818, <2 x float> %812, <2 x float> %816, !dbg !65
+  %820 = fptrunc <2 x float> %819 to <2 x bfloat>, !dbg !66
+  %821 = fpext <2 x bfloat> %743 to <2 x float>, !dbg !61
+  %822 = insertelement <2 x float> poison, float %418, i64 0, !dbg !62
+  %823 = insertelement <2 x float> %822, float %423, i64 1, !dbg !62
+  %824 = fmul <2 x float> %823, %821, !dbg !62
+  %825 = fpext <2 x bfloat> %785 to <2 x float>, !dbg !63
+  %826 = insertelement <2 x float> poison, float %767, i64 0, !dbg !64
+  %827 = insertelement <2 x float> %826, float %769, i64 1, !dbg !64
+  %828 = fmul <2 x float> %827, %825, !dbg !64
+  %829 = select <2 x i1> %818, <2 x float> %824, <2 x float> %828, !dbg !65
+  %830 = fptrunc <2 x float> %829 to <2 x bfloat>, !dbg !66
+  %831 = fpext <2 x bfloat> %741 to <2 x float>, !dbg !61
+  %832 = insertelement <2 x float> poison, float %410, i64 0, !dbg !62
+  %833 = insertelement <2 x float> %832, float %415, i64 1, !dbg !62
+  %834 = fmul <2 x float> %833, %831, !dbg !62
+  %835 = fpext <2 x bfloat> %787 to <2 x float>, !dbg !63
+  %836 = insertelement <2 x float> poison, float %764, i64 0, !dbg !64
+  %837 = insertelement <2 x float> %836, float %766, i64 1, !dbg !64
+  %838 = fmul <2 x float> %837, %835, !dbg !64
+  %839 = select <2 x i1> %818, <2 x float> %834, <2 x float> %838, !dbg !65
+  %840 = fptrunc <2 x float> %839 to <2 x bfloat>, !dbg !66
+  %841 = fpext <2 x bfloat> %739 to <2 x float>, !dbg !61
+  %842 = insertelement <2 x float> poison, float %420, i64 0, !dbg !62
+  %843 = insertelement <2 x float> %842, float %425, i64 1, !dbg !62
+  %844 = fmul <2 x float> %843, %841, !dbg !62
+  %845 = fpext <2 x bfloat> %789 to <2 x float>, !dbg !63
+  %846 = insertelement <2 x float> poison, float %768, i64 0, !dbg !64
+  %847 = insertelement <2 x float> %846, float %770, i64 1, !dbg !64
+  %848 = fmul <2 x float> %847, %845, !dbg !64
+  %849 = select <2 x i1> %818, <2 x float> %844, <2 x float> %848, !dbg !65
+  %850 = fptrunc <2 x float> %849 to <2 x bfloat>, !dbg !66
+  %851 = fpext <2 x bfloat> %737 to <2 x float>, !dbg !61
+  %852 = insertelement <2 x float> poison, float %428, i64 0, !dbg !62
+  %853 = insertelement <2 x float> %852, float %433, i64 1, !dbg !62
+  %854 = fmul <2 x float> %853, %851, !dbg !62
+  %855 = fpext <2 x bfloat> %793 to <2 x float>, !dbg !63
+  %856 = insertelement <2 x float> poison, float %771, i64 0, !dbg !64
+  %857 = insertelement <2 x float> %856, float %773, i64 1, !dbg !64
+  %858 = fmul <2 x float> %857, %855, !dbg !64
+  %859 = select <2 x i1> %818, <2 x float> %854, <2 x float> %858, !dbg !65
+  %860 = fptrunc <2 x float> %859 to <2 x bfloat>, !dbg !66
+  %861 = fpext <2 x bfloat> %735 to <2 x float>, !dbg !61
+  %862 = insertelement <2 x float> poison, float %438, i64 0, !dbg !62
+  %863 = insertelement <2 x float> %862, float %443, i64 1, !dbg !62
+  %864 = fmul <2 x float> %863, %861, !dbg !62
+  %865 = fpext <2 x bfloat> %795 to <2 x float>, !dbg !63
+  %866 = insertelement <2 x float> poison, float %775, i64 0, !dbg !64
+  %867 = insertelement <2 x float> %866, float %777, i64 1, !dbg !64
+  %868 = fmul <2 x float> %867, %865, !dbg !64
+  %869 = select <2 x i1> %818, <2 x float> %864, <2 x float> %868, !dbg !65
+  %870 = fptrunc <2 x float> %869 to <2 x bfloat>, !dbg !66
+  %871 = fpext <2 x bfloat> %733 to <2 x float>, !dbg !61
+  %872 = insertelement <2 x float> poison, float %430, i64 0, !dbg !62
+  %873 = insertelement <2 x float> %872, float %435, i64 1, !dbg !62
+  %874 = fmul <2 x float> %873, %871, !dbg !62
+  %875 = fpext <2 x bfloat> %797 to <2 x float>, !dbg !63
+  %876 = insertelement <2 x float> poison, float %772, i64 0, !dbg !64
+  %877 = insertelement <2 x float> %876, float %774, i64 1, !dbg !64
+  %878 = fmul <2 x float> %877, %875, !dbg !64
+  %879 = select <2 x i1> %818, <2 x float> %874, <2 x float> %878, !dbg !65
+  %880 = fptrunc <2 x float> %879 to <2 x bfloat>, !dbg !66
+  %881 = fpext <2 x bfloat> %731 to <2 x float>, !dbg !61
+  %882 = insertelement <2 x float> poison, float %440, i64 0, !dbg !62
+  %883 = insertelement <2 x float> %882, float %445, i64 1, !dbg !62
+  %884 = fmul <2 x float> %883, %881, !dbg !62
+  %885 = fpext <2 x bfloat> %799 to <2 x float>, !dbg !63
+  %886 = insertelement <2 x float> poison, float %776, i64 0, !dbg !64
+  %887 = insertelement <2 x float> %886, float %778, i64 1, !dbg !64
+  %888 = fmul <2 x float> %887, %885, !dbg !64
+  %889 = select <2 x i1> %818, <2 x float> %884, <2 x float> %888, !dbg !65
+  %890 = fptrunc <2 x float> %889 to <2 x bfloat>, !dbg !66
+  %891 = bitcast <2 x bfloat> %820 to i32, !dbg !66
+  %892 = bitcast <2 x bfloat> %830 to i32, !dbg !66
+  %893 = bitcast <2 x bfloat> %840 to i32, !dbg !66
+  %894 = bitcast <2 x bfloat> %850 to i32, !dbg !66
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %891, i32 %892, i32 %893, i32 %894, ptr addrspace(1) %805, i1 %808) #6, !dbg !66
+  %895 = bitcast <2 x bfloat> %860 to i32, !dbg !66
+  %896 = bitcast <2 x bfloat> %870 to i32, !dbg !66
+  %897 = bitcast <2 x bfloat> %880 to i32, !dbg !66
+  %898 = bitcast <2 x bfloat> %890 to i32, !dbg !66
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %895, i32 %896, i32 %897, i32 %898, ptr addrspace(1) %807, i1 %808) #6, !dbg !66
+  ret void, !dbg !67
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2
+
+; Function Attrs: nocallback nofree nounwind memory(argmem: read)
+declare { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.trans.b16.p3(ptr addrspace(3) readonly captures(none)) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #4
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #4
+
+attributes #0 = { nounwind "nvvm.reqntid"="256" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind }
+attributes #3 = { nocallback nofree nounwind memory(argmem: read) }
+attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!5 = distinct !DISubprogram(name: "triton_poi_fused__fused_rms_norm_cat_view_2", linkageName: "triton_poi_fused__fused_rms_norm_cat_view_2", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 21, column: 29, scope: !5)
+!9 = !DILocation(line: 21, column: 48, scope: !5)
+!10 = !DILocation(line: 21, column: 69, scope: !5)
+!11 = !DILocation(line: 21, column: 53, scope: !5)
+!12 = !DILocation(line: 21, column: 34, scope: !5)
+!13 = !DILocation(line: 21, column: 75, scope: !5)
+!14 = !DILocation(line: 22, column: 44, scope: !5)
+!15 = !DILocation(line: 22, column: 23, scope: !5)
+!16 = !DILocation(line: 24, column: 28, scope: !5)
+!17 = !DILocation(line: 24, column: 33, scope: !5)
+!18 = !DILocation(line: 25, column: 44, scope: !5)
+!19 = !DILocation(line: 25, column: 23, scope: !5)
+!20 = !DILocation(line: 26, column: 21, scope: !5)
+!21 = !DILocation(line: 27, column: 19, scope: !5)
+!22 = !DILocation(line: 29, column: 19, scope: !5)
+!23 = !DILocation(line: 35, column: 18, scope: !5)
+!24 = !DILocation(line: 36, column: 39, scope: !5)
+!25 = !DILocation(line: 36, column: 35, scope: !5)
+!26 = !DILocation(line: 36, column: 51, scope: !5)
+!27 = !DILocation(line: 36, column: 44, scope: !5)
+!28 = !DILocation(line: 36, column: 30, scope: !5)
+!29 = !DILocation(line: 36, column: 64, scope: !5)
+!30 = !DILocation(line: 36, column: 57, scope: !5)
+!31 = !DILocation(line: 36, column: 123, scope: !5)
+!32 = !DILocation(line: 38, column: 30, scope: !5)
+!33 = !DILocation(line: 38, column: 80, scope: !5)
+!34 = !DILocation(line: 40, column: 19, scope: !5)
+!35 = !DILocation(line: 42, column: 19, scope: !5)
+!36 = !DILocation(line: 43, column: 28, scope: !5)
+!37 = !DILocation(line: 44, column: 19, scope: !5)
+!38 = !DILocation(line: 45, column: 31, scope: !5)
+!39 = !DILocation(line: 45, column: 71, scope: !5)
+!40 = !DILocation(line: 54, column: 52, scope: !5)
+!41 = !DILocation(line: 54, column: 45, scope: !5)
+!42 = !DILocation(line: 54, column: 31, scope: !5)
+!43 = !DILocation(line: 54, column: 83, scope: !5)
+!44 = !DILocation(line: 54, column: 67, scope: !5)
+!45 = !DILocation(line: 54, column: 134, scope: !5)
+!46 = !DILocation(line: 56, column: 56, scope: !5)
+!47 = !DILocation(line: 56, column: 52, scope: !5)
+!48 = !DILocation(line: 56, column: 31, scope: !5)
+!49 = !DILocation(line: 56, column: 90, scope: !5)
+!50 = !DILocation(line: 58, column: 21, scope: !5)
+!51 = !DILocation(line: 60, column: 20, scope: !5)
+!52 = !DILocation(line: 61, column: 28, scope: !5)
+!53 = !DILocation(line: 23, column: 21, scope: !5)
+!54 = !DILocation(line: 62, column: 20, scope: !5)
+!55 = !DILocation(line: 63, column: 31, scope: !5)
+!56 = !DILocation(line: 63, column: 71, scope: !5)
+!57 = !DILocation(line: 70, column: 34, scope: !5)
+!58 = !DILocation(line: 70, column: 30, scope: !5)
+!59 = !DILocation(line: 70, column: 25, scope: !5)
+!60 = !DILocation(line: 70, column: 54, scope: !5)
+!61 = !DILocation(line: 45, column: 137, scope: !5)
+!62 = !DILocation(line: 47, column: 20, scope: !5)
+!63 = !DILocation(line: 63, column: 138, scope: !5)
+!64 = !DILocation(line: 65, column: 20, scope: !5)
+!65 = !DILocation(line: 0, scope: !5)
+!66 = !DILocation(line: 70, column: 46, scope: !5)
+!67 = !DILocation(line: 70, column: 4, scope: !5)
diff --git a/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.ptx b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..ca658b515880c2ca77001f4f0b40acdb60efe741
--- /dev/null
+++ b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.ptx
@@ -0,0 +1,1096 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused__fused_rms_norm_cat_view_2 // -- Begin function triton_poi_fused__fused_rms_norm_cat_view_2
+.extern .shared .align 16 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
+                                        // @triton_poi_fused__fused_rms_norm_cat_view_2
+.visible .entry triton_poi_fused__fused_rms_norm_cat_view_2(
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_1,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_4,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_5,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_6,
+	.param .u32 triton_poi_fused__fused_rms_norm_cat_view_2_param_7,
+	.param .u32 triton_poi_fused__fused_rms_norm_cat_view_2_param_8,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_9,
+	.param .u64 .ptr .global .align 1 triton_poi_fused__fused_rms_norm_cat_view_2_param_10
+)
+.reqntid 256
+{
+	.reg .pred 	%p<12>;
+	.reg .b16 	%rs<65>;
+	.reg .b32 	%r<499>;
+	.reg .b64 	%rd<35>;
+	.loc	1 18 0                          // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:18:0
+
+// %bb.0:                               // %__nv_rsqrtf.exit
+	ld.param.b64 	%rd27, [triton_poi_fused__fused_rms_norm_cat_view_2_param_0];
+	ld.param.b64 	%rd28, [triton_poi_fused__fused_rms_norm_cat_view_2_param_1];
+$L__tmp0:
+	.loc	1 21 29                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:29
+	mov.u32 	%r74, %ctaid.y;
+	ld.param.b64 	%rd29, [triton_poi_fused__fused_rms_norm_cat_view_2_param_2];
+	.loc	1 21 48                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:48
+	mov.u32 	%r75, %ctaid.z;
+	ld.param.b64 	%rd30, [triton_poi_fused__fused_rms_norm_cat_view_2_param_3];
+	.loc	1 21 69                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:69
+	mov.u32 	%r76, %nctaid.y;
+	ld.param.b64 	%rd31, [triton_poi_fused__fused_rms_norm_cat_view_2_param_4];
+	.loc	1 21 34                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:34
+	mad.lo.s32 	%r77, %r75, %r76, %r74;
+	ld.param.b64 	%rd32, [triton_poi_fused__fused_rms_norm_cat_view_2_param_5];
+	.loc	1 21 75                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:21:75
+	shl.b32 	%r78, %r77, 5;
+	ld.param.b64 	%rd33, [triton_poi_fused__fused_rms_norm_cat_view_2_param_6];
+	.loc	1 22 44                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:44
+	mov.u32 	%r79, %tid.x;
+	bfe.u32 	%r80, %r79, 4, 4;
+	and.b32 	%r81, %r79, 7;
+	shl.b32 	%r82, %r81, 2;
+	.loc	1 22 23                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:22:23
+	or.b32 	%r83, %r78, %r80;
+	or.b32 	%r84, %r83, 16;
+	or.b32 	%r85, %r78, %r82;
+	.loc	1 24 28                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:24:28
+	mov.u32 	%r86, %ctaid.x;
+	.loc	1 24 33                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:24:33
+	shl.b32 	%r87, %r86, 7;
+	.loc	1 25 44                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:25:44
+	and.b32 	%r88, %r79, 15;
+	shl.b32 	%r89, %r88, 3;
+	bfe.u32 	%r90, %r79, 3, 5;
+	.loc	1 25 23                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:25:23
+	or.b32 	%r91, %r89, %r87;
+	or.b32 	%r92, %r90, %r87;
+	.loc	1 26 21                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:26:21
+	setp.lt.s32 	%p6, %r91, 128;
+	setp.lt.s32 	%p7, %r92, 128;
+	.loc	1 27 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:27:19
+	bfe.s32 	%r93, %r77, 26, 1;
+	.loc	1 29 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:29:19
+	shr.u32 	%r94, %r93, 27;
+	.loc	1 27 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:27:19
+	add.s32 	%r95, %r83, %r94;
+	shr.u32 	%r96, %r95, 5;
+	.loc	1 29 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:29:19
+	and.b32 	%r97, %r95, 33554400;
+	sub.s32 	%r98, %r83, %r97;
+	add.s32 	%r99, %r84, %r94;
+	and.b32 	%r100, %r99, 33554400;
+	sub.s32 	%r101, %r84, %r100;
+	.loc	1 35 18                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:35:18
+	setp.lt.s32 	%p8, %r83, 8192;
+	setp.lt.s32 	%p9, %r85, 8192;
+	.loc	1 36 39                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:39
+	shl.b32 	%r102, %r98, 7;
+	shl.b32 	%r103, %r101, 7;
+	.loc	1 36 35                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:35
+	add.s32 	%r104, %r102, %r91;
+	add.s32 	%r105, %r103, %r91;
+	.loc	1 36 51                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:51
+	mul.lo.s32 	%r106, %r96, 12288;
+	.loc	1 36 44                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:44
+	add.s32 	%r107, %r104, %r106;
+	add.s32 	%r108, %r105, %r106;
+	.loc	1 36 30                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:30
+	mad.wide.s32 	%rd1, %r107, 2, %rd27;
+	mad.wide.s32 	%rd3, %r108, 2, %rd27;
+	.loc	1 36 64                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:64
+	and.pred 	%p1, %p6, %p8;
+	and.pred 	%p2, %p7, %p9;
+	.loc	1 36 57                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:57
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r5, 0;
+	// begin inline asm
+	mov.u32 %r1, %r5;
+	mov.u32 %r2, %r5;
+	mov.u32 %r3, %r5;
+	mov.u32 %r4, %r5;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd4, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r6, %r5;
+	mov.u32 %r7, %r5;
+	mov.u32 %r8, %r5;
+	mov.u32 %r9, %r5;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd3 + 0 ], %rd4;
+	// end inline asm
+	prmt.b32 	%r109, %r1, %r6, 0x7632U;
+	prmt.b32 	%r110, %r2, %r7, 0x7632U;
+	prmt.b32 	%r111, %r3, %r8, 0x7632U;
+	prmt.b32 	%r112, %r4, %r9, 0x7632U;
+	.loc	1 36 123                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:36:123
+	and.b32 	%r113, %r79, 192;
+	shl.b32 	%r114, %r113, 5;
+	shl.b32 	%r115, %r81, 4;
+	shr.u32 	%r116, %r113, 1;
+	shl.b32 	%r117, %r79, 6;
+	and.b32 	%r118, %r117, 512;
+	and.b32 	%r119, %r79, 16;
+	bfe.s32 	%r120, %r79, 4, 1;
+	and.b32 	%r121, %r120, 1040;
+	and.b32 	%r122, %r79, 32;
+	shl.b32 	%r123, %r122, 2;
+	or.b32 	%r124, %r114, %r115;
+	or.b32 	%r125, %r121, %r116;
+	xor.b32 	%r126, %r125, %r124;
+	mov.b32 	%r127, global_smem;
+	add.s32 	%r128, %r127, %r118;
+	add.s32 	%r129, %r128, %r126;
+	add.s32 	%r130, %r129, %r123;
+	prmt.b32 	%r131, %r1, %r6, 0x5410U;
+	prmt.b32 	%r132, %r2, %r7, 0x5410U;
+	st.shared.v4.b32 	[%r130], {%r131, %r109, %r132, %r110};
+	prmt.b32 	%r133, %r3, %r8, 0x5410U;
+	prmt.b32 	%r134, %r4, %r9, 0x5410U;
+	st.shared.v4.b32 	[%r130+256], {%r133, %r111, %r134, %r112};
+	bar.sync 	0;
+	shl.b32 	%r135, %r81, 10;
+	shl.b32 	%r136, %r88, 4;
+	shr.u32 	%r137, %r113, 2;
+	shl.b32 	%r138, %r119, 2;
+	shl.b32 	%r139, %r122, 3;
+	xor.b32 	%r140, %r136, %r137;
+	xor.b32 	%r141, %r140, %r138;
+	add.s32 	%r142, %r127, %r135;
+	add.s32 	%r143, %r142, %r141;
+	add.s32 	%r144, %r143, %r139;
+	ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r145, %r146, %r147, %r148}, [%r144];
+	mov.b32 	{%rs1, %rs2}, %r145;
+	mov.b32 	{%rs3, %rs4}, %r146;
+	mov.b32 	{%rs5, %rs6}, %r147;
+	mov.b32 	{%rs7, %rs8}, %r148;
+	ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r149, %r150, %r151, %r152}, [%r144+512];
+	mov.b32 	{%rs9, %rs10}, %r149;
+	mov.b32 	{%rs11, %rs12}, %r150;
+	mov.b32 	{%rs13, %rs14}, %r151;
+	mov.b32 	{%rs15, %rs16}, %r152;
+	cvt.f32.bf16 	%r153, %rs1;
+	cvt.f32.bf16 	%r154, %rs2;
+	cvt.f32.bf16 	%r155, %rs3;
+	cvt.f32.bf16 	%r156, %rs4;
+	cvt.f32.bf16 	%r157, %rs5;
+	cvt.f32.bf16 	%r158, %rs6;
+	cvt.f32.bf16 	%r159, %rs7;
+	cvt.f32.bf16 	%r160, %rs8;
+	cvt.f32.bf16 	%r161, %rs9;
+	cvt.f32.bf16 	%r162, %rs10;
+	cvt.f32.bf16 	%r163, %rs11;
+	cvt.f32.bf16 	%r164, %rs12;
+	cvt.f32.bf16 	%r165, %rs13;
+	cvt.f32.bf16 	%r166, %rs14;
+	cvt.f32.bf16 	%r167, %rs15;
+	cvt.f32.bf16 	%r168, %rs16;
+	.loc	1 38 30                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:38:30
+	mad.wide.s32 	%rd5, %r85, 4, %rd28;
+	.loc	1 38 80                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:38:80
+	// begin inline asm
+	mov.u64 %rd6, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r10, %r5;
+	mov.u32 %r11, %r5;
+	mov.u32 %r12, %r5;
+	mov.u32 %r13, %r5;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd5 + 0 ], %rd6;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd7, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd7, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r14, %r5;
+	mov.u32 %r15, %r5;
+	mov.u32 %r16, %r5;
+	mov.u32 %r17, %r5;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd5 + 0 ], %rd7;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd8, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd8, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r18, %r5;
+	mov.u32 %r19, %r5;
+	mov.u32 %r20, %r5;
+	mov.u32 %r21, %r5;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd5 + 0 ], %rd8;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd9, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd9, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r22, %r5;
+	mov.u32 %r23, %r5;
+	mov.u32 %r24, %r5;
+	mov.u32 %r25, %r5;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r22, %r23, %r24, %r25 }, [ %rd5 + 0 ], %rd9;
+	// end inline asm
+	mov.b32 	%r169, 0f43000000;
+	.loc	1 40 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:40:19
+	div.full.f32 	%r170, %r10, %r169;
+	div.full.f32 	%r171, %r11, %r169;
+	div.full.f32 	%r172, %r12, %r169;
+	div.full.f32 	%r173, %r13, %r169;
+	div.full.f32 	%r174, %r14, %r169;
+	div.full.f32 	%r175, %r15, %r169;
+	div.full.f32 	%r176, %r16, %r169;
+	div.full.f32 	%r177, %r17, %r169;
+	div.full.f32 	%r178, %r18, %r169;
+	div.full.f32 	%r179, %r19, %r169;
+	div.full.f32 	%r180, %r20, %r169;
+	div.full.f32 	%r181, %r21, %r169;
+	div.full.f32 	%r182, %r22, %r169;
+	div.full.f32 	%r183, %r23, %r169;
+	div.full.f32 	%r184, %r24, %r169;
+	div.full.f32 	%r185, %r25, %r169;
+	.loc	1 42 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:42:19
+	add.f32 	%r186, %r170, 0f358637BD;
+	add.f32 	%r187, %r171, 0f358637BD;
+	add.f32 	%r188, %r172, 0f358637BD;
+	add.f32 	%r189, %r173, 0f358637BD;
+	add.f32 	%r190, %r174, 0f358637BD;
+	add.f32 	%r191, %r175, 0f358637BD;
+	add.f32 	%r192, %r176, 0f358637BD;
+	add.f32 	%r193, %r177, 0f358637BD;
+	add.f32 	%r194, %r178, 0f358637BD;
+	add.f32 	%r195, %r179, 0f358637BD;
+	add.f32 	%r196, %r180, 0f358637BD;
+	add.f32 	%r197, %r181, 0f358637BD;
+	add.f32 	%r198, %r182, 0f358637BD;
+	add.f32 	%r199, %r183, 0f358637BD;
+	add.f32 	%r200, %r184, 0f358637BD;
+	add.f32 	%r201, %r185, 0f358637BD;
+	.loc	1 43 28                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:43:28
+	rsqrt.approx.ftz.f32 	%r202, %r186;
+	rsqrt.approx.ftz.f32 	%r203, %r187;
+	rsqrt.approx.ftz.f32 	%r204, %r188;
+	rsqrt.approx.ftz.f32 	%r205, %r189;
+	rsqrt.approx.ftz.f32 	%r206, %r190;
+	rsqrt.approx.ftz.f32 	%r207, %r191;
+	rsqrt.approx.ftz.f32 	%r208, %r192;
+	rsqrt.approx.ftz.f32 	%r209, %r193;
+	rsqrt.approx.ftz.f32 	%r210, %r194;
+	rsqrt.approx.ftz.f32 	%r211, %r195;
+	rsqrt.approx.ftz.f32 	%r212, %r196;
+	rsqrt.approx.ftz.f32 	%r213, %r197;
+	rsqrt.approx.ftz.f32 	%r214, %r198;
+	rsqrt.approx.ftz.f32 	%r215, %r199;
+	rsqrt.approx.ftz.f32 	%r216, %r200;
+	rsqrt.approx.ftz.f32 	%r217, %r201;
+	.loc	1 44 19                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:44:19
+	mul.f32 	%r218, %r202, %r153;
+	mul.f32 	%r219, %r203, %r154;
+	mul.f32 	%r220, %r204, %r155;
+	mul.f32 	%r221, %r205, %r156;
+	mul.f32 	%r222, %r206, %r157;
+	mul.f32 	%r223, %r207, %r158;
+	mul.f32 	%r224, %r208, %r159;
+	mul.f32 	%r225, %r209, %r160;
+	mul.f32 	%r226, %r210, %r161;
+	mul.f32 	%r227, %r211, %r162;
+	mul.f32 	%r228, %r212, %r163;
+	mul.f32 	%r229, %r213, %r164;
+	mul.f32 	%r230, %r214, %r165;
+	mul.f32 	%r231, %r215, %r166;
+	mul.f32 	%r232, %r216, %r167;
+	mul.f32 	%r233, %r217, %r168;
+	bar.sync 	0;
+	shl.b32 	%r234, %r79, 9;
+	and.b32 	%r235, %r234, 15360;
+	shr.u32 	%r236, %r79, 1;
+	and.b32 	%r237, %r236, 108;
+	or.b32 	%r238, %r235, %r115;
+	xor.b32 	%r239, %r238, %r237;
+	or.b32 	%r240, %r239, %r139;
+	add.s32 	%r241, %r127, %r240;
+	st.shared.b32 	[%r241], %r218;
+	st.shared.b32 	[%r241+128], %r220;
+	xor.b32 	%r242, %r240, 16;
+	add.s32 	%r243, %r127, %r242;
+	st.shared.b32 	[%r243+512], %r219;
+	st.shared.b32 	[%r243+640], %r221;
+	xor.b32 	%r244, %r240, 4;
+	add.s32 	%r245, %r127, %r244;
+	st.shared.b32 	[%r245], %r222;
+	st.shared.b32 	[%r245+128], %r224;
+	xor.b32 	%r246, %r240, 20;
+	add.s32 	%r247, %r127, %r246;
+	st.shared.b32 	[%r247+512], %r223;
+	st.shared.b32 	[%r247+640], %r225;
+	xor.b32 	%r248, %r240, 8;
+	add.s32 	%r249, %r127, %r248;
+	st.shared.b32 	[%r249], %r226;
+	st.shared.b32 	[%r249+128], %r228;
+	xor.b32 	%r250, %r240, 24;
+	add.s32 	%r251, %r127, %r250;
+	st.shared.b32 	[%r251+512], %r227;
+	st.shared.b32 	[%r251+640], %r229;
+	xor.b32 	%r252, %r240, 12;
+	add.s32 	%r253, %r127, %r252;
+	st.shared.b32 	[%r253], %r230;
+	st.shared.b32 	[%r253+128], %r232;
+	xor.b32 	%r254, %r240, 28;
+	add.s32 	%r255, %r127, %r254;
+	st.shared.b32 	[%r255+512], %r231;
+	st.shared.b32 	[%r255+640], %r233;
+	bar.sync 	0;
+	shl.b32 	%r256, %r79, 5;
+	and.b32 	%r257, %r256, 608;
+	and.b32 	%r258, %r79, 28;
+	shr.u32 	%r259, %r79, 2;
+	and.b32 	%r260, %r259, 16;
+	bfe.s32 	%r261, %r79, 7, 1;
+	and.b32 	%r262, %r261, 1056;
+	or.b32 	%r263, %r257, %r258;
+	or.b32 	%r264, %r262, %r260;
+	xor.b32 	%r265, %r264, %r263;
+	or.b32 	%r266, %r265, %r123;
+	add.s32 	%r267, %r127, %r266;
+	ld.shared.b32 	%r268, [%r267];
+	ld.shared.b32 	%r269, [%r267+256];
+	xor.b32 	%r270, %r266, 4;
+	add.s32 	%r271, %r127, %r270;
+	ld.shared.b32 	%r272, [%r271+4096];
+	ld.shared.b32 	%r273, [%r271+4352];
+	xor.b32 	%r274, %r266, 8;
+	add.s32 	%r275, %r127, %r274;
+	ld.shared.b32 	%r276, [%r275+8192];
+	ld.shared.b32 	%r277, [%r275+8448];
+	xor.b32 	%r278, %r266, 12;
+	add.s32 	%r279, %r127, %r278;
+	ld.shared.b32 	%r280, [%r279+12288];
+	ld.shared.b32 	%r281, [%r279+12544];
+	xor.b32 	%r282, %r266, 64;
+	add.s32 	%r283, %r127, %r282;
+	ld.shared.b32 	%r284, [%r283+2048];
+	ld.shared.b32 	%r285, [%r283+2304];
+	xor.b32 	%r286, %r266, 68;
+	add.s32 	%r287, %r127, %r286;
+	ld.shared.b32 	%r288, [%r287+6144];
+	ld.shared.b32 	%r289, [%r287+6400];
+	xor.b32 	%r290, %r266, 72;
+	add.s32 	%r291, %r127, %r290;
+	ld.shared.b32 	%r292, [%r291+10240];
+	ld.shared.b32 	%r293, [%r291+10496];
+	xor.b32 	%r294, %r266, 76;
+	add.s32 	%r295, %r127, %r294;
+	ld.shared.b32 	%r296, [%r295+14336];
+	ld.shared.b32 	%r297, [%r295+14592];
+	.loc	1 45 31                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:31
+	mul.wide.s32 	%rd34, %r91, 2;
+	add.s64 	%rd10, %rd29, %rd34;
+	.loc	1 45 71                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:71
+	// begin inline asm
+	mov.u64 %rd11, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd11, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r26, %r5;
+	mov.u32 %r27, %r5;
+	mov.u32 %r28, %r5;
+	mov.u32 %r29, %r5;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd10 + 0 ], %rd11;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd12, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd12, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r30, %r5;
+	mov.u32 %r31, %r5;
+	mov.u32 %r32, %r5;
+	mov.u32 %r33, %r5;
+	@%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r30, %r31, %r32, %r33 }, [ %rd10 + 0 ], %rd12;
+	// end inline asm
+	.loc	1 54 52                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:52
+	add.s32 	%r298, %r106, -3145728;
+	.loc	1 54 45                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:45
+	add.s32 	%r299, %r104, %r298;
+	add.s32 	%r300, %r105, %r298;
+	.loc	1 54 31                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:31
+	mad.wide.s32 	%rd13, %r299, 2, %rd30;
+	mad.wide.s32 	%rd15, %r300, 2, %rd30;
+	.loc	1 54 83                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:83
+	add.s32 	%r301, %r78, -8192;
+	setp.lt.u32 	%p10, %r301, 65536;
+	and.pred 	%p3, %p6, %p10;
+	and.pred 	%p4, %p7, %p10;
+	.loc	1 54 67                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:67
+	// begin inline asm
+	mov.u64 %rd14, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd14, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r34, %r5;
+	mov.u32 %r35, %r5;
+	mov.u32 %r36, %r5;
+	mov.u32 %r37, %r5;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd13 + 0 ], %rd14;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd16, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd16, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r38, %r5;
+	mov.u32 %r39, %r5;
+	mov.u32 %r40, %r5;
+	mov.u32 %r41, %r5;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r38, %r39, %r40, %r41 }, [ %rd15 + 0 ], %rd16;
+	// end inline asm
+	prmt.b32 	%r302, %r34, %r38, 0x7632U;
+	prmt.b32 	%r303, %r35, %r39, 0x7632U;
+	prmt.b32 	%r304, %r36, %r40, 0x7632U;
+	prmt.b32 	%r305, %r37, %r41, 0x7632U;
+	.loc	1 54 134                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134
+	bar.sync 	0;
+	prmt.b32 	%r306, %r34, %r38, 0x5410U;
+	prmt.b32 	%r307, %r35, %r39, 0x5410U;
+	st.shared.v4.b32 	[%r130], {%r306, %r302, %r307, %r303};
+	prmt.b32 	%r308, %r36, %r40, 0x5410U;
+	prmt.b32 	%r309, %r37, %r41, 0x5410U;
+	st.shared.v4.b32 	[%r130+256], {%r308, %r304, %r309, %r305};
+	bar.sync 	0;
+	ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r310, %r311, %r312, %r313}, [%r144];
+	ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r314, %r315, %r316, %r317}, [%r144+512];
+	.loc	1 56 52                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:52
+	add.s32 	%r318, %r85, -8192;
+	.loc	1 56 31                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:31
+	mad.wide.s32 	%rd17, %r318, 4, %rd31;
+	.loc	1 56 90                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:56:90
+	// begin inline asm
+	mov.u64 %rd18, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd18, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r42, %r5;
+	mov.u32 %r43, %r5;
+	mov.u32 %r44, %r5;
+	mov.u32 %r45, %r5;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd17 + 0 ], %rd18;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd19, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd19, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r46, %r5;
+	mov.u32 %r47, %r5;
+	mov.u32 %r48, %r5;
+	mov.u32 %r49, %r5;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r46, %r47, %r48, %r49 }, [ %rd17 + 0 ], %rd19;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd20, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd20, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r50, %r5;
+	mov.u32 %r51, %r5;
+	mov.u32 %r52, %r5;
+	mov.u32 %r53, %r5;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r50, %r51, %r52, %r53 }, [ %rd17 + 0 ], %rd20;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd21, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd21, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r54, %r5;
+	mov.u32 %r55, %r5;
+	mov.u32 %r56, %r5;
+	mov.u32 %r57, %r5;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r54, %r55, %r56, %r57 }, [ %rd17 + 0 ], %rd21;
+	// end inline asm
+	.loc	1 58 21                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:58:21
+	div.full.f32 	%r319, %r42, %r169;
+	div.full.f32 	%r320, %r43, %r169;
+	div.full.f32 	%r321, %r44, %r169;
+	div.full.f32 	%r322, %r45, %r169;
+	div.full.f32 	%r323, %r46, %r169;
+	div.full.f32 	%r324, %r47, %r169;
+	div.full.f32 	%r325, %r48, %r169;
+	div.full.f32 	%r326, %r49, %r169;
+	div.full.f32 	%r327, %r50, %r169;
+	div.full.f32 	%r328, %r51, %r169;
+	div.full.f32 	%r329, %r52, %r169;
+	div.full.f32 	%r330, %r53, %r169;
+	div.full.f32 	%r331, %r54, %r169;
+	div.full.f32 	%r332, %r55, %r169;
+	div.full.f32 	%r333, %r56, %r169;
+	div.full.f32 	%r334, %r57, %r169;
+	.loc	1 60 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:60:20
+	add.f32 	%r335, %r319, 0f358637BD;
+	add.f32 	%r336, %r320, 0f358637BD;
+	add.f32 	%r337, %r321, 0f358637BD;
+	add.f32 	%r338, %r322, 0f358637BD;
+	add.f32 	%r339, %r323, 0f358637BD;
+	add.f32 	%r340, %r324, 0f358637BD;
+	add.f32 	%r341, %r325, 0f358637BD;
+	add.f32 	%r342, %r326, 0f358637BD;
+	add.f32 	%r343, %r327, 0f358637BD;
+	add.f32 	%r344, %r328, 0f358637BD;
+	add.f32 	%r345, %r329, 0f358637BD;
+	add.f32 	%r346, %r330, 0f358637BD;
+	add.f32 	%r347, %r331, 0f358637BD;
+	add.f32 	%r348, %r332, 0f358637BD;
+	add.f32 	%r349, %r333, 0f358637BD;
+	add.f32 	%r350, %r334, 0f358637BD;
+	.loc	1 61 28                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:61:28
+	rsqrt.approx.ftz.f32 	%r351, %r335;
+	rsqrt.approx.ftz.f32 	%r352, %r336;
+	rsqrt.approx.ftz.f32 	%r353, %r337;
+	rsqrt.approx.ftz.f32 	%r354, %r338;
+	rsqrt.approx.ftz.f32 	%r355, %r339;
+	rsqrt.approx.ftz.f32 	%r356, %r340;
+	rsqrt.approx.ftz.f32 	%r357, %r341;
+	rsqrt.approx.ftz.f32 	%r358, %r342;
+	rsqrt.approx.ftz.f32 	%r359, %r343;
+	rsqrt.approx.ftz.f32 	%r360, %r344;
+	rsqrt.approx.ftz.f32 	%r361, %r345;
+	rsqrt.approx.ftz.f32 	%r362, %r346;
+	rsqrt.approx.ftz.f32 	%r363, %r347;
+	rsqrt.approx.ftz.f32 	%r364, %r348;
+	rsqrt.approx.ftz.f32 	%r365, %r349;
+	rsqrt.approx.ftz.f32 	%r366, %r350;
+	.loc	1 54 134                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:54:134
+	mov.b32 	{%rs17, %rs18}, %r317;
+	cvt.f32.bf16 	%r367, %rs18;
+	cvt.f32.bf16 	%r368, %rs17;
+	mov.b32 	{%rs19, %rs20}, %r316;
+	cvt.f32.bf16 	%r369, %rs20;
+	cvt.f32.bf16 	%r370, %rs19;
+	mov.b32 	{%rs21, %rs22}, %r315;
+	cvt.f32.bf16 	%r371, %rs22;
+	cvt.f32.bf16 	%r372, %rs21;
+	mov.b32 	{%rs23, %rs24}, %r314;
+	cvt.f32.bf16 	%r373, %rs24;
+	cvt.f32.bf16 	%r374, %rs23;
+	mov.b32 	{%rs25, %rs26}, %r313;
+	cvt.f32.bf16 	%r375, %rs26;
+	cvt.f32.bf16 	%r376, %rs25;
+	mov.b32 	{%rs27, %rs28}, %r312;
+	cvt.f32.bf16 	%r377, %rs28;
+	cvt.f32.bf16 	%r378, %rs27;
+	mov.b32 	{%rs29, %rs30}, %r311;
+	cvt.f32.bf16 	%r379, %rs30;
+	cvt.f32.bf16 	%r380, %rs29;
+	mov.b32 	{%rs31, %rs32}, %r310;
+	cvt.f32.bf16 	%r381, %rs32;
+	cvt.f32.bf16 	%r382, %rs31;
+	.loc	1 23 21                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:23:21
+	setp.lt.s32 	%p11, %r83, 73728;
+	.loc	1 62 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:62:20
+	mul.f32 	%r383, %r351, %r382;
+	mul.f32 	%r384, %r352, %r381;
+	mul.f32 	%r385, %r353, %r380;
+	mul.f32 	%r386, %r354, %r379;
+	mul.f32 	%r387, %r355, %r378;
+	mul.f32 	%r388, %r356, %r377;
+	mul.f32 	%r389, %r357, %r376;
+	mul.f32 	%r390, %r358, %r375;
+	mul.f32 	%r391, %r359, %r374;
+	mul.f32 	%r392, %r360, %r373;
+	mul.f32 	%r393, %r361, %r372;
+	mul.f32 	%r394, %r362, %r371;
+	mul.f32 	%r395, %r363, %r370;
+	mul.f32 	%r396, %r364, %r369;
+	mul.f32 	%r397, %r365, %r368;
+	mul.f32 	%r398, %r366, %r367;
+	bar.sync 	0;
+	st.shared.b32 	[%r241], %r383;
+	st.shared.b32 	[%r241+128], %r385;
+	st.shared.b32 	[%r243+512], %r384;
+	st.shared.b32 	[%r243+640], %r386;
+	st.shared.b32 	[%r245], %r387;
+	st.shared.b32 	[%r245+128], %r389;
+	st.shared.b32 	[%r247+512], %r388;
+	st.shared.b32 	[%r247+640], %r390;
+	st.shared.b32 	[%r249], %r391;
+	st.shared.b32 	[%r249+128], %r393;
+	st.shared.b32 	[%r251+512], %r392;
+	st.shared.b32 	[%r251+640], %r394;
+	st.shared.b32 	[%r253], %r395;
+	st.shared.b32 	[%r253+128], %r397;
+	st.shared.b32 	[%r255+512], %r396;
+	st.shared.b32 	[%r255+640], %r398;
+	bar.sync 	0;
+	ld.shared.b32 	%r399, [%r267];
+	ld.shared.b32 	%r400, [%r267+256];
+	ld.shared.b32 	%r401, [%r271+4096];
+	ld.shared.b32 	%r402, [%r271+4352];
+	ld.shared.b32 	%r403, [%r275+8192];
+	ld.shared.b32 	%r404, [%r275+8448];
+	ld.shared.b32 	%r405, [%r279+12288];
+	ld.shared.b32 	%r406, [%r279+12544];
+	ld.shared.b32 	%r407, [%r283+2048];
+	ld.shared.b32 	%r408, [%r283+2304];
+	ld.shared.b32 	%r409, [%r287+6144];
+	ld.shared.b32 	%r410, [%r287+6400];
+	ld.shared.b32 	%r411, [%r291+10240];
+	ld.shared.b32 	%r412, [%r291+10496];
+	ld.shared.b32 	%r413, [%r295+14336];
+	ld.shared.b32 	%r414, [%r295+14592];
+	.loc	1 63 31                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:31
+	add.s64 	%rd22, %rd32, %rd34;
+	.loc	1 63 71                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:71
+	// begin inline asm
+	mov.u64 %rd23, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd23, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r58, %r5;
+	mov.u32 %r59, %r5;
+	mov.u32 %r60, %r5;
+	mov.u32 %r61, %r5;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r58, %r59, %r60, %r61 }, [ %rd22 + 0 ], %rd23;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd24, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd24, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r62, %r5;
+	mov.u32 %r63, %r5;
+	mov.u32 %r64, %r5;
+	mov.u32 %r65, %r5;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r62, %r63, %r64, %r65 }, [ %rd22 + 0 ], %rd24;
+	// end inline asm
+	.loc	1 70 34                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:34
+	shl.b32 	%r415, %r83, 7;
+	shl.b32 	%r416, %r84, 7;
+	.loc	1 70 30                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:30
+	add.s32 	%r417, %r415, %r91;
+	add.s32 	%r418, %r416, %r91;
+	.loc	1 70 25                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:25
+	mad.wide.s32 	%rd25, %r417, 2, %rd33;
+	mad.wide.s32 	%rd26, %r418, 2, %rd33;
+	.loc	1 70 54                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:54
+	and.pred 	%p5, %p6, %p11;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs33, %rs34}, %r26;
+	cvt.f32.bf16 	%r419, %rs33;
+	cvt.f32.bf16 	%r420, %rs34;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r421, %r272, %r420;
+	mul.f32 	%r422, %r268, %r419;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs35, %rs36}, %r58;
+	cvt.f32.bf16 	%r423, %rs35;
+	cvt.f32.bf16 	%r424, %rs36;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r425, %r401, %r424;
+	mul.f32 	%r426, %r399, %r423;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r427, %r422, %r426, %p8;
+	selp.f32 	%r428, %r421, %r425, %p8;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r66, %r428, %r427;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs37, %rs38}, %r27;
+	cvt.f32.bf16 	%r429, %rs37;
+	cvt.f32.bf16 	%r430, %rs38;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r431, %r280, %r430;
+	mul.f32 	%r432, %r276, %r429;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs39, %rs40}, %r59;
+	cvt.f32.bf16 	%r433, %rs39;
+	cvt.f32.bf16 	%r434, %rs40;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r435, %r405, %r434;
+	mul.f32 	%r436, %r403, %r433;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r437, %r432, %r436, %p8;
+	selp.f32 	%r438, %r431, %r435, %p8;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r67, %r438, %r437;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs41, %rs42}, %r28;
+	cvt.f32.bf16 	%r439, %rs41;
+	cvt.f32.bf16 	%r440, %rs42;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r441, %r273, %r440;
+	mul.f32 	%r442, %r269, %r439;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs43, %rs44}, %r60;
+	cvt.f32.bf16 	%r443, %rs43;
+	cvt.f32.bf16 	%r444, %rs44;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r445, %r402, %r444;
+	mul.f32 	%r446, %r400, %r443;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r447, %r442, %r446, %p8;
+	selp.f32 	%r448, %r441, %r445, %p8;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r68, %r448, %r447;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs45, %rs46}, %r29;
+	cvt.f32.bf16 	%r449, %rs45;
+	cvt.f32.bf16 	%r450, %rs46;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r451, %r281, %r450;
+	mul.f32 	%r452, %r277, %r449;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs47, %rs48}, %r61;
+	cvt.f32.bf16 	%r453, %rs47;
+	cvt.f32.bf16 	%r454, %rs48;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r455, %r406, %r454;
+	mul.f32 	%r456, %r404, %r453;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r457, %r452, %r456, %p8;
+	selp.f32 	%r458, %r451, %r455, %p8;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r69, %r458, %r457;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs49, %rs50}, %r30;
+	cvt.f32.bf16 	%r459, %rs49;
+	cvt.f32.bf16 	%r460, %rs50;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r461, %r288, %r460;
+	mul.f32 	%r462, %r284, %r459;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs51, %rs52}, %r62;
+	cvt.f32.bf16 	%r463, %rs51;
+	cvt.f32.bf16 	%r464, %rs52;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r465, %r409, %r464;
+	mul.f32 	%r466, %r407, %r463;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r467, %r462, %r466, %p8;
+	selp.f32 	%r468, %r461, %r465, %p8;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r70, %r468, %r467;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs53, %rs54}, %r31;
+	cvt.f32.bf16 	%r469, %rs53;
+	cvt.f32.bf16 	%r470, %rs54;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r471, %r296, %r470;
+	mul.f32 	%r472, %r292, %r469;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs55, %rs56}, %r63;
+	cvt.f32.bf16 	%r473, %rs55;
+	cvt.f32.bf16 	%r474, %rs56;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r475, %r413, %r474;
+	mul.f32 	%r476, %r411, %r473;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r477, %r472, %r476, %p8;
+	selp.f32 	%r478, %r471, %r475, %p8;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r71, %r478, %r477;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs57, %rs58}, %r32;
+	cvt.f32.bf16 	%r479, %rs57;
+	cvt.f32.bf16 	%r480, %rs58;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r481, %r289, %r480;
+	mul.f32 	%r482, %r285, %r479;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs59, %rs60}, %r64;
+	cvt.f32.bf16 	%r483, %rs59;
+	cvt.f32.bf16 	%r484, %rs60;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r485, %r410, %r484;
+	mul.f32 	%r486, %r408, %r483;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r487, %r482, %r486, %p8;
+	selp.f32 	%r488, %r481, %r485, %p8;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r72, %r488, %r487;
+	.loc	1 45 137                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:45:137
+	mov.b32 	{%rs61, %rs62}, %r33;
+	cvt.f32.bf16 	%r489, %rs61;
+	cvt.f32.bf16 	%r490, %rs62;
+	.loc	1 47 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:47:20
+	mul.f32 	%r491, %r297, %r490;
+	mul.f32 	%r492, %r293, %r489;
+	.loc	1 63 138                        // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:63:138
+	mov.b32 	{%rs63, %rs64}, %r65;
+	cvt.f32.bf16 	%r493, %rs63;
+	cvt.f32.bf16 	%r494, %rs64;
+	.loc	1 65 20                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:65:20
+	mul.f32 	%r495, %r414, %r494;
+	mul.f32 	%r496, %r412, %r493;
+	.loc	1 0 0                           // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:0
+	selp.f32 	%r497, %r492, %r496, %p8;
+	selp.f32 	%r498, %r491, %r495, %p8;
+	.loc	1 70 46                         // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:46
+	cvt.rn.bf16x2.f32 	%r73, %r498, %r497;
+	// begin inline asm
+	@%p5 st.global.v4.b32 [ %rd25 + 0 ], { %r66, %r67, %r68, %r69 };
+	// end inline asm
+	// begin inline asm
+	@%p5 st.global.v4.b32 [ %rd26 + 0 ], { %r70, %r71, %r72, %r73 };
+	// end inline asm
+	.loc	1 70 4                          // c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py:70:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 50
+.b8 104
+.b8 105
+.b8 106
+.b8 51
+.b8 104
+.b8 109
+.b8 108
+.b8 111
+.b8 117
+.b8 109
+.b8 120
+.b8 100
+.b8 109
+.b8 104
+.b8 117
+.b8 101
+.b8 122
+.b8 115
+.b8 121
+.b8 104
+.b8 107
+.b8 109
+.b8 110
+.b8 113
+.b8 103
+.b8 110
+.b8 102
+.b8 97
+.b8 53
+.b8 105
+.b8 118
+.b8 114
+.b8 101
+.b8 50
+.b8 55
+.b8 117
+.b8 111
+.b8 115
+.b8 121
+.b8 109
+.b8 97
+.b8 109
+.b8 51
+.b8 100
+.b8 114
+.b8 55
+.b8 97
+.b8 53
+.b8 120
+.b8 98
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 50
+.b8 104
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.source b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.source
new file mode 100644
index 0000000000000000000000000000000000000000..8267d145edfd1392500dd03aa3358250d2fb2971
--- /dev/null
+++ b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.source
@@ -0,0 +1,415 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0)
+#loc99 = loc("in_ptr0"(#loc))
+#loc100 = loc("in_ptr1"(#loc))
+#loc101 = loc("in_ptr2"(#loc))
+#loc102 = loc("in_ptr3"(#loc))
+#loc103 = loc("in_ptr4"(#loc))
+#loc104 = loc("in_ptr5"(#loc))
+#loc105 = loc("out_ptr0"(#loc))
+#loc106 = loc("ynumel"(#loc))
+#loc107 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %ynumel_0 = arith.constant 73728 : i32 loc(#loc108)
+    %xnumel_1 = arith.constant 128 : i32 loc(#loc109)
+    %yoffset = tt.get_program_id y : i32 loc(#loc110)
+    %yoffset_2 = tt.get_program_id z : i32 loc(#loc111)
+    %yoffset_3 = tt.get_num_programs y : i32 loc(#loc112)
+    %yoffset_4 = arith.muli %yoffset_2, %yoffset_3 : i32 loc(#loc113)
+    %yoffset_5 = arith.addi %yoffset, %yoffset_4 : i32 loc(#loc114)
+    %yoffset_6 = arith.constant 32 : i32 loc(#loc115)
+    %yoffset_7 = arith.constant 32 : i32 loc(#loc115)
+    %yoffset_8 = arith.muli %yoffset_5, %yoffset_7 : i32 loc(#loc115)
+    %yindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc116)
+    %yindex_9 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc117)
+    %yindex_10 = tt.splat %yoffset_8 : i32 -> tensor<32x1xi32> loc(#loc118)
+    %yindex_11 = arith.addi %yindex_10, %yindex_9 : tensor<32x1xi32> loc(#loc118)
+    %ymask = arith.constant dense<73728> : tensor<32x1xi32> loc(#loc119)
+    %ymask_12 = arith.cmpi slt, %yindex_11, %ymask : tensor<32x1xi32> loc(#loc119)
+    %xoffset = tt.get_program_id x : i32 loc(#loc120)
+    %xoffset_13 = arith.constant 128 : i32 loc(#loc121)
+    %xoffset_14 = arith.constant 128 : i32 loc(#loc121)
+    %xoffset_15 = arith.muli %xoffset, %xoffset_14 : i32 loc(#loc121)
+    %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc122)
+    %xindex_16 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc123)
+    %xindex_17 = tt.splat %xoffset_15 : i32 -> tensor<1x128xi32> loc(#loc124)
+    %xindex_18 = arith.addi %xindex_17, %xindex_16 : tensor<1x128xi32> loc(#loc124)
+    %xmask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc125)
+    %xmask_19 = arith.cmpi slt, %xindex_18, %xmask : tensor<1x128xi32> loc(#loc125)
+    %y1 = arith.constant 32 : i32 loc(#loc126)
+    %y1_20 = arith.constant 32 : i32 loc(#loc126)
+    %y1_21 = arith.constant dense<32> : tensor<32x1xi32> loc(#loc126)
+    %y1_22 = arith.divsi %yindex_11, %y1_21 : tensor<32x1xi32> loc(#loc126)
+    %y0 = arith.constant 32 : i32 loc(#loc127)
+    %y0_23 = arith.constant 32 : i32 loc(#loc127)
+    %y0_24 = arith.constant dense<32> : tensor<32x1xi32> loc(#loc127)
+    %y0_25 = arith.remsi %yindex_11, %y0_24 : tensor<32x1xi32> loc(#loc127)
+    %tmp1 = arith.constant 0 : i64 loc(#loc128)
+    %tmp1_26 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc128)
+    %tmp2 = arith.extsi %y1_22 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc129)
+    %tmp2_27 = arith.constant dense<0> : tensor<32x1xi64> loc(#loc129)
+    %tmp2_28 = arith.cmpi sge, %tmp2, %tmp2_27 : tensor<32x1xi64> loc(#loc129)
+    %tmp3 = arith.constant 256 : i64 loc(#loc130)
+    %tmp3_29 = arith.constant dense<256> : tensor<1x1xi64> loc(#loc130)
+    %tmp4 = arith.extsi %y1_22 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc131)
+    %tmp4_30 = arith.constant dense<256> : tensor<32x1xi64> loc(#loc131)
+    %tmp4_31 = arith.cmpi slt, %tmp4, %tmp4_30 : tensor<32x1xi64> loc(#loc131)
+    %tmp5 = arith.constant 128 : i32 loc(#loc132)
+    %tmp5_32 = arith.constant 128 : i32 loc(#loc132)
+    %tmp5_33 = arith.constant dense<128> : tensor<32x1xi32> loc(#loc132)
+    %tmp5_34 = arith.muli %tmp5_33, %y0_25 : tensor<32x1xi32> loc(#loc132)
+    %tmp5_35 = tt.broadcast %xindex_18 : tensor<1x128xi32> -> tensor<32x128xi32> loc(#loc133)
+    %tmp5_36 = tt.broadcast %tmp5_34 : tensor<32x1xi32> -> tensor<32x128xi32> loc(#loc133)
+    %tmp5_37 = arith.addi %tmp5_35, %tmp5_36 : tensor<32x128xi32> loc(#loc133)
+    %tmp5_38 = arith.constant 12288 : i32 loc(#loc134)
+    %tmp5_39 = arith.constant 12288 : i32 loc(#loc134)
+    %tmp5_40 = arith.constant dense<12288> : tensor<32x1xi32> loc(#loc134)
+    %tmp5_41 = arith.muli %tmp5_40, %y1_22 : tensor<32x1xi32> loc(#loc134)
+    %tmp5_42 = tt.broadcast %tmp5_41 : tensor<32x1xi32> -> tensor<32x128xi32> loc(#loc135)
+    %tmp5_43 = arith.addi %tmp5_37, %tmp5_42 : tensor<32x128xi32> loc(#loc135)
+    %tmp5_44 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<32x128x!tt.ptr<bf16>> loc(#loc136)
+    %tmp5_45 = tt.addptr %tmp5_44, %tmp5_43 : tensor<32x128x!tt.ptr<bf16>>, tensor<32x128xi32> loc(#loc136)
+    %tmp5_46 = tt.broadcast %tmp4_31 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc137)
+    %tmp5_47 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<32x128xi1> loc(#loc137)
+    %tmp5_48 = arith.andi %tmp5_46, %tmp5_47 : tensor<32x128xi1> loc(#loc137)
+    %tmp5_49 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc138)
+    %tmp5_50 = arith.andi %tmp5_48, %tmp5_49 : tensor<32x128xi1> loc(#loc138)
+    %tmp5_51 = arith.constant 0.000000e+00 : f32 loc(#loc139)
+    %tmp5_52 = arith.constant dense<0.000000e+00> : tensor<32x128xf32> loc(#loc139)
+    %tmp5_53 = arith.truncf %tmp5_52 : tensor<32x128xf32> to tensor<32x128xbf16> loc(#loc139)
+    %tmp5_54 = tt.load %tmp5_45, %tmp5_50, %tmp5_53 evictionPolicy = evict_last : tensor<32x128x!tt.ptr<bf16>> loc(#loc139)
+    %tmp5_55 = arith.extf %tmp5_54 : tensor<32x128xbf16> to tensor<32x128xf32> loc(#loc140)
+    %tmp7 = arith.constant 32 : i32 loc(#loc141)
+    %tmp7_56 = arith.constant 32 : i32 loc(#loc141)
+    %tmp7_57 = arith.constant dense<32> : tensor<32x1xi32> loc(#loc141)
+    %tmp7_58 = arith.muli %tmp7_57, %y1_22 : tensor<32x1xi32> loc(#loc141)
+    %tmp7_59 = arith.addi %y0_25, %tmp7_58 : tensor<32x1xi32> loc(#loc142)
+    %tmp7_60 = tt.broadcast %tmp7_59 : tensor<32x1xi32> -> tensor<32x128xi32> loc(#loc143)
+    %tmp7_61 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<32x128x!tt.ptr<f32>> loc(#loc144)
+    %tmp7_62 = tt.addptr %tmp7_61, %tmp7_60 : tensor<32x128x!tt.ptr<f32>>, tensor<32x128xi32> loc(#loc144)
+    %tmp7_63 = tt.broadcast %tmp4_31 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc145)
+    %tmp7_64 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<32x128xi1> loc(#loc145)
+    %tmp7_65 = arith.andi %tmp7_63, %tmp7_64 : tensor<32x128xi1> loc(#loc145)
+    %tmp7_66 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc146)
+    %tmp7_67 = arith.andi %tmp7_65, %tmp7_66 : tensor<32x128xi1> loc(#loc146)
+    %tmp7_68 = arith.constant 0.000000e+00 : f32 loc(#loc147)
+    %tmp7_69 = arith.constant dense<0.000000e+00> : tensor<32x128xf32> loc(#loc147)
+    %tmp7_70 = tt.load %tmp7_62, %tmp7_67, %tmp7_69 evictionPolicy = evict_last : tensor<32x128x!tt.ptr<f32>> loc(#loc147)
+    %tmp8 = arith.constant 1.280000e+02 : f32 loc(#loc148)
+    %tmp9 = arith.constant dense<1.280000e+02> : tensor<32x128xf32> loc(#loc149)
+    %tmp9_71 = arith.divf %tmp7_70, %tmp9 : tensor<32x128xf32> loc(#loc149)
+    %tmp10 = arith.constant 9.99999997E-7 : f32 loc(#loc150)
+    %tmp11 = arith.constant dense<9.99999997E-7> : tensor<32x128xf32> loc(#loc151)
+    %tmp11_72 = arith.addf %tmp9_71, %tmp11 : tensor<32x128xf32> loc(#loc151)
+    %tmp12 = tt.extern_elementwise %tmp11_72 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<32x128xf32>) -> tensor<32x128xf32> loc(#loc152)
+    %tmp13 = arith.mulf %tmp5_55, %tmp12 : tensor<32x128xf32> loc(#loc153)
+    %tmp14 = tt.broadcast %xindex_18 : tensor<1x128xi32> -> tensor<32x128xi32> loc(#loc154)
+    %tmp14_73 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<32x128x!tt.ptr<bf16>> loc(#loc155)
+    %tmp14_74 = tt.addptr %tmp14_73, %tmp14 : tensor<32x128x!tt.ptr<bf16>>, tensor<32x128xi32> loc(#loc155)
+    %tmp14_75 = tt.broadcast %tmp4_31 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc156)
+    %tmp14_76 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<32x128xi1> loc(#loc156)
+    %tmp14_77 = arith.andi %tmp14_75, %tmp14_76 : tensor<32x128xi1> loc(#loc156)
+    %tmp14_78 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc157)
+    %tmp14_79 = arith.andi %tmp14_77, %tmp14_78 : tensor<32x128xi1> loc(#loc157)
+    %tmp14_80 = arith.constant 0.000000e+00 : f32 loc(#loc158)
+    %tmp14_81 = arith.constant dense<0.000000e+00> : tensor<32x128xf32> loc(#loc158)
+    %tmp14_82 = arith.truncf %tmp14_81 : tensor<32x128xf32> to tensor<32x128xbf16> loc(#loc158)
+    %tmp14_83 = tt.load %tmp14_74, %tmp14_79, %tmp14_82 evictionPolicy = evict_last : tensor<32x128x!tt.ptr<bf16>> loc(#loc158)
+    %tmp14_84 = arith.extf %tmp14_83 : tensor<32x128xbf16> to tensor<32x128xf32> loc(#loc159)
+    %tmp16 = arith.mulf %tmp13, %tmp14_84 : tensor<32x128xf32> loc(#loc160)
+    %tmp18 = arith.constant 0.000000e+00 : f32 loc(#loc161)
+    %tmp18_85 = arith.constant dense<0.000000e+00> : tensor<32x128xf32> loc(#loc161)
+    %tmp19 = tt.broadcast %tmp4_31 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc162)
+    %tmp19_86 = arith.select %tmp19, %tmp16, %tmp18_85 : tensor<32x128xi1>, tensor<32x128xf32> loc(#loc162)
+    %tmp20 = arith.extsi %y1_22 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc163)
+    %tmp20_87 = arith.constant dense<256> : tensor<32x1xi64> loc(#loc163)
+    %tmp20_88 = arith.cmpi sge, %tmp20, %tmp20_87 : tensor<32x1xi64> loc(#loc163)
+    %tmp21 = arith.constant 2304 : i64 loc(#loc164)
+    %tmp21_89 = arith.constant dense<2304> : tensor<1x1xi64> loc(#loc164)
+    %tmp22 = arith.extsi %y1_22 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc165)
+    %tmp22_90 = arith.constant dense<2304> : tensor<32x1xi64> loc(#loc165)
+    %tmp22_91 = arith.cmpi slt, %tmp22, %tmp22_90 : tensor<32x1xi64> loc(#loc165)
+    %tmp23 = arith.constant 128 : i32 loc(#loc166)
+    %tmp23_92 = arith.constant 128 : i32 loc(#loc166)
+    %tmp23_93 = arith.constant dense<128> : tensor<32x1xi32> loc(#loc166)
+    %tmp23_94 = arith.muli %tmp23_93, %y0_25 : tensor<32x1xi32> loc(#loc166)
+    %tmp23_95 = tt.broadcast %xindex_18 : tensor<1x128xi32> -> tensor<32x128xi32> loc(#loc167)
+    %tmp23_96 = tt.broadcast %tmp23_94 : tensor<32x1xi32> -> tensor<32x128xi32> loc(#loc167)
+    %tmp23_97 = arith.addi %tmp23_95, %tmp23_96 : tensor<32x128xi32> loc(#loc167)
+    %tmp23_98 = arith.constant -256 : i32 loc(#loc168)
+    %tmp23_99 = arith.constant -256 : i32 loc(#loc168)
+    %tmp23_100 = arith.constant dense<-256> : tensor<32x1xi32> loc(#loc168)
+    %tmp23_101 = arith.addi %tmp23_100, %y1_22 : tensor<32x1xi32> loc(#loc168)
+    %tmp23_102 = arith.constant 12288 : i32 loc(#loc169)
+    %tmp23_103 = arith.constant 12288 : i32 loc(#loc169)
+    %tmp23_104 = arith.constant dense<12288> : tensor<32x1xi32> loc(#loc169)
+    %tmp23_105 = arith.muli %tmp23_104, %tmp23_101 : tensor<32x1xi32> loc(#loc169)
+    %tmp23_106 = tt.broadcast %tmp23_105 : tensor<32x1xi32> -> tensor<32x128xi32> loc(#loc170)
+    %tmp23_107 = arith.addi %tmp23_97, %tmp23_106 : tensor<32x128xi32> loc(#loc170)
+    %tmp23_108 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<32x128x!tt.ptr<bf16>> loc(#loc171)
+    %tmp23_109 = tt.addptr %tmp23_108, %tmp23_107 : tensor<32x128x!tt.ptr<bf16>>, tensor<32x128xi32> loc(#loc171)
+    %tmp23_110 = tt.broadcast %tmp20_88 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc172)
+    %tmp23_111 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<32x128xi1> loc(#loc172)
+    %tmp23_112 = arith.andi %tmp23_110, %tmp23_111 : tensor<32x128xi1> loc(#loc172)
+    %tmp23_113 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc173)
+    %tmp23_114 = arith.andi %tmp23_112, %tmp23_113 : tensor<32x128xi1> loc(#loc173)
+    %tmp23_115 = arith.constant 0.000000e+00 : f32 loc(#loc174)
+    %tmp23_116 = arith.constant dense<0.000000e+00> : tensor<32x128xf32> loc(#loc174)
+    %tmp23_117 = arith.truncf %tmp23_116 : tensor<32x128xf32> to tensor<32x128xbf16> loc(#loc174)
+    %tmp23_118 = tt.load %tmp23_109, %tmp23_114, %tmp23_117 evictionPolicy = evict_last : tensor<32x128x!tt.ptr<bf16>> loc(#loc174)
+    %tmp23_119 = arith.extf %tmp23_118 : tensor<32x128xbf16> to tensor<32x128xf32> loc(#loc175)
+    %tmp25 = arith.constant -256 : i32 loc(#loc176)
+    %tmp25_120 = arith.constant -256 : i32 loc(#loc176)
+    %tmp25_121 = arith.constant dense<-256> : tensor<32x1xi32> loc(#loc176)
+    %tmp25_122 = arith.addi %tmp25_121, %y1_22 : tensor<32x1xi32> loc(#loc176)
+    %tmp25_123 = arith.constant 32 : i32 loc(#loc177)
+    %tmp25_124 = arith.constant 32 : i32 loc(#loc177)
+    %tmp25_125 = arith.constant dense<32> : tensor<32x1xi32> loc(#loc177)
+    %tmp25_126 = arith.muli %tmp25_125, %tmp25_122 : tensor<32x1xi32> loc(#loc177)
+    %tmp25_127 = arith.addi %y0_25, %tmp25_126 : tensor<32x1xi32> loc(#loc178)
+    %tmp25_128 = tt.broadcast %tmp25_127 : tensor<32x1xi32> -> tensor<32x128xi32> loc(#loc179)
+    %tmp25_129 = tt.splat %in_ptr4 : !tt.ptr<f32> -> tensor<32x128x!tt.ptr<f32>> loc(#loc180)
+    %tmp25_130 = tt.addptr %tmp25_129, %tmp25_128 : tensor<32x128x!tt.ptr<f32>>, tensor<32x128xi32> loc(#loc180)
+    %tmp25_131 = tt.broadcast %tmp20_88 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc181)
+    %tmp25_132 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<32x128xi1> loc(#loc181)
+    %tmp25_133 = arith.andi %tmp25_131, %tmp25_132 : tensor<32x128xi1> loc(#loc181)
+    %tmp25_134 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc182)
+    %tmp25_135 = arith.andi %tmp25_133, %tmp25_134 : tensor<32x128xi1> loc(#loc182)
+    %tmp25_136 = arith.constant 0.000000e+00 : f32 loc(#loc183)
+    %tmp25_137 = arith.constant dense<0.000000e+00> : tensor<32x128xf32> loc(#loc183)
+    %tmp25_138 = tt.load %tmp25_130, %tmp25_135, %tmp25_137 evictionPolicy = evict_last : tensor<32x128x!tt.ptr<f32>> loc(#loc183)
+    %tmp26 = arith.constant 1.280000e+02 : f32 loc(#loc184)
+    %tmp27 = arith.constant dense<1.280000e+02> : tensor<32x128xf32> loc(#loc185)
+    %tmp27_139 = arith.divf %tmp25_138, %tmp27 : tensor<32x128xf32> loc(#loc185)
+    %tmp28 = arith.constant 9.99999997E-7 : f32 loc(#loc186)
+    %tmp29 = arith.constant dense<9.99999997E-7> : tensor<32x128xf32> loc(#loc187)
+    %tmp29_140 = arith.addf %tmp27_139, %tmp29 : tensor<32x128xf32> loc(#loc187)
+    %tmp30 = tt.extern_elementwise %tmp29_140 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<32x128xf32>) -> tensor<32x128xf32> loc(#loc188)
+    %tmp31 = arith.mulf %tmp23_119, %tmp30 : tensor<32x128xf32> loc(#loc189)
+    %tmp32 = tt.broadcast %xindex_18 : tensor<1x128xi32> -> tensor<32x128xi32> loc(#loc190)
+    %tmp32_141 = tt.splat %in_ptr5 : !tt.ptr<bf16> -> tensor<32x128x!tt.ptr<bf16>> loc(#loc191)
+    %tmp32_142 = tt.addptr %tmp32_141, %tmp32 : tensor<32x128x!tt.ptr<bf16>>, tensor<32x128xi32> loc(#loc191)
+    %tmp32_143 = tt.broadcast %tmp20_88 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc192)
+    %tmp32_144 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<32x128xi1> loc(#loc192)
+    %tmp32_145 = arith.andi %tmp32_143, %tmp32_144 : tensor<32x128xi1> loc(#loc192)
+    %tmp32_146 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc193)
+    %tmp32_147 = arith.andi %tmp32_145, %tmp32_146 : tensor<32x128xi1> loc(#loc193)
+    %tmp32_148 = arith.constant 0.000000e+00 : f32 loc(#loc194)
+    %tmp32_149 = arith.constant dense<0.000000e+00> : tensor<32x128xf32> loc(#loc194)
+    %tmp32_150 = arith.truncf %tmp32_149 : tensor<32x128xf32> to tensor<32x128xbf16> loc(#loc194)
+    %tmp32_151 = tt.load %tmp32_142, %tmp32_147, %tmp32_150 evictionPolicy = evict_last : tensor<32x128x!tt.ptr<bf16>> loc(#loc194)
+    %tmp32_152 = arith.extf %tmp32_151 : tensor<32x128xbf16> to tensor<32x128xf32> loc(#loc195)
+    %tmp34 = arith.mulf %tmp31, %tmp32_152 : tensor<32x128xf32> loc(#loc196)
+    %tmp36 = arith.constant 0.000000e+00 : f32 loc(#loc197)
+    %tmp36_153 = arith.constant dense<0.000000e+00> : tensor<32x128xf32> loc(#loc197)
+    %tmp37 = tt.broadcast %tmp20_88 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc198)
+    %tmp37_154 = arith.select %tmp37, %tmp34, %tmp36_153 : tensor<32x128xi1>, tensor<32x128xf32> loc(#loc198)
+    %tmp38 = tt.broadcast %tmp4_31 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc199)
+    %tmp38_155 = arith.select %tmp38, %tmp19_86, %tmp37_154 : tensor<32x128xi1>, tensor<32x128xf32> loc(#loc199)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc93)
+    %c128_i32_156 = arith.constant 128 : i32 loc(#loc93)
+    %cst = arith.constant dense<128> : tensor<32x1xi32> loc(#loc93)
+    %0 = arith.muli %cst, %yindex_11 : tensor<32x1xi32> loc(#loc93)
+    %1 = tt.broadcast %xindex_18 : tensor<1x128xi32> -> tensor<32x128xi32> loc(#loc94)
+    %2 = tt.broadcast %0 : tensor<32x1xi32> -> tensor<32x128xi32> loc(#loc94)
+    %3 = arith.addi %1, %2 : tensor<32x128xi32> loc(#loc94)
+    %4 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<32x128x!tt.ptr<bf16>> loc(#loc95)
+    %5 = tt.addptr %4, %3 : tensor<32x128x!tt.ptr<bf16>>, tensor<32x128xi32> loc(#loc95)
+    %6 = tt.broadcast %xmask_19 : tensor<1x128xi1> -> tensor<32x128xi1> loc(#loc96)
+    %7 = tt.broadcast %ymask_12 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc96)
+    %8 = arith.andi %6, %7 : tensor<32x128xi1> loc(#loc96)
+    %9 = arith.truncf %tmp38_155 : tensor<32x128xf32> to tensor<32x128xbf16> loc(#loc97)
+    tt.store %5, %9, %8 : tensor<32x128x!tt.ptr<bf16>> loc(#loc97)
+    tt.return loc(#loc98)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":20:13)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:36)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:36)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":32:30)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":33:19)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":34:32)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:60)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:87)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:95)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":39:11)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":41:12)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:51)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:78)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:86)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":49:38)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":52:34)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":53:19)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:40)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:36)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:65)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:70)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:98)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:106)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":57:12)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":59:12)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:51)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:79)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:87)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":67:38)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4)
+#loc108 = loc("ynumel"(#loc1))
+#loc109 = loc("xnumel"(#loc2))
+#loc110 = loc("yoffset"(#loc3))
+#loc111 = loc("yoffset"(#loc4))
+#loc112 = loc("yoffset"(#loc5))
+#loc113 = loc("yoffset"(#loc6))
+#loc114 = loc("yoffset"(#loc7))
+#loc115 = loc("yoffset"(#loc8))
+#loc116 = loc("yindex"(#loc9))
+#loc117 = loc("yindex"(#loc10))
+#loc118 = loc("yindex"(#loc11))
+#loc119 = loc("ymask"(#loc12))
+#loc120 = loc("xoffset"(#loc13))
+#loc121 = loc("xoffset"(#loc14))
+#loc122 = loc("xindex"(#loc15))
+#loc123 = loc("xindex"(#loc16))
+#loc124 = loc("xindex"(#loc17))
+#loc125 = loc("xmask"(#loc18))
+#loc126 = loc("y1"(#loc19))
+#loc127 = loc("y0"(#loc20))
+#loc128 = loc("tmp1"(#loc21))
+#loc129 = loc("tmp2"(#loc22))
+#loc130 = loc("tmp3"(#loc23))
+#loc131 = loc("tmp4"(#loc24))
+#loc132 = loc("tmp5"(#loc25))
+#loc133 = loc("tmp5"(#loc26))
+#loc134 = loc("tmp5"(#loc27))
+#loc135 = loc("tmp5"(#loc28))
+#loc136 = loc("tmp5"(#loc29))
+#loc137 = loc("tmp5"(#loc30))
+#loc138 = loc("tmp5"(#loc31))
+#loc139 = loc("tmp5"(#loc32))
+#loc140 = loc("tmp5"(#loc33))
+#loc141 = loc("tmp7"(#loc34))
+#loc142 = loc("tmp7"(#loc35))
+#loc143 = loc("tmp7"(#loc36))
+#loc144 = loc("tmp7"(#loc37))
+#loc145 = loc("tmp7"(#loc38))
+#loc146 = loc("tmp7"(#loc39))
+#loc147 = loc("tmp7"(#loc40))
+#loc148 = loc("tmp8"(#loc41))
+#loc149 = loc("tmp9"(#loc42))
+#loc150 = loc("tmp10"(#loc43))
+#loc151 = loc("tmp11"(#loc44))
+#loc152 = loc("tmp12"(#loc45))
+#loc153 = loc("tmp13"(#loc46))
+#loc154 = loc("tmp14"(#loc47))
+#loc155 = loc("tmp14"(#loc48))
+#loc156 = loc("tmp14"(#loc49))
+#loc157 = loc("tmp14"(#loc50))
+#loc158 = loc("tmp14"(#loc51))
+#loc159 = loc("tmp14"(#loc52))
+#loc160 = loc("tmp16"(#loc53))
+#loc161 = loc("tmp18"(#loc54))
+#loc162 = loc("tmp19"(#loc55))
+#loc163 = loc("tmp20"(#loc56))
+#loc164 = loc("tmp21"(#loc57))
+#loc165 = loc("tmp22"(#loc58))
+#loc166 = loc("tmp23"(#loc59))
+#loc167 = loc("tmp23"(#loc60))
+#loc168 = loc("tmp23"(#loc61))
+#loc169 = loc("tmp23"(#loc62))
+#loc170 = loc("tmp23"(#loc63))
+#loc171 = loc("tmp23"(#loc64))
+#loc172 = loc("tmp23"(#loc65))
+#loc173 = loc("tmp23"(#loc66))
+#loc174 = loc("tmp23"(#loc67))
+#loc175 = loc("tmp23"(#loc68))
+#loc176 = loc("tmp25"(#loc69))
+#loc177 = loc("tmp25"(#loc70))
+#loc178 = loc("tmp25"(#loc71))
+#loc179 = loc("tmp25"(#loc72))
+#loc180 = loc("tmp25"(#loc73))
+#loc181 = loc("tmp25"(#loc74))
+#loc182 = loc("tmp25"(#loc75))
+#loc183 = loc("tmp25"(#loc76))
+#loc184 = loc("tmp26"(#loc77))
+#loc185 = loc("tmp27"(#loc78))
+#loc186 = loc("tmp28"(#loc79))
+#loc187 = loc("tmp29"(#loc80))
+#loc188 = loc("tmp30"(#loc81))
+#loc189 = loc("tmp31"(#loc82))
+#loc190 = loc("tmp32"(#loc83))
+#loc191 = loc("tmp32"(#loc84))
+#loc192 = loc("tmp32"(#loc85))
+#loc193 = loc("tmp32"(#loc86))
+#loc194 = loc("tmp32"(#loc87))
+#loc195 = loc("tmp32"(#loc88))
+#loc196 = loc("tmp34"(#loc89))
+#loc197 = loc("tmp36"(#loc90))
+#loc198 = loc("tmp37"(#loc91))
+#loc199 = loc("tmp38"(#loc92))
diff --git a/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..4d241fbddba3123622164ff35df2e35ea9a08d49
--- /dev/null
+++ b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttgir
@@ -0,0 +1,288 @@
+#blocked = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 8], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0)
+#loc70 = loc("in_ptr0"(#loc))
+#loc71 = loc("in_ptr1"(#loc))
+#loc72 = loc("in_ptr2"(#loc))
+#loc73 = loc("in_ptr3"(#loc))
+#loc74 = loc("in_ptr4"(#loc))
+#loc75 = loc("in_ptr5"(#loc))
+#loc76 = loc("out_ptr0"(#loc))
+#loc77 = loc("ynumel"(#loc))
+#loc78 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<-256> : tensor<32x1xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<-256> : tensor<32x1xi32, #blocked1> loc(#loc1)
+    %cst_1 = arith.constant dense<12288> : tensor<32x1xi32, #blocked1> loc(#loc1)
+    %cst_2 = arith.constant dense<128> : tensor<32x1xi32, #blocked1> loc(#loc1)
+    %cst_3 = arith.constant dense<256> : tensor<32x1xi64, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<256> : tensor<32x1xi64, #blocked1> loc(#loc1)
+    %cst_5 = arith.constant dense<32> : tensor<32x1xi32, #blocked> loc(#loc1)
+    %cst_6 = arith.constant dense<32> : tensor<32x1xi32, #blocked1> loc(#loc1)
+    %cst_7 = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1)
+    %cst_8 = arith.constant dense<128> : tensor<1x128xi32, #blocked1> loc(#loc1)
+    %cst_9 = arith.constant dense<73728> : tensor<32x1xi32, #blocked> loc(#loc1)
+    %cst_10 = arith.constant dense<73728> : tensor<32x1xi32, #blocked1> loc(#loc1)
+    %c32_i32 = arith.constant 32 : i32 loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %cst_11 = arith.constant dense<0.000000e+00> : tensor<32x128xbf16, #blocked1> loc(#loc1)
+    %cst_12 = arith.constant dense<0.000000e+00> : tensor<32x128xf32, #blocked> loc(#loc1)
+    %cst_13 = arith.constant dense<9.99999997E-7> : tensor<32x128xf32, #blocked> loc(#loc1)
+    %cst_14 = arith.constant dense<1.280000e+02> : tensor<32x128xf32, #blocked> loc(#loc1)
+    %cst_15 = arith.constant dense<0.000000e+00> : tensor<32x128xf32, #blocked1> loc(#loc1)
+    %yoffset = tt.get_program_id y : i32 loc(#loc79)
+    %yoffset_16 = tt.get_program_id z : i32 loc(#loc80)
+    %yoffset_17 = tt.get_num_programs y : i32 loc(#loc81)
+    %yoffset_18 = arith.muli %yoffset_16, %yoffset_17 : i32 loc(#loc82)
+    %yoffset_19 = arith.addi %yoffset, %yoffset_18 : i32 loc(#loc83)
+    %yoffset_20 = arith.muli %yoffset_19, %c32_i32 : i32 loc(#loc84)
+    %yindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc85)
+    %yindex_21 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc85)
+    %yindex_22 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1xi32, #blocked1> loc(#loc85)
+    %yindex_23 = tt.expand_dims %yindex_21 {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> loc(#loc85)
+    %yindex_24 = tt.splat %yoffset_20 : i32 -> tensor<32x1xi32, #blocked1> loc(#loc86)
+    %yindex_25 = tt.splat %yoffset_20 : i32 -> tensor<32x1xi32, #blocked> loc(#loc86)
+    %yindex_26 = arith.addi %yindex_24, %yindex_22 : tensor<32x1xi32, #blocked1> loc(#loc86)
+    %yindex_27 = arith.addi %yindex_25, %yindex_23 : tensor<32x1xi32, #blocked> loc(#loc86)
+    %ymask = arith.cmpi slt, %yindex_26, %cst_10 : tensor<32x1xi32, #blocked1> loc(#loc87)
+    %ymask_28 = arith.cmpi slt, %yindex_27, %cst_9 : tensor<32x1xi32, #blocked> loc(#loc87)
+    %xoffset = tt.get_program_id x : i32 loc(#loc88)
+    %xoffset_29 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc89)
+    %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc90)
+    %xindex_30 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc90)
+    %xindex_31 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x128xi32, #blocked1> loc(#loc90)
+    %xindex_32 = tt.expand_dims %xindex_30 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc90)
+    %xindex_33 = tt.splat %xoffset_29 : i32 -> tensor<1x128xi32, #blocked1> loc(#loc91)
+    %xindex_34 = tt.splat %xoffset_29 : i32 -> tensor<1x128xi32, #blocked> loc(#loc91)
+    %xindex_35 = arith.addi %xindex_33, %xindex_31 : tensor<1x128xi32, #blocked1> loc(#loc91)
+    %xindex_36 = arith.addi %xindex_34, %xindex_32 : tensor<1x128xi32, #blocked> loc(#loc91)
+    %xmask = arith.cmpi slt, %xindex_35, %cst_8 : tensor<1x128xi32, #blocked1> loc(#loc92)
+    %xmask_37 = arith.cmpi slt, %xindex_36, %cst_7 : tensor<1x128xi32, #blocked> loc(#loc92)
+    %y1 = arith.divsi %yindex_26, %cst_6 : tensor<32x1xi32, #blocked1> loc(#loc93)
+    %y1_38 = arith.divsi %yindex_27, %cst_5 : tensor<32x1xi32, #blocked> loc(#loc93)
+    %y0 = arith.remsi %yindex_26, %cst_6 : tensor<32x1xi32, #blocked1> loc(#loc94)
+    %y0_39 = arith.remsi %yindex_27, %cst_5 : tensor<32x1xi32, #blocked> loc(#loc94)
+    %tmp4 = arith.extsi %y1 : tensor<32x1xi32, #blocked1> to tensor<32x1xi64, #blocked1> loc(#loc95)
+    %tmp4_40 = arith.extsi %y1_38 : tensor<32x1xi32, #blocked> to tensor<32x1xi64, #blocked> loc(#loc95)
+    %tmp4_41 = arith.cmpi slt, %tmp4, %cst_4 : tensor<32x1xi64, #blocked1> loc(#loc95)
+    %tmp4_42 = arith.cmpi slt, %tmp4_40, %cst_3 : tensor<32x1xi64, #blocked> loc(#loc95)
+    %tmp5 = arith.muli %y0, %cst_2 : tensor<32x1xi32, #blocked1> loc(#loc96)
+    %tmp5_43 = tt.broadcast %xindex_35 : tensor<1x128xi32, #blocked1> -> tensor<32x128xi32, #blocked1> loc(#loc97)
+    %tmp5_44 = tt.broadcast %tmp5 : tensor<32x1xi32, #blocked1> -> tensor<32x128xi32, #blocked1> loc(#loc97)
+    %tmp5_45 = arith.addi %tmp5_43, %tmp5_44 : tensor<32x128xi32, #blocked1> loc(#loc97)
+    %tmp5_46 = arith.muli %y1, %cst_1 : tensor<32x1xi32, #blocked1> loc(#loc98)
+    %tmp5_47 = tt.broadcast %tmp5_46 : tensor<32x1xi32, #blocked1> -> tensor<32x128xi32, #blocked1> loc(#loc99)
+    %tmp5_48 = arith.addi %tmp5_45, %tmp5_47 : tensor<32x128xi32, #blocked1> loc(#loc99)
+    %tmp5_49 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<32x128x!tt.ptr<bf16>, #blocked1> loc(#loc100)
+    %tmp5_50 = tt.addptr %tmp5_49, %tmp5_48 : tensor<32x128x!tt.ptr<bf16>, #blocked1>, tensor<32x128xi32, #blocked1> loc(#loc100)
+    %tmp5_51 = tt.broadcast %tmp4_41 : tensor<32x1xi1, #blocked1> -> tensor<32x128xi1, #blocked1> loc(#loc101)
+    %tmp5_52 = tt.broadcast %tmp4_42 : tensor<32x1xi1, #blocked> -> tensor<32x128xi1, #blocked> loc(#loc101)
+    %tmp5_53 = tt.broadcast %xmask : tensor<1x128xi1, #blocked1> -> tensor<32x128xi1, #blocked1> loc(#loc101)
+    %tmp5_54 = tt.broadcast %xmask_37 : tensor<1x128xi1, #blocked> -> tensor<32x128xi1, #blocked> loc(#loc101)
+    %tmp5_55 = arith.andi %tmp5_51, %tmp5_53 : tensor<32x128xi1, #blocked1> loc(#loc101)
+    %tmp5_56 = arith.andi %tmp5_52, %tmp5_54 : tensor<32x128xi1, #blocked> loc(#loc101)
+    %tmp5_57 = tt.broadcast %ymask : tensor<32x1xi1, #blocked1> -> tensor<32x128xi1, #blocked1> loc(#loc102)
+    %tmp5_58 = tt.broadcast %ymask_28 : tensor<32x1xi1, #blocked> -> tensor<32x128xi1, #blocked> loc(#loc102)
+    %tmp5_59 = arith.andi %tmp5_55, %tmp5_57 : tensor<32x128xi1, #blocked1> loc(#loc102)
+    %tmp5_60 = arith.andi %tmp5_56, %tmp5_58 : tensor<32x128xi1, #blocked> loc(#loc102)
+    %tmp5_61 = tt.load %tmp5_50, %tmp5_59, %cst_11 evictionPolicy = evict_last : tensor<32x128x!tt.ptr<bf16>, #blocked1> loc(#loc103)
+    %tmp5_62 = ttg.convert_layout %tmp5_61 : tensor<32x128xbf16, #blocked1> -> tensor<32x128xbf16, #blocked> loc(#loc104)
+    %tmp5_63 = arith.extf %tmp5_62 : tensor<32x128xbf16, #blocked> to tensor<32x128xf32, #blocked> loc(#loc104)
+    %tmp7 = arith.muli %y1_38, %cst_5 : tensor<32x1xi32, #blocked> loc(#loc105)
+    %tmp7_64 = arith.addi %y0_39, %tmp7 : tensor<32x1xi32, #blocked> loc(#loc106)
+    %tmp7_65 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<32x1x!tt.ptr<f32>, #blocked> loc(#loc107)
+    %tmp7_66 = tt.addptr %tmp7_65, %tmp7_64 : tensor<32x1x!tt.ptr<f32>, #blocked>, tensor<32x1xi32, #blocked> loc(#loc107)
+    %tmp7_67 = tt.broadcast %tmp7_66 : tensor<32x1x!tt.ptr<f32>, #blocked> -> tensor<32x128x!tt.ptr<f32>, #blocked> loc(#loc107)
+    %tmp7_68 = tt.load %tmp7_67, %tmp5_60, %cst_12 evictionPolicy = evict_last : tensor<32x128x!tt.ptr<f32>, #blocked> loc(#loc108)
+    %tmp9 = arith.divf %tmp7_68, %cst_14 : tensor<32x128xf32, #blocked> loc(#loc109)
+    %tmp11 = arith.addf %tmp9, %cst_13 : tensor<32x128xf32, #blocked> loc(#loc110)
+    %tmp12 = tt.extern_elementwise %tmp11 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<32x128xf32, #blocked>) -> tensor<32x128xf32, #blocked> loc(#loc111)
+    %tmp13 = arith.mulf %tmp5_63, %tmp12 : tensor<32x128xf32, #blocked> loc(#loc112)
+    %tmp13_69 = ttg.convert_layout %tmp13 : tensor<32x128xf32, #blocked> -> tensor<32x128xf32, #blocked1> loc(#loc112)
+    %tmp14 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>, #blocked1> loc(#loc113)
+    %tmp14_70 = tt.addptr %tmp14, %xindex_35 : tensor<1x128x!tt.ptr<bf16>, #blocked1>, tensor<1x128xi32, #blocked1> loc(#loc113)
+    %tmp14_71 = tt.broadcast %tmp14_70 : tensor<1x128x!tt.ptr<bf16>, #blocked1> -> tensor<32x128x!tt.ptr<bf16>, #blocked1> loc(#loc113)
+    %tmp14_72 = tt.load %tmp14_71, %tmp5_59, %cst_11 evictionPolicy = evict_last : tensor<32x128x!tt.ptr<bf16>, #blocked1> loc(#loc114)
+    %tmp14_73 = arith.extf %tmp14_72 : tensor<32x128xbf16, #blocked1> to tensor<32x128xf32, #blocked1> loc(#loc115)
+    %tmp16 = arith.mulf %tmp13_69, %tmp14_73 : tensor<32x128xf32, #blocked1> loc(#loc116)
+    %tmp20 = arith.cmpi sge, %tmp4, %cst_4 : tensor<32x1xi64, #blocked1> loc(#loc117)
+    %tmp20_74 = arith.cmpi sge, %tmp4_40, %cst_3 : tensor<32x1xi64, #blocked> loc(#loc117)
+    %tmp23 = arith.addi %y1, %cst_0 : tensor<32x1xi32, #blocked1> loc(#loc118)
+    %tmp23_75 = arith.addi %y1_38, %cst : tensor<32x1xi32, #blocked> loc(#loc118)
+    %tmp23_76 = arith.muli %tmp23, %cst_1 : tensor<32x1xi32, #blocked1> loc(#loc119)
+    %tmp23_77 = tt.broadcast %tmp23_76 : tensor<32x1xi32, #blocked1> -> tensor<32x128xi32, #blocked1> loc(#loc120)
+    %tmp23_78 = arith.addi %tmp5_45, %tmp23_77 : tensor<32x128xi32, #blocked1> loc(#loc120)
+    %tmp23_79 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<32x128x!tt.ptr<bf16>, #blocked1> loc(#loc121)
+    %tmp23_80 = tt.addptr %tmp23_79, %tmp23_78 : tensor<32x128x!tt.ptr<bf16>, #blocked1>, tensor<32x128xi32, #blocked1> loc(#loc121)
+    %tmp23_81 = tt.broadcast %tmp20 : tensor<32x1xi1, #blocked1> -> tensor<32x128xi1, #blocked1> loc(#loc122)
+    %tmp23_82 = tt.broadcast %tmp20_74 : tensor<32x1xi1, #blocked> -> tensor<32x128xi1, #blocked> loc(#loc122)
+    %tmp23_83 = arith.andi %tmp23_81, %tmp5_53 : tensor<32x128xi1, #blocked1> loc(#loc122)
+    %tmp23_84 = arith.andi %tmp23_82, %tmp5_54 : tensor<32x128xi1, #blocked> loc(#loc122)
+    %tmp23_85 = arith.andi %tmp23_83, %tmp5_57 : tensor<32x128xi1, #blocked1> loc(#loc123)
+    %tmp23_86 = arith.andi %tmp23_84, %tmp5_58 : tensor<32x128xi1, #blocked> loc(#loc123)
+    %tmp23_87 = tt.load %tmp23_80, %tmp23_85, %cst_11 evictionPolicy = evict_last : tensor<32x128x!tt.ptr<bf16>, #blocked1> loc(#loc124)
+    %tmp23_88 = ttg.convert_layout %tmp23_87 : tensor<32x128xbf16, #blocked1> -> tensor<32x128xbf16, #blocked> loc(#loc125)
+    %tmp23_89 = arith.extf %tmp23_88 : tensor<32x128xbf16, #blocked> to tensor<32x128xf32, #blocked> loc(#loc125)
+    %tmp25 = arith.muli %tmp23_75, %cst_5 : tensor<32x1xi32, #blocked> loc(#loc126)
+    %tmp25_90 = arith.addi %y0_39, %tmp25 : tensor<32x1xi32, #blocked> loc(#loc127)
+    %tmp25_91 = tt.splat %in_ptr4 : !tt.ptr<f32> -> tensor<32x1x!tt.ptr<f32>, #blocked> loc(#loc128)
+    %tmp25_92 = tt.addptr %tmp25_91, %tmp25_90 : tensor<32x1x!tt.ptr<f32>, #blocked>, tensor<32x1xi32, #blocked> loc(#loc128)
+    %tmp25_93 = tt.broadcast %tmp25_92 : tensor<32x1x!tt.ptr<f32>, #blocked> -> tensor<32x128x!tt.ptr<f32>, #blocked> loc(#loc128)
+    %tmp25_94 = tt.load %tmp25_93, %tmp23_86, %cst_12 evictionPolicy = evict_last : tensor<32x128x!tt.ptr<f32>, #blocked> loc(#loc129)
+    %tmp27 = arith.divf %tmp25_94, %cst_14 : tensor<32x128xf32, #blocked> loc(#loc130)
+    %tmp29 = arith.addf %tmp27, %cst_13 : tensor<32x128xf32, #blocked> loc(#loc131)
+    %tmp30 = tt.extern_elementwise %tmp29 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<32x128xf32, #blocked>) -> tensor<32x128xf32, #blocked> loc(#loc132)
+    %tmp31 = arith.mulf %tmp23_89, %tmp30 : tensor<32x128xf32, #blocked> loc(#loc133)
+    %tmp31_95 = ttg.convert_layout %tmp31 : tensor<32x128xf32, #blocked> -> tensor<32x128xf32, #blocked1> loc(#loc133)
+    %tmp32 = tt.splat %in_ptr5 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>, #blocked1> loc(#loc134)
+    %tmp32_96 = tt.addptr %tmp32, %xindex_35 : tensor<1x128x!tt.ptr<bf16>, #blocked1>, tensor<1x128xi32, #blocked1> loc(#loc134)
+    %tmp32_97 = tt.broadcast %tmp32_96 : tensor<1x128x!tt.ptr<bf16>, #blocked1> -> tensor<32x128x!tt.ptr<bf16>, #blocked1> loc(#loc134)
+    %tmp32_98 = tt.load %tmp32_97, %tmp23_85, %cst_11 evictionPolicy = evict_last : tensor<32x128x!tt.ptr<bf16>, #blocked1> loc(#loc135)
+    %tmp32_99 = arith.extf %tmp32_98 : tensor<32x128xbf16, #blocked1> to tensor<32x128xf32, #blocked1> loc(#loc136)
+    %tmp34 = arith.mulf %tmp31_95, %tmp32_99 : tensor<32x128xf32, #blocked1> loc(#loc137)
+    %tmp37 = arith.select %tmp23_81, %tmp34, %cst_15 : tensor<32x128xi1, #blocked1>, tensor<32x128xf32, #blocked1> loc(#loc138)
+    %tmp38 = arith.select %tmp5_51, %tmp16, %tmp37 : tensor<32x128xi1, #blocked1>, tensor<32x128xf32, #blocked1> loc(#loc141)
+    %0 = arith.muli %yindex_26, %cst_2 : tensor<32x1xi32, #blocked1> loc(#loc64)
+    %1 = tt.broadcast %0 : tensor<32x1xi32, #blocked1> -> tensor<32x128xi32, #blocked1> loc(#loc65)
+    %2 = arith.addi %tmp5_43, %1 : tensor<32x128xi32, #blocked1> loc(#loc65)
+    %3 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<32x128x!tt.ptr<bf16>, #blocked1> loc(#loc66)
+    %4 = tt.addptr %3, %2 : tensor<32x128x!tt.ptr<bf16>, #blocked1>, tensor<32x128xi32, #blocked1> loc(#loc66)
+    %5 = arith.andi %tmp5_53, %tmp5_57 : tensor<32x128xi1, #blocked1> loc(#loc67)
+    %6 = arith.truncf %tmp38 : tensor<32x128xf32, #blocked1> to tensor<32x128xbf16, #blocked1> loc(#loc68)
+    tt.store %4, %6, %5 : tensor<32x128x!tt.ptr<bf16>, #blocked1> loc(#loc68)
+    tt.return loc(#loc69)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4)
+#loc79 = loc("yoffset"(#loc2))
+#loc80 = loc("yoffset"(#loc3))
+#loc81 = loc("yoffset"(#loc4))
+#loc82 = loc("yoffset"(#loc5))
+#loc83 = loc("yoffset"(#loc6))
+#loc84 = loc("yoffset"(#loc7))
+#loc85 = loc("yindex"(#loc8))
+#loc86 = loc("yindex"(#loc9))
+#loc87 = loc("ymask"(#loc10))
+#loc88 = loc("xoffset"(#loc11))
+#loc89 = loc("xoffset"(#loc12))
+#loc90 = loc("xindex"(#loc13))
+#loc91 = loc("xindex"(#loc14))
+#loc92 = loc("xmask"(#loc15))
+#loc93 = loc("y1"(#loc16))
+#loc94 = loc("y0"(#loc17))
+#loc95 = loc("tmp4"(#loc18))
+#loc96 = loc("tmp5"(#loc19))
+#loc97 = loc("tmp5"(#loc20))
+#loc98 = loc("tmp5"(#loc21))
+#loc99 = loc("tmp5"(#loc22))
+#loc100 = loc("tmp5"(#loc23))
+#loc101 = loc("tmp5"(#loc24))
+#loc102 = loc("tmp5"(#loc25))
+#loc103 = loc("tmp5"(#loc26))
+#loc104 = loc("tmp5"(#loc27))
+#loc105 = loc("tmp7"(#loc28))
+#loc106 = loc("tmp7"(#loc29))
+#loc107 = loc("tmp7"(#loc30))
+#loc108 = loc("tmp7"(#loc31))
+#loc109 = loc("tmp9"(#loc32))
+#loc110 = loc("tmp11"(#loc33))
+#loc111 = loc("tmp12"(#loc34))
+#loc112 = loc("tmp13"(#loc35))
+#loc113 = loc("tmp14"(#loc36))
+#loc114 = loc("tmp14"(#loc37))
+#loc115 = loc("tmp14"(#loc38))
+#loc116 = loc("tmp16"(#loc39))
+#loc117 = loc("tmp20"(#loc40))
+#loc118 = loc("tmp23"(#loc41))
+#loc119 = loc("tmp23"(#loc42))
+#loc120 = loc("tmp23"(#loc43))
+#loc121 = loc("tmp23"(#loc44))
+#loc122 = loc("tmp23"(#loc45))
+#loc123 = loc("tmp23"(#loc46))
+#loc124 = loc("tmp23"(#loc47))
+#loc125 = loc("tmp23"(#loc48))
+#loc126 = loc("tmp25"(#loc49))
+#loc127 = loc("tmp25"(#loc50))
+#loc128 = loc("tmp25"(#loc51))
+#loc129 = loc("tmp25"(#loc52))
+#loc130 = loc("tmp27"(#loc53))
+#loc131 = loc("tmp29"(#loc54))
+#loc132 = loc("tmp30"(#loc55))
+#loc133 = loc("tmp31"(#loc56))
+#loc134 = loc("tmp32"(#loc57))
+#loc135 = loc("tmp32"(#loc58))
+#loc136 = loc("tmp32"(#loc59))
+#loc137 = loc("tmp34"(#loc60))
+#loc138 = loc("tmp37"(#loc61))
+#loc139 = loc("tmp38"(#loc62))
+#loc140 = loc("tmp19"(#loc63))
+#loc141 = loc(fused[#loc139, #loc140])
diff --git a/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttir b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..2e1bac7342609eb268e778dbdb3f0f27ac5b4f0b
--- /dev/null
+++ b/triton/VJYGHH2I6HL5D4FSAVHRN5TDVUMIEA46OE2HSL7GW3Y7IZ2XL7TQ/triton_poi_fused__fused_rms_norm_cat_view_2.ttir
@@ -0,0 +1,256 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":18:0)
+#loc72 = loc("in_ptr0"(#loc))
+#loc73 = loc("in_ptr1"(#loc))
+#loc74 = loc("in_ptr2"(#loc))
+#loc75 = loc("in_ptr3"(#loc))
+#loc76 = loc("in_ptr4"(#loc))
+#loc77 = loc("in_ptr5"(#loc))
+#loc78 = loc("out_ptr0"(#loc))
+#loc79 = loc("ynumel"(#loc))
+#loc80 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused__fused_rms_norm_cat_view_2(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %in_ptr5: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr5"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ynumel: i32 {tt.divisibility = 16 : i32} loc("ynumel"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<32x128xbf16> loc(#loc1)
+    %cst_0 = arith.constant dense<-256> : tensor<32x1xi32> loc(#loc1)
+    %cst_1 = arith.constant dense<9.99999997E-7> : tensor<32x128xf32> loc(#loc1)
+    %cst_2 = arith.constant dense<1.280000e+02> : tensor<32x128xf32> loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<32x128xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<12288> : tensor<32x1xi32> loc(#loc1)
+    %cst_5 = arith.constant dense<128> : tensor<32x1xi32> loc(#loc1)
+    %cst_6 = arith.constant dense<256> : tensor<32x1xi64> loc(#loc1)
+    %cst_7 = arith.constant dense<32> : tensor<32x1xi32> loc(#loc1)
+    %xmask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc81)
+    %ymask = arith.constant dense<73728> : tensor<32x1xi32> loc(#loc82)
+    %c32_i32 = arith.constant 32 : i32 loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %yoffset = tt.get_program_id y : i32 loc(#loc83)
+    %yoffset_8 = tt.get_program_id z : i32 loc(#loc84)
+    %yoffset_9 = tt.get_num_programs y : i32 loc(#loc85)
+    %yoffset_10 = arith.muli %yoffset_8, %yoffset_9 : i32 loc(#loc86)
+    %yoffset_11 = arith.addi %yoffset, %yoffset_10 : i32 loc(#loc87)
+    %yoffset_12 = arith.muli %yoffset_11, %c32_i32 : i32 loc(#loc88)
+    %yindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc89)
+    %yindex_13 = tt.expand_dims %yindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc90)
+    %yindex_14 = tt.splat %yoffset_12 : i32 -> tensor<32x1xi32> loc(#loc91)
+    %yindex_15 = arith.addi %yindex_14, %yindex_13 : tensor<32x1xi32> loc(#loc91)
+    %ymask_16 = arith.cmpi slt, %yindex_15, %ymask : tensor<32x1xi32> loc(#loc82)
+    %xoffset = tt.get_program_id x : i32 loc(#loc92)
+    %xoffset_17 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc93)
+    %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc94)
+    %xindex_18 = tt.expand_dims %xindex {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc95)
+    %xindex_19 = tt.splat %xoffset_17 : i32 -> tensor<1x128xi32> loc(#loc96)
+    %xindex_20 = arith.addi %xindex_19, %xindex_18 : tensor<1x128xi32> loc(#loc96)
+    %xmask_21 = arith.cmpi slt, %xindex_20, %xmask : tensor<1x128xi32> loc(#loc81)
+    %y1 = arith.divsi %yindex_15, %cst_7 : tensor<32x1xi32> loc(#loc97)
+    %y0 = arith.remsi %yindex_15, %cst_7 : tensor<32x1xi32> loc(#loc98)
+    %tmp4 = arith.extsi %y1 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc99)
+    %tmp4_22 = arith.cmpi slt, %tmp4, %cst_6 : tensor<32x1xi64> loc(#loc99)
+    %tmp5 = arith.muli %y0, %cst_5 : tensor<32x1xi32> loc(#loc100)
+    %tmp5_23 = tt.broadcast %xindex_20 : tensor<1x128xi32> -> tensor<32x128xi32> loc(#loc101)
+    %tmp5_24 = tt.broadcast %tmp5 : tensor<32x1xi32> -> tensor<32x128xi32> loc(#loc101)
+    %tmp5_25 = arith.addi %tmp5_23, %tmp5_24 : tensor<32x128xi32> loc(#loc101)
+    %tmp5_26 = arith.muli %y1, %cst_4 : tensor<32x1xi32> loc(#loc102)
+    %tmp5_27 = tt.broadcast %tmp5_26 : tensor<32x1xi32> -> tensor<32x128xi32> loc(#loc103)
+    %tmp5_28 = arith.addi %tmp5_25, %tmp5_27 : tensor<32x128xi32> loc(#loc103)
+    %tmp5_29 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<32x128x!tt.ptr<bf16>> loc(#loc104)
+    %tmp5_30 = tt.addptr %tmp5_29, %tmp5_28 : tensor<32x128x!tt.ptr<bf16>>, tensor<32x128xi32> loc(#loc104)
+    %tmp5_31 = tt.broadcast %tmp4_22 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc105)
+    %tmp5_32 = tt.broadcast %xmask_21 : tensor<1x128xi1> -> tensor<32x128xi1> loc(#loc105)
+    %tmp5_33 = arith.andi %tmp5_31, %tmp5_32 : tensor<32x128xi1> loc(#loc105)
+    %tmp5_34 = tt.broadcast %ymask_16 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc106)
+    %tmp5_35 = arith.andi %tmp5_33, %tmp5_34 : tensor<32x128xi1> loc(#loc106)
+    %tmp5_36 = tt.load %tmp5_30, %tmp5_35, %cst evictionPolicy = evict_last : tensor<32x128x!tt.ptr<bf16>> loc(#loc107)
+    %tmp5_37 = arith.extf %tmp5_36 : tensor<32x128xbf16> to tensor<32x128xf32> loc(#loc108)
+    %tmp7 = arith.muli %y1, %cst_7 : tensor<32x1xi32> loc(#loc109)
+    %tmp7_38 = arith.addi %y0, %tmp7 : tensor<32x1xi32> loc(#loc110)
+    %tmp7_39 = tt.splat %in_ptr1 : !tt.ptr<f32> -> tensor<32x1x!tt.ptr<f32>> loc(#loc111)
+    %tmp7_40 = tt.addptr %tmp7_39, %tmp7_38 : tensor<32x1x!tt.ptr<f32>>, tensor<32x1xi32> loc(#loc111)
+    %tmp7_41 = tt.broadcast %tmp7_40 : tensor<32x1x!tt.ptr<f32>> -> tensor<32x128x!tt.ptr<f32>> loc(#loc111)
+    %tmp7_42 = tt.load %tmp7_41, %tmp5_35, %cst_3 evictionPolicy = evict_last : tensor<32x128x!tt.ptr<f32>> loc(#loc112)
+    %tmp9 = arith.divf %tmp7_42, %cst_2 : tensor<32x128xf32> loc(#loc113)
+    %tmp11 = arith.addf %tmp9, %cst_1 : tensor<32x128xf32> loc(#loc114)
+    %tmp12 = tt.extern_elementwise %tmp11 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<32x128xf32>) -> tensor<32x128xf32> loc(#loc115)
+    %tmp13 = arith.mulf %tmp5_37, %tmp12 : tensor<32x128xf32> loc(#loc116)
+    %tmp14 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>> loc(#loc117)
+    %tmp14_43 = tt.addptr %tmp14, %xindex_20 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc117)
+    %tmp14_44 = tt.broadcast %tmp14_43 : tensor<1x128x!tt.ptr<bf16>> -> tensor<32x128x!tt.ptr<bf16>> loc(#loc117)
+    %tmp14_45 = tt.load %tmp14_44, %tmp5_35, %cst evictionPolicy = evict_last : tensor<32x128x!tt.ptr<bf16>> loc(#loc118)
+    %tmp14_46 = arith.extf %tmp14_45 : tensor<32x128xbf16> to tensor<32x128xf32> loc(#loc119)
+    %tmp16 = arith.mulf %tmp13, %tmp14_46 : tensor<32x128xf32> loc(#loc120)
+    %tmp19 = arith.select %tmp5_31, %tmp16, %cst_3 : tensor<32x128xi1>, tensor<32x128xf32> loc(#loc121)
+    %tmp20 = arith.cmpi sge, %tmp4, %cst_6 : tensor<32x1xi64> loc(#loc122)
+    %tmp23 = arith.addi %y1, %cst_0 : tensor<32x1xi32> loc(#loc123)
+    %tmp23_47 = arith.muli %tmp23, %cst_4 : tensor<32x1xi32> loc(#loc124)
+    %tmp23_48 = tt.broadcast %tmp23_47 : tensor<32x1xi32> -> tensor<32x128xi32> loc(#loc125)
+    %tmp23_49 = arith.addi %tmp5_25, %tmp23_48 : tensor<32x128xi32> loc(#loc125)
+    %tmp23_50 = tt.splat %in_ptr3 : !tt.ptr<bf16> -> tensor<32x128x!tt.ptr<bf16>> loc(#loc126)
+    %tmp23_51 = tt.addptr %tmp23_50, %tmp23_49 : tensor<32x128x!tt.ptr<bf16>>, tensor<32x128xi32> loc(#loc126)
+    %tmp23_52 = tt.broadcast %tmp20 : tensor<32x1xi1> -> tensor<32x128xi1> loc(#loc127)
+    %tmp23_53 = arith.andi %tmp23_52, %tmp5_32 : tensor<32x128xi1> loc(#loc127)
+    %tmp23_54 = arith.andi %tmp23_53, %tmp5_34 : tensor<32x128xi1> loc(#loc128)
+    %tmp23_55 = tt.load %tmp23_51, %tmp23_54, %cst evictionPolicy = evict_last : tensor<32x128x!tt.ptr<bf16>> loc(#loc129)
+    %tmp23_56 = arith.extf %tmp23_55 : tensor<32x128xbf16> to tensor<32x128xf32> loc(#loc130)
+    %tmp25 = arith.muli %tmp23, %cst_7 : tensor<32x1xi32> loc(#loc131)
+    %tmp25_57 = arith.addi %y0, %tmp25 : tensor<32x1xi32> loc(#loc132)
+    %tmp25_58 = tt.splat %in_ptr4 : !tt.ptr<f32> -> tensor<32x1x!tt.ptr<f32>> loc(#loc133)
+    %tmp25_59 = tt.addptr %tmp25_58, %tmp25_57 : tensor<32x1x!tt.ptr<f32>>, tensor<32x1xi32> loc(#loc133)
+    %tmp25_60 = tt.broadcast %tmp25_59 : tensor<32x1x!tt.ptr<f32>> -> tensor<32x128x!tt.ptr<f32>> loc(#loc133)
+    %tmp25_61 = tt.load %tmp25_60, %tmp23_54, %cst_3 evictionPolicy = evict_last : tensor<32x128x!tt.ptr<f32>> loc(#loc134)
+    %tmp27 = arith.divf %tmp25_61, %cst_2 : tensor<32x128xf32> loc(#loc135)
+    %tmp29 = arith.addf %tmp27, %cst_1 : tensor<32x128xf32> loc(#loc136)
+    %tmp30 = tt.extern_elementwise %tmp29 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<32x128xf32>) -> tensor<32x128xf32> loc(#loc137)
+    %tmp31 = arith.mulf %tmp23_56, %tmp30 : tensor<32x128xf32> loc(#loc138)
+    %tmp32 = tt.splat %in_ptr5 : !tt.ptr<bf16> -> tensor<1x128x!tt.ptr<bf16>> loc(#loc139)
+    %tmp32_62 = tt.addptr %tmp32, %xindex_20 : tensor<1x128x!tt.ptr<bf16>>, tensor<1x128xi32> loc(#loc139)
+    %tmp32_63 = tt.broadcast %tmp32_62 : tensor<1x128x!tt.ptr<bf16>> -> tensor<32x128x!tt.ptr<bf16>> loc(#loc139)
+    %tmp32_64 = tt.load %tmp32_63, %tmp23_54, %cst evictionPolicy = evict_last : tensor<32x128x!tt.ptr<bf16>> loc(#loc140)
+    %tmp32_65 = arith.extf %tmp32_64 : tensor<32x128xbf16> to tensor<32x128xf32> loc(#loc141)
+    %tmp34 = arith.mulf %tmp31, %tmp32_65 : tensor<32x128xf32> loc(#loc142)
+    %tmp37 = arith.select %tmp23_52, %tmp34, %cst_3 : tensor<32x128xi1>, tensor<32x128xf32> loc(#loc143)
+    %tmp38 = arith.select %tmp5_31, %tmp19, %tmp37 : tensor<32x128xi1>, tensor<32x128xf32> loc(#loc144)
+    %0 = arith.muli %yindex_15, %cst_5 : tensor<32x1xi32> loc(#loc66)
+    %1 = tt.broadcast %0 : tensor<32x1xi32> -> tensor<32x128xi32> loc(#loc67)
+    %2 = arith.addi %tmp5_23, %1 : tensor<32x128xi32> loc(#loc67)
+    %3 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<32x128x!tt.ptr<bf16>> loc(#loc68)
+    %4 = tt.addptr %3, %2 : tensor<32x128x!tt.ptr<bf16>>, tensor<32x128xi32> loc(#loc68)
+    %5 = arith.andi %tmp5_32, %tmp5_34 : tensor<32x128xi1> loc(#loc69)
+    %6 = arith.truncf %tmp38 : tensor<32x128xf32> to tensor<32x128xbf16> loc(#loc70)
+    tt.store %4, %6, %5 : tensor<32x128x!tt.ptr<bf16>> loc(#loc70)
+    tt.return loc(#loc71)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":26:21)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":23:21)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:29)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:48)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:69)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:53)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:34)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":21:75)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:36)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:44)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":22:23)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:28)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":24:33)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:36)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:44)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":25:23)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":27:19)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":29:19)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":35:18)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:39)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:35)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:51)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:44)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:30)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:64)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:72)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:57)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":36:123)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:55)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:51)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:30)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":38:80)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":40:19)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":42:19)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":43:28)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":44:19)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:31)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:71)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":45:137)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":47:20)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":50:34)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":51:20)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:61)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:52)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:45)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:31)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:75)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:83)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:67)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":54:134)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:56)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:52)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:31)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":56:90)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":58:21)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":60:20)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":61:28)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":62:20)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:31)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:71)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":63:138)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":65:20)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":68:35)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":69:34)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:34)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:30)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:25)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:54)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:46)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/2h/c2hij3hmloumxdmhuezsyhkmnqgnfa5ivre27uosymam3dr7a5xb.py":70:4)
+#loc81 = loc("xmask"(#loc2))
+#loc82 = loc("ymask"(#loc3))
+#loc83 = loc("yoffset"(#loc4))
+#loc84 = loc("yoffset"(#loc5))
+#loc85 = loc("yoffset"(#loc6))
+#loc86 = loc("yoffset"(#loc7))
+#loc87 = loc("yoffset"(#loc8))
+#loc88 = loc("yoffset"(#loc9))
+#loc89 = loc("yindex"(#loc10))
+#loc90 = loc("yindex"(#loc11))
+#loc91 = loc("yindex"(#loc12))
+#loc92 = loc("xoffset"(#loc13))
+#loc93 = loc("xoffset"(#loc14))
+#loc94 = loc("xindex"(#loc15))
+#loc95 = loc("xindex"(#loc16))
+#loc96 = loc("xindex"(#loc17))
+#loc97 = loc("y1"(#loc18))
+#loc98 = loc("y0"(#loc19))
+#loc99 = loc("tmp4"(#loc20))
+#loc100 = loc("tmp5"(#loc21))
+#loc101 = loc("tmp5"(#loc22))
+#loc102 = loc("tmp5"(#loc23))
+#loc103 = loc("tmp5"(#loc24))
+#loc104 = loc("tmp5"(#loc25))
+#loc105 = loc("tmp5"(#loc26))
+#loc106 = loc("tmp5"(#loc27))
+#loc107 = loc("tmp5"(#loc28))
+#loc108 = loc("tmp5"(#loc29))
+#loc109 = loc("tmp7"(#loc30))
+#loc110 = loc("tmp7"(#loc31))
+#loc111 = loc("tmp7"(#loc32))
+#loc112 = loc("tmp7"(#loc33))
+#loc113 = loc("tmp9"(#loc34))
+#loc114 = loc("tmp11"(#loc35))
+#loc115 = loc("tmp12"(#loc36))
+#loc116 = loc("tmp13"(#loc37))
+#loc117 = loc("tmp14"(#loc38))
+#loc118 = loc("tmp14"(#loc39))
+#loc119 = loc("tmp14"(#loc40))
+#loc120 = loc("tmp16"(#loc41))
+#loc121 = loc("tmp19"(#loc42))
+#loc122 = loc("tmp20"(#loc43))
+#loc123 = loc("tmp23"(#loc44))
+#loc124 = loc("tmp23"(#loc45))
+#loc125 = loc("tmp23"(#loc46))
+#loc126 = loc("tmp23"(#loc47))
+#loc127 = loc("tmp23"(#loc48))
+#loc128 = loc("tmp23"(#loc49))
+#loc129 = loc("tmp23"(#loc50))
+#loc130 = loc("tmp23"(#loc51))
+#loc131 = loc("tmp25"(#loc52))
+#loc132 = loc("tmp25"(#loc53))
+#loc133 = loc("tmp25"(#loc54))
+#loc134 = loc("tmp25"(#loc55))
+#loc135 = loc("tmp27"(#loc56))
+#loc136 = loc("tmp29"(#loc57))
+#loc137 = loc("tmp30"(#loc58))
+#loc138 = loc("tmp31"(#loc59))
+#loc139 = loc("tmp32"(#loc60))
+#loc140 = loc("tmp32"(#loc61))
+#loc141 = loc("tmp32"(#loc62))
+#loc142 = loc("tmp34"(#loc63))
+#loc143 = loc("tmp37"(#loc64))
+#loc144 = loc("tmp38"(#loc65))
diff --git a/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/__grp__triton_poi_fused_add_mul_1.json b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/__grp__triton_poi_fused_add_mul_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..3189810da0d1a4d08562ec50104228dfd33d02e4
--- /dev/null
+++ b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/__grp__triton_poi_fused_add_mul_1.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused_add_mul_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.source", "triton_poi_fused_add_mul_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.ttir", "triton_poi_fused_add_mul_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.ttgir", "triton_poi_fused_add_mul_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.llir", "triton_poi_fused_add_mul_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.ptx", "triton_poi_fused_add_mul_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.cubin", "triton_poi_fused_add_mul_1.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.json"}}
\ No newline at end of file
diff --git a/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.cubin b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..6dd0e14859961e4591d128a27b5be77b8d7eeb03
Binary files /dev/null and b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.cubin differ
diff --git a/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.json b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..e74dd9bda134fba4e35c667f1e5c4090d8003e18
--- /dev/null
+++ b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.json
@@ -0,0 +1 @@
+{"hash": "b0864c125a209c83b5f156b3417dc442d9a6fdccf909d8b90fef13add9239ee6", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_add_mul_1"}
\ No newline at end of file
diff --git a/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.llir b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.llir
new file mode 100644
index 0000000000000000000000000000000000000000..f91df25b442f7de9dc8e952508a3a0c8a197e308
--- /dev/null
+++ b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.llir
@@ -0,0 +1,76 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused_add_mul_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 {
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %9 = shl i32 %8, 9, !dbg !8
+  %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %11 = shl nuw nsw i32 %10, 1, !dbg !9
+  %12 = and i32 %11, 510, !dbg !9
+  %13 = or disjoint i32 %12, %9, !dbg !10
+  %14 = srem i32 %13, 4096, !dbg !11
+  %15 = sext i32 %13 to i64, !dbg !12
+  %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !12
+  %17 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %16) #2, !dbg !13
+  %18 = bitcast i32 %17 to <2 x bfloat>, !dbg !13
+  %19 = sext i32 %14 to i64, !dbg !14
+  %20 = getelementptr bfloat, ptr addrspace(1) %1, i64 %19, !dbg !14
+  %21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !15
+  %22 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l"(ptr addrspace(1) %20, i64 %21) #2, !dbg !15
+  %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !15
+  %24 = getelementptr bfloat, ptr addrspace(1) %2, i64 %15, !dbg !16
+  %25 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %24) #2, !dbg !17
+  %26 = bitcast i32 %25 to <2 x bfloat>, !dbg !17
+  %27 = getelementptr bfloat, ptr addrspace(1) %3, i64 %15, !dbg !18
+  %28 = fpext <2 x bfloat> %18 to <2 x float>, !dbg !19
+  %29 = fpext <2 x bfloat> %23 to <2 x float>, !dbg !20
+  %30 = fpext <2 x bfloat> %26 to <2 x float>, !dbg !21
+  %31 = fmul <2 x float> %29, %30, !dbg !22
+  %32 = fadd <2 x float> %31, %28, !dbg !23
+  %33 = fptrunc <2 x float> %32 to <2 x bfloat>, !dbg !24
+  %34 = bitcast <2 x bfloat> %33 to i32, !dbg !24
+  tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %34, ptr addrspace(1) %27) #2, !dbg !24
+  ret void, !dbg !25
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+attributes #0 = { nounwind "nvvm.reqntid"="256" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused_add_mul_1", linkageName: "triton_poi_fused_add_mul_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 20, column: 28, scope: !4)
+!8 = !DILocation(line: 20, column: 33, scope: !4)
+!9 = !DILocation(line: 21, column: 36, scope: !4)
+!10 = !DILocation(line: 21, column: 23, scope: !4)
+!11 = !DILocation(line: 24, column: 19, scope: !4)
+!12 = !DILocation(line: 25, column: 30, scope: !4)
+!13 = !DILocation(line: 25, column: 35, scope: !4)
+!14 = !DILocation(line: 26, column: 30, scope: !4)
+!15 = !DILocation(line: 26, column: 35, scope: !4)
+!16 = !DILocation(line: 27, column: 30, scope: !4)
+!17 = !DILocation(line: 27, column: 35, scope: !4)
+!18 = !DILocation(line: 30, column: 25, scope: !4)
+!19 = !DILocation(line: 25, column: 44, scope: !4)
+!20 = !DILocation(line: 26, column: 74, scope: !4)
+!21 = !DILocation(line: 27, column: 44, scope: !4)
+!22 = !DILocation(line: 28, column: 18, scope: !4)
+!23 = !DILocation(line: 29, column: 18, scope: !4)
+!24 = !DILocation(line: 30, column: 36, scope: !4)
+!25 = !DILocation(line: 30, column: 4, scope: !4)
diff --git a/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.ptx b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..bcb290e97f0ae9090d44db7171a261314c410328
--- /dev/null
+++ b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.ptx
@@ -0,0 +1,347 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused_add_mul_1 // -- Begin function triton_poi_fused_add_mul_1
+                                        // @triton_poi_fused_add_mul_1
+.visible .entry triton_poi_fused_add_mul_1(
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_1_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_1_param_1,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_1_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_1_param_3,
+	.param .u32 triton_poi_fused_add_mul_1_param_4,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_1_param_5,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_1_param_6
+)
+.reqntid 256
+{
+	.reg .b16 	%rs<7>;
+	.reg .b32 	%r<24>;
+	.reg .b64 	%rd<11>;
+	.loc	1 18 0                          // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd6, [triton_poi_fused_add_mul_1_param_0];
+	ld.param.b64 	%rd7, [triton_poi_fused_add_mul_1_param_1];
+$L__tmp0:
+	.loc	1 20 28                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:20:28
+	mov.u32 	%r5, %ctaid.x;
+	.loc	1 20 33                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:20:33
+	shl.b32 	%r6, %r5, 9;
+	ld.param.b64 	%rd8, [triton_poi_fused_add_mul_1_param_2];
+	ld.param.b64 	%rd9, [triton_poi_fused_add_mul_1_param_3];
+	.loc	1 21 36                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:21:36
+	mov.u32 	%r7, %tid.x;
+	shl.b32 	%r8, %r7, 1;
+	and.b32 	%r9, %r8, 510;
+	.loc	1 21 23                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:21:23
+	or.b32 	%r10, %r9, %r6;
+	.loc	1 24 19                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:24:19
+	bfe.s32 	%r11, %r5, 22, 1;
+	shr.u32 	%r12, %r11, 20;
+	add.s32 	%r13, %r10, %r12;
+	and.b32 	%r14, %r13, -4096;
+	sub.s32 	%r15, %r10, %r14;
+	.loc	1 25 30                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:25:30
+	mul.wide.s32 	%rd10, %r10, 2;
+	add.s64 	%rd1, %rd6, %rd10;
+	.loc	1 25 35                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:25:35
+	// begin inline asm
+	mov.u32 %r1, 0x0;
+	ld.global.b32 { %r1 }, [ %rd1 + 0 ];
+	// end inline asm
+	.loc	1 26 30                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:26:30
+	mad.wide.s32 	%rd2, %r15, 2, %rd7;
+	.loc	1 26 35                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:26:35
+	// begin inline asm
+	mov.u64 %rd3, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd3, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r2, 0x0;
+	ld.global.L1::evict_last.L2::cache_hint.b32 { %r2 }, [ %rd2 + 0 ], %rd3;
+	// end inline asm
+	.loc	1 27 30                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:27:30
+	add.s64 	%rd4, %rd8, %rd10;
+	.loc	1 27 35                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:27:35
+	// begin inline asm
+	mov.u32 %r3, 0x0;
+	ld.global.b32 { %r3 }, [ %rd4 + 0 ];
+	// end inline asm
+	.loc	1 30 25                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:30:25
+	add.s64 	%rd5, %rd9, %rd10;
+	.loc	1 25 44                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:25:44
+	mov.b32 	{%rs1, %rs2}, %r1;
+	cvt.f32.bf16 	%r16, %rs2;
+	cvt.f32.bf16 	%r17, %rs1;
+	.loc	1 26 74                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:26:74
+	mov.b32 	{%rs3, %rs4}, %r2;
+	cvt.f32.bf16 	%r18, %rs4;
+	cvt.f32.bf16 	%r19, %rs3;
+	.loc	1 27 44                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:27:44
+	mov.b32 	{%rs5, %rs6}, %r3;
+	cvt.f32.bf16 	%r20, %rs6;
+	cvt.f32.bf16 	%r21, %rs5;
+	.loc	1 29 18                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:29:18
+	fma.rn.f32 	%r22, %r19, %r21, %r17;
+	fma.rn.f32 	%r23, %r18, %r20, %r16;
+	.loc	1 30 36                         // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:30:36
+	cvt.rn.bf16x2.f32 	%r4, %r23, %r22;
+	// begin inline asm
+	st.global.b32 [ %rd5 + 0 ], { %r4 };
+	// end inline asm
+	.loc	1 30 4                          // c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py:30:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 55
+.b8 102
+.b8 102
+.b8 52
+.b8 105
+.b8 98
+.b8 54
+.b8 54
+.b8 53
+.b8 50
+.b8 111
+.b8 106
+.b8 108
+.b8 108
+.b8 117
+.b8 116
+.b8 109
+.b8 52
+.b8 99
+.b8 55
+.b8 109
+.b8 107
+.b8 122
+.b8 122
+.b8 112
+.b8 121
+.b8 98
+.b8 111
+.b8 110
+.b8 100
+.b8 51
+.b8 112
+.b8 97
+.b8 103
+.b8 117
+.b8 51
+.b8 103
+.b8 108
+.b8 115
+.b8 112
+.b8 119
+.b8 51
+.b8 115
+.b8 122
+.b8 116
+.b8 107
+.b8 102
+.b8 101
+.b8 50
+.b8 122
+.b8 97
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 55
+.b8 102
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.source b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.source
new file mode 100644
index 0000000000000000000000000000000000000000..b1626afe5c2f197738fbbe8f4779fff6f72eac1d
--- /dev/null
+++ b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.source
@@ -0,0 +1,82 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":18:0)
+#loc22 = loc("in_ptr0"(#loc))
+#loc23 = loc("in_ptr1"(#loc))
+#loc24 = loc("in_ptr2"(#loc))
+#loc25 = loc("out_ptr0"(#loc))
+#loc26 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_add_mul_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 8388608 : i32 loc(#loc27)
+    %xoffset = tt.get_program_id x : i32 loc(#loc28)
+    %xoffset_1 = arith.constant 512 : i32 loc(#loc29)
+    %xoffset_2 = arith.constant 512 : i32 loc(#loc29)
+    %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc29)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc30)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc31)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc31)
+    %xmask = arith.constant true loc(#loc32)
+    %xmask_6 = arith.constant dense<true> : tensor<512xi1> loc(#loc32)
+    %x0 = arith.constant 4096 : i32 loc(#loc33)
+    %x0_7 = arith.constant 4096 : i32 loc(#loc33)
+    %x0_8 = arith.constant dense<4096> : tensor<512xi32> loc(#loc33)
+    %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<512xi32> loc(#loc33)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc34)
+    %tmp0_10 = tt.addptr %tmp0, %xindex_5 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc34)
+    %tmp0_11 = tt.load %tmp0_10 : tensor<512x!tt.ptr<bf16>> loc(#loc35)
+    %tmp0_12 = arith.extf %tmp0_11 : tensor<512xbf16> to tensor<512xf32> loc(#loc36)
+    %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc37)
+    %tmp1_13 = tt.addptr %tmp1, %x0_9 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc37)
+    %tmp1_14 = tt.load %tmp1_13 evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>> loc(#loc38)
+    %tmp1_15 = arith.extf %tmp1_14 : tensor<512xbf16> to tensor<512xf32> loc(#loc39)
+    %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc40)
+    %tmp2_16 = tt.addptr %tmp2, %xindex_5 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc40)
+    %tmp2_17 = tt.load %tmp2_16 : tensor<512x!tt.ptr<bf16>> loc(#loc41)
+    %tmp2_18 = arith.extf %tmp2_17 : tensor<512xbf16> to tensor<512xf32> loc(#loc42)
+    %tmp3 = arith.mulf %tmp1_15, %tmp2_18 : tensor<512xf32> loc(#loc43)
+    %tmp4 = arith.addf %tmp0_12, %tmp3 : tensor<512xf32> loc(#loc44)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc19)
+    %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc19)
+    %2 = arith.truncf %tmp4 : tensor<512xf32> to tensor<512xbf16> loc(#loc20)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>> loc(#loc20)
+    tt.return loc(#loc21)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":22:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":24:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:30)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:35)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:44)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:30)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:35)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:74)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:35)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:44)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":28:18)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":29:18)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:25)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:36)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:4)
+#loc27 = loc("xnumel"(#loc1))
+#loc28 = loc("xoffset"(#loc2))
+#loc29 = loc("xoffset"(#loc3))
+#loc30 = loc("xindex"(#loc4))
+#loc31 = loc("xindex"(#loc5))
+#loc32 = loc("xmask"(#loc6))
+#loc33 = loc("x0"(#loc7))
+#loc34 = loc("tmp0"(#loc8))
+#loc35 = loc("tmp0"(#loc9))
+#loc36 = loc("tmp0"(#loc10))
+#loc37 = loc("tmp1"(#loc11))
+#loc38 = loc("tmp1"(#loc12))
+#loc39 = loc("tmp1"(#loc13))
+#loc40 = loc("tmp2"(#loc14))
+#loc41 = loc("tmp2"(#loc15))
+#loc42 = loc("tmp2"(#loc16))
+#loc43 = loc("tmp3"(#loc17))
+#loc44 = loc("tmp4"(#loc18))
diff --git a/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.ttgir b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..436155cf7787907392c12b3a4a4322da3c4c0914
--- /dev/null
+++ b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.ttgir
@@ -0,0 +1,74 @@
+#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":18:0)
+#loc21 = loc("in_ptr0"(#loc))
+#loc22 = loc("in_ptr1"(#loc))
+#loc23 = loc("in_ptr2"(#loc))
+#loc24 = loc("out_ptr0"(#loc))
+#loc25 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused_add_mul_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4096> : tensor<512xi32, #blocked> loc(#loc1)
+    %c512_i32 = arith.constant 512 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc26)
+    %xoffset_0 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc27)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc28)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<512xi32, #blocked> loc(#loc29)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<512xi32, #blocked> loc(#loc29)
+    %x0 = arith.remsi %xindex_2, %cst : tensor<512xi32, #blocked> loc(#loc30)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc31)
+    %tmp0_3 = tt.addptr %tmp0, %xindex_2 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc31)
+    %tmp0_4 = tt.load %tmp0_3 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc32)
+    %tmp0_5 = arith.extf %tmp0_4 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc33)
+    %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc34)
+    %tmp1_6 = tt.addptr %tmp1, %x0 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc34)
+    %tmp1_7 = tt.load %tmp1_6 evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc35)
+    %tmp1_8 = arith.extf %tmp1_7 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc36)
+    %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc37)
+    %tmp2_9 = tt.addptr %tmp2, %xindex_2 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc37)
+    %tmp2_10 = tt.load %tmp2_9 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc38)
+    %tmp2_11 = arith.extf %tmp2_10 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc39)
+    %tmp3 = arith.mulf %tmp1_8, %tmp2_11 : tensor<512xf32, #blocked> loc(#loc40)
+    %tmp4 = arith.addf %tmp0_5, %tmp3 : tensor<512xf32, #blocked> loc(#loc41)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc18)
+    %1 = tt.addptr %0, %xindex_2 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc18)
+    %2 = arith.truncf %tmp4 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> loc(#loc19)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc19)
+    tt.return loc(#loc20)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":24:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:30)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:35)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:44)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:30)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:74)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:30)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:35)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:44)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":28:18)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":29:18)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:25)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:36)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:4)
+#loc26 = loc("xoffset"(#loc2))
+#loc27 = loc("xoffset"(#loc3))
+#loc28 = loc("xindex"(#loc4))
+#loc29 = loc("xindex"(#loc5))
+#loc30 = loc("x0"(#loc6))
+#loc31 = loc("tmp0"(#loc7))
+#loc32 = loc("tmp0"(#loc8))
+#loc33 = loc("tmp0"(#loc9))
+#loc34 = loc("tmp1"(#loc10))
+#loc35 = loc("tmp1"(#loc11))
+#loc36 = loc("tmp1"(#loc12))
+#loc37 = loc("tmp2"(#loc13))
+#loc38 = loc("tmp2"(#loc14))
+#loc39 = loc("tmp2"(#loc15))
+#loc40 = loc("tmp3"(#loc16))
+#loc41 = loc("tmp4"(#loc17))
diff --git a/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.ttir b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..3ebe3ece72fb3c5354ff98cef047a82986069276
--- /dev/null
+++ b/triton/WCDEYES2ECOIHNPRK2ZUC7OEILM2N7OM7EE5ROIP54J23WJDT3TA/triton_poi_fused_add_mul_1.ttir
@@ -0,0 +1,73 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":18:0)
+#loc21 = loc("in_ptr0"(#loc))
+#loc22 = loc("in_ptr1"(#loc))
+#loc23 = loc("in_ptr2"(#loc))
+#loc24 = loc("out_ptr0"(#loc))
+#loc25 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_add_mul_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %x0 = arith.constant dense<4096> : tensor<512xi32> loc(#loc26)
+    %c512_i32 = arith.constant 512 : i32 loc(#loc2)
+    %xoffset = tt.get_program_id x : i32 loc(#loc27)
+    %xoffset_0 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc28)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc29)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<512xi32> loc(#loc30)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<512xi32> loc(#loc30)
+    %x0_3 = arith.remsi %xindex_2, %x0 : tensor<512xi32> loc(#loc26)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc31)
+    %tmp0_4 = tt.addptr %tmp0, %xindex_2 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc31)
+    %tmp0_5 = tt.load %tmp0_4 : tensor<512x!tt.ptr<bf16>> loc(#loc32)
+    %tmp0_6 = arith.extf %tmp0_5 : tensor<512xbf16> to tensor<512xf32> loc(#loc33)
+    %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc34)
+    %tmp1_7 = tt.addptr %tmp1, %x0_3 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc34)
+    %tmp1_8 = tt.load %tmp1_7 evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>> loc(#loc35)
+    %tmp1_9 = arith.extf %tmp1_8 : tensor<512xbf16> to tensor<512xf32> loc(#loc36)
+    %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc37)
+    %tmp2_10 = tt.addptr %tmp2, %xindex_2 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc37)
+    %tmp2_11 = tt.load %tmp2_10 : tensor<512x!tt.ptr<bf16>> loc(#loc38)
+    %tmp2_12 = arith.extf %tmp2_11 : tensor<512xbf16> to tensor<512xf32> loc(#loc39)
+    %tmp3 = arith.mulf %tmp1_9, %tmp2_12 : tensor<512xf32> loc(#loc40)
+    %tmp4 = arith.addf %tmp0_6, %tmp3 : tensor<512xf32> loc(#loc41)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc18)
+    %1 = tt.addptr %0, %xindex_2 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc18)
+    %2 = arith.truncf %tmp4 : tensor<512xf32> to tensor<512xbf16> loc(#loc19)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>> loc(#loc19)
+    tt.return loc(#loc20)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":24:19)
+#loc2 = loc(unknown)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":20:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":20:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":21:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":21:23)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:30)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:35)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":25:44)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:30)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":26:74)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:30)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:35)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":27:44)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":28:18)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":29:18)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:25)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:36)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/7f/c7ff4ib6652ojllutm4c7mkzzpybond3pagu3glspw3sztkfe2za.py":30:4)
+#loc26 = loc("x0"(#loc1))
+#loc27 = loc("xoffset"(#loc3))
+#loc28 = loc("xoffset"(#loc4))
+#loc29 = loc("xindex"(#loc5))
+#loc30 = loc("xindex"(#loc6))
+#loc31 = loc("tmp0"(#loc7))
+#loc32 = loc("tmp0"(#loc8))
+#loc33 = loc("tmp0"(#loc9))
+#loc34 = loc("tmp1"(#loc10))
+#loc35 = loc("tmp1"(#loc11))
+#loc36 = loc("tmp1"(#loc12))
+#loc37 = loc("tmp2"(#loc13))
+#loc38 = loc("tmp2"(#loc14))
+#loc39 = loc("tmp2"(#loc15))
+#loc40 = loc("tmp3"(#loc16))
+#loc41 = loc("tmp4"(#loc17))
diff --git a/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/__grp__triton_poi_fused_add_mul_0.json b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/__grp__triton_poi_fused_add_mul_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..adbac3ad18d8fc51d1ebbf8e6eaa7c44e3441c1b
--- /dev/null
+++ b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/__grp__triton_poi_fused_add_mul_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused_add_mul_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.source", "triton_poi_fused_add_mul_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.ttir", "triton_poi_fused_add_mul_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.ttgir", "triton_poi_fused_add_mul_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.llir", "triton_poi_fused_add_mul_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.ptx", "triton_poi_fused_add_mul_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.cubin", "triton_poi_fused_add_mul_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.json"}}
\ No newline at end of file
diff --git a/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.cubin b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..68099c11f78c7b0646c1d5eb47549b05fb4183ae
Binary files /dev/null and b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.cubin differ
diff --git a/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.json b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..89724be249fdd454fe11a667e8f58971359a3253
--- /dev/null
+++ b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.json
@@ -0,0 +1 @@
+{"hash": "b25801ca8239072139760a6056ec916d8de364dfd19b5ff2e3a1a12928f02384", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_add_mul_0"}
\ No newline at end of file
diff --git a/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.llir b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..ff716b11651253c08f2152f2e99adadf6bac7f64
--- /dev/null
+++ b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.llir
@@ -0,0 +1,118 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused_add_mul_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 {
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %9 = shl i32 %8, 10, !dbg !8
+  %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %11 = shl nuw nsw i32 %10, 3, !dbg !9
+  %12 = and i32 %11, 1016, !dbg !9
+  %13 = or disjoint i32 %12, %9, !dbg !10
+  %14 = srem i32 %13, 4096, !dbg !11
+  %15 = sext i32 %13 to i64, !dbg !12
+  %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !12
+  %17 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %16) #2, !dbg !13
+  %18 = extractvalue { i32, i32, i32, i32 } %17, 0, !dbg !13
+  %19 = bitcast i32 %18 to <2 x bfloat>, !dbg !13
+  %20 = extractvalue { i32, i32, i32, i32 } %17, 1, !dbg !13
+  %21 = bitcast i32 %20 to <2 x bfloat>, !dbg !13
+  %22 = extractvalue { i32, i32, i32, i32 } %17, 2, !dbg !13
+  %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !13
+  %24 = extractvalue { i32, i32, i32, i32 } %17, 3, !dbg !13
+  %25 = bitcast i32 %24 to <2 x bfloat>, !dbg !13
+  %26 = sext i32 %14 to i64, !dbg !14
+  %27 = getelementptr bfloat, ptr addrspace(1) %1, i64 %26, !dbg !14
+  %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !15
+  %29 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ], $5;", "=r,=r,=r,=r,l,l"(ptr addrspace(1) %27, i64 %28) #2, !dbg !15
+  %30 = extractvalue { i32, i32, i32, i32 } %29, 0, !dbg !15
+  %31 = bitcast i32 %30 to <2 x bfloat>, !dbg !15
+  %32 = extractvalue { i32, i32, i32, i32 } %29, 1, !dbg !15
+  %33 = bitcast i32 %32 to <2 x bfloat>, !dbg !15
+  %34 = extractvalue { i32, i32, i32, i32 } %29, 2, !dbg !15
+  %35 = bitcast i32 %34 to <2 x bfloat>, !dbg !15
+  %36 = extractvalue { i32, i32, i32, i32 } %29, 3, !dbg !15
+  %37 = bitcast i32 %36 to <2 x bfloat>, !dbg !15
+  %38 = getelementptr bfloat, ptr addrspace(1) %2, i64 %15, !dbg !16
+  %39 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %38) #2, !dbg !17
+  %40 = extractvalue { i32, i32, i32, i32 } %39, 0, !dbg !17
+  %41 = bitcast i32 %40 to <2 x bfloat>, !dbg !17
+  %42 = extractvalue { i32, i32, i32, i32 } %39, 1, !dbg !17
+  %43 = bitcast i32 %42 to <2 x bfloat>, !dbg !17
+  %44 = extractvalue { i32, i32, i32, i32 } %39, 2, !dbg !17
+  %45 = bitcast i32 %44 to <2 x bfloat>, !dbg !17
+  %46 = extractvalue { i32, i32, i32, i32 } %39, 3, !dbg !17
+  %47 = bitcast i32 %46 to <2 x bfloat>, !dbg !17
+  %48 = getelementptr bfloat, ptr addrspace(1) %3, i64 %15, !dbg !18
+  %49 = fpext <2 x bfloat> %19 to <2 x float>, !dbg !19
+  %50 = fpext <2 x bfloat> %31 to <2 x float>, !dbg !20
+  %51 = fpext <2 x bfloat> %41 to <2 x float>, !dbg !21
+  %52 = fmul <2 x float> %50, %51, !dbg !22
+  %53 = fadd <2 x float> %52, %49, !dbg !23
+  %54 = fptrunc <2 x float> %53 to <2 x bfloat>, !dbg !24
+  %55 = fpext <2 x bfloat> %21 to <2 x float>, !dbg !19
+  %56 = fpext <2 x bfloat> %33 to <2 x float>, !dbg !20
+  %57 = fpext <2 x bfloat> %43 to <2 x float>, !dbg !21
+  %58 = fmul <2 x float> %56, %57, !dbg !22
+  %59 = fadd <2 x float> %58, %55, !dbg !23
+  %60 = fptrunc <2 x float> %59 to <2 x bfloat>, !dbg !24
+  %61 = fpext <2 x bfloat> %23 to <2 x float>, !dbg !19
+  %62 = fpext <2 x bfloat> %35 to <2 x float>, !dbg !20
+  %63 = fpext <2 x bfloat> %45 to <2 x float>, !dbg !21
+  %64 = fmul <2 x float> %62, %63, !dbg !22
+  %65 = fadd <2 x float> %64, %61, !dbg !23
+  %66 = fptrunc <2 x float> %65 to <2 x bfloat>, !dbg !24
+  %67 = fpext <2 x bfloat> %25 to <2 x float>, !dbg !19
+  %68 = fpext <2 x bfloat> %37 to <2 x float>, !dbg !20
+  %69 = fpext <2 x bfloat> %47 to <2 x float>, !dbg !21
+  %70 = fmul <2 x float> %68, %69, !dbg !22
+  %71 = fadd <2 x float> %70, %67, !dbg !23
+  %72 = fptrunc <2 x float> %71 to <2 x bfloat>, !dbg !24
+  %73 = bitcast <2 x bfloat> %54 to i32, !dbg !24
+  %74 = bitcast <2 x bfloat> %60 to i32, !dbg !24
+  %75 = bitcast <2 x bfloat> %66 to i32, !dbg !24
+  %76 = bitcast <2 x bfloat> %72 to i32, !dbg !24
+  tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %73, i32 %74, i32 %75, i32 %76, ptr addrspace(1) %48) #2, !dbg !24
+  ret void, !dbg !25
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused_add_mul_0", linkageName: "triton_poi_fused_add_mul_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 20, column: 28, scope: !4)
+!8 = !DILocation(line: 20, column: 33, scope: !4)
+!9 = !DILocation(line: 21, column: 36, scope: !4)
+!10 = !DILocation(line: 21, column: 23, scope: !4)
+!11 = !DILocation(line: 24, column: 19, scope: !4)
+!12 = !DILocation(line: 25, column: 30, scope: !4)
+!13 = !DILocation(line: 25, column: 35, scope: !4)
+!14 = !DILocation(line: 26, column: 30, scope: !4)
+!15 = !DILocation(line: 26, column: 35, scope: !4)
+!16 = !DILocation(line: 27, column: 30, scope: !4)
+!17 = !DILocation(line: 27, column: 35, scope: !4)
+!18 = !DILocation(line: 30, column: 25, scope: !4)
+!19 = !DILocation(line: 25, column: 44, scope: !4)
+!20 = !DILocation(line: 26, column: 74, scope: !4)
+!21 = !DILocation(line: 27, column: 44, scope: !4)
+!22 = !DILocation(line: 28, column: 18, scope: !4)
+!23 = !DILocation(line: 29, column: 18, scope: !4)
+!24 = !DILocation(line: 30, column: 36, scope: !4)
+!25 = !DILocation(line: 30, column: 4, scope: !4)
diff --git a/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.ptx b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..44954ddf77b9b7d045f1e8712658a03bca88f56f
--- /dev/null
+++ b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.ptx
@@ -0,0 +1,407 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused_add_mul_0 // -- Begin function triton_poi_fused_add_mul_0
+                                        // @triton_poi_fused_add_mul_0
+.visible .entry triton_poi_fused_add_mul_0(
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_1,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_3,
+	.param .u32 triton_poi_fused_add_mul_0_param_4,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_5,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_6
+)
+.reqntid 128
+{
+	.reg .b16 	%rs<25>;
+	.reg .b32 	%r<60>;
+	.reg .b64 	%rd<11>;
+	.loc	1 18 0                          // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd6, [triton_poi_fused_add_mul_0_param_0];
+	ld.param.b64 	%rd7, [triton_poi_fused_add_mul_0_param_1];
+$L__tmp0:
+	.loc	1 20 28                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:20:28
+	mov.u32 	%r17, %ctaid.x;
+	.loc	1 20 33                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:20:33
+	shl.b32 	%r18, %r17, 10;
+	ld.param.b64 	%rd8, [triton_poi_fused_add_mul_0_param_2];
+	ld.param.b64 	%rd9, [triton_poi_fused_add_mul_0_param_3];
+	.loc	1 21 36                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:21:36
+	mov.u32 	%r19, %tid.x;
+	shl.b32 	%r20, %r19, 3;
+	and.b32 	%r21, %r20, 1016;
+	.loc	1 21 23                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:21:23
+	or.b32 	%r22, %r21, %r18;
+	.loc	1 24 19                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:24:19
+	bfe.s32 	%r23, %r17, 21, 1;
+	shr.u32 	%r24, %r23, 20;
+	add.s32 	%r25, %r22, %r24;
+	and.b32 	%r26, %r25, -4096;
+	sub.s32 	%r27, %r22, %r26;
+	.loc	1 25 30                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:25:30
+	mul.wide.s32 	%rd10, %r22, 2;
+	add.s64 	%rd1, %rd6, %rd10;
+	.loc	1 25 35                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:25:35
+	// begin inline asm
+	mov.u32 %r1, 0x0;
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	mov.u32 %r4, 0x0;
+	ld.global.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ];
+	// end inline asm
+	.loc	1 26 30                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:26:30
+	mad.wide.s32 	%rd2, %r27, 2, %rd7;
+	.loc	1 26 35                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:26:35
+	// begin inline asm
+	mov.u64 %rd3, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd3, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r5, 0x0;
+	mov.u32 %r6, 0x0;
+	mov.u32 %r7, 0x0;
+	mov.u32 %r8, 0x0;
+	ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r5, %r6, %r7, %r8 }, [ %rd2 + 0 ], %rd3;
+	// end inline asm
+	.loc	1 27 30                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:27:30
+	add.s64 	%rd4, %rd8, %rd10;
+	.loc	1 27 35                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:27:35
+	// begin inline asm
+	mov.u32 %r9, 0x0;
+	mov.u32 %r10, 0x0;
+	mov.u32 %r11, 0x0;
+	mov.u32 %r12, 0x0;
+	ld.global.v4.b32 { %r9, %r10, %r11, %r12 }, [ %rd4 + 0 ];
+	// end inline asm
+	.loc	1 30 25                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:30:25
+	add.s64 	%rd5, %rd9, %rd10;
+	.loc	1 25 44                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:25:44
+	mov.b32 	{%rs1, %rs2}, %r1;
+	cvt.f32.bf16 	%r28, %rs2;
+	cvt.f32.bf16 	%r29, %rs1;
+	.loc	1 26 74                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:26:74
+	mov.b32 	{%rs3, %rs4}, %r5;
+	cvt.f32.bf16 	%r30, %rs4;
+	cvt.f32.bf16 	%r31, %rs3;
+	.loc	1 27 44                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:27:44
+	mov.b32 	{%rs5, %rs6}, %r9;
+	cvt.f32.bf16 	%r32, %rs6;
+	cvt.f32.bf16 	%r33, %rs5;
+	.loc	1 29 18                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:29:18
+	fma.rn.f32 	%r34, %r31, %r33, %r29;
+	fma.rn.f32 	%r35, %r30, %r32, %r28;
+	.loc	1 30 36                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:30:36
+	cvt.rn.bf16x2.f32 	%r13, %r35, %r34;
+	.loc	1 25 44                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:25:44
+	mov.b32 	{%rs7, %rs8}, %r2;
+	cvt.f32.bf16 	%r36, %rs8;
+	cvt.f32.bf16 	%r37, %rs7;
+	.loc	1 26 74                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:26:74
+	mov.b32 	{%rs9, %rs10}, %r6;
+	cvt.f32.bf16 	%r38, %rs10;
+	cvt.f32.bf16 	%r39, %rs9;
+	.loc	1 27 44                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:27:44
+	mov.b32 	{%rs11, %rs12}, %r10;
+	cvt.f32.bf16 	%r40, %rs12;
+	cvt.f32.bf16 	%r41, %rs11;
+	.loc	1 29 18                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:29:18
+	fma.rn.f32 	%r42, %r39, %r41, %r37;
+	fma.rn.f32 	%r43, %r38, %r40, %r36;
+	.loc	1 30 36                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:30:36
+	cvt.rn.bf16x2.f32 	%r14, %r43, %r42;
+	.loc	1 25 44                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:25:44
+	mov.b32 	{%rs13, %rs14}, %r3;
+	cvt.f32.bf16 	%r44, %rs14;
+	cvt.f32.bf16 	%r45, %rs13;
+	.loc	1 26 74                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:26:74
+	mov.b32 	{%rs15, %rs16}, %r7;
+	cvt.f32.bf16 	%r46, %rs16;
+	cvt.f32.bf16 	%r47, %rs15;
+	.loc	1 27 44                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:27:44
+	mov.b32 	{%rs17, %rs18}, %r11;
+	cvt.f32.bf16 	%r48, %rs18;
+	cvt.f32.bf16 	%r49, %rs17;
+	.loc	1 29 18                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:29:18
+	fma.rn.f32 	%r50, %r47, %r49, %r45;
+	fma.rn.f32 	%r51, %r46, %r48, %r44;
+	.loc	1 30 36                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:30:36
+	cvt.rn.bf16x2.f32 	%r15, %r51, %r50;
+	.loc	1 25 44                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:25:44
+	mov.b32 	{%rs19, %rs20}, %r4;
+	cvt.f32.bf16 	%r52, %rs20;
+	cvt.f32.bf16 	%r53, %rs19;
+	.loc	1 26 74                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:26:74
+	mov.b32 	{%rs21, %rs22}, %r8;
+	cvt.f32.bf16 	%r54, %rs22;
+	cvt.f32.bf16 	%r55, %rs21;
+	.loc	1 27 44                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:27:44
+	mov.b32 	{%rs23, %rs24}, %r12;
+	cvt.f32.bf16 	%r56, %rs24;
+	cvt.f32.bf16 	%r57, %rs23;
+	.loc	1 29 18                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:29:18
+	fma.rn.f32 	%r58, %r55, %r57, %r53;
+	fma.rn.f32 	%r59, %r54, %r56, %r52;
+	.loc	1 30 36                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:30:36
+	cvt.rn.bf16x2.f32 	%r16, %r59, %r58;
+	// begin inline asm
+	st.global.v4.b32 [ %rd5 + 0 ], { %r13, %r14, %r15, %r16 };
+	// end inline asm
+	.loc	1 30 4                          // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:30:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 120
+.b8 106
+.b8 52
+.b8 112
+.b8 53
+.b8 51
+.b8 104
+.b8 111
+.b8 116
+.b8 118
+.b8 119
+.b8 51
+.b8 51
+.b8 54
+.b8 119
+.b8 52
+.b8 106
+.b8 54
+.b8 106
+.b8 54
+.b8 110
+.b8 108
+.b8 121
+.b8 100
+.b8 119
+.b8 120
+.b8 122
+.b8 114
+.b8 115
+.b8 52
+.b8 104
+.b8 104
+.b8 107
+.b8 106
+.b8 52
+.b8 50
+.b8 104
+.b8 111
+.b8 102
+.b8 108
+.b8 111
+.b8 116
+.b8 50
+.b8 110
+.b8 115
+.b8 122
+.b8 113
+.b8 122
+.b8 113
+.b8 51
+.b8 117
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 120
+.b8 106
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.source b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..9540e6aec96ba1c2af2514155a8ddd8b706766c5
--- /dev/null
+++ b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.source
@@ -0,0 +1,82 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":18:0)
+#loc22 = loc("in_ptr0"(#loc))
+#loc23 = loc("in_ptr1"(#loc))
+#loc24 = loc("in_ptr2"(#loc))
+#loc25 = loc("out_ptr0"(#loc))
+#loc26 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_add_mul_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 9437184 : i32 loc(#loc27)
+    %xoffset = tt.get_program_id x : i32 loc(#loc28)
+    %xoffset_1 = arith.constant 1024 : i32 loc(#loc29)
+    %xoffset_2 = arith.constant 1024 : i32 loc(#loc29)
+    %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc29)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc30)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc31)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc31)
+    %xmask = arith.constant true loc(#loc32)
+    %xmask_6 = arith.constant dense<true> : tensor<1024xi1> loc(#loc32)
+    %x0 = arith.constant 4096 : i32 loc(#loc33)
+    %x0_7 = arith.constant 4096 : i32 loc(#loc33)
+    %x0_8 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc33)
+    %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<1024xi32> loc(#loc33)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc34)
+    %tmp0_10 = tt.addptr %tmp0, %xindex_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc34)
+    %tmp0_11 = tt.load %tmp0_10 : tensor<1024x!tt.ptr<bf16>> loc(#loc35)
+    %tmp0_12 = arith.extf %tmp0_11 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc36)
+    %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc37)
+    %tmp1_13 = tt.addptr %tmp1, %x0_9 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc37)
+    %tmp1_14 = tt.load %tmp1_13 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc38)
+    %tmp1_15 = arith.extf %tmp1_14 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc39)
+    %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc40)
+    %tmp2_16 = tt.addptr %tmp2, %xindex_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc40)
+    %tmp2_17 = tt.load %tmp2_16 : tensor<1024x!tt.ptr<bf16>> loc(#loc41)
+    %tmp2_18 = arith.extf %tmp2_17 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc42)
+    %tmp3 = arith.mulf %tmp1_15, %tmp2_18 : tensor<1024xf32> loc(#loc43)
+    %tmp4 = arith.addf %tmp0_12, %tmp3 : tensor<1024xf32> loc(#loc44)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc19)
+    %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc19)
+    %2 = arith.truncf %tmp4 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc20)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>> loc(#loc20)
+    tt.return loc(#loc21)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":22:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":24:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:30)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:35)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:44)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:30)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:35)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:74)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:35)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:44)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":28:18)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":29:18)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:25)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:36)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:4)
+#loc27 = loc("xnumel"(#loc1))
+#loc28 = loc("xoffset"(#loc2))
+#loc29 = loc("xoffset"(#loc3))
+#loc30 = loc("xindex"(#loc4))
+#loc31 = loc("xindex"(#loc5))
+#loc32 = loc("xmask"(#loc6))
+#loc33 = loc("x0"(#loc7))
+#loc34 = loc("tmp0"(#loc8))
+#loc35 = loc("tmp0"(#loc9))
+#loc36 = loc("tmp0"(#loc10))
+#loc37 = loc("tmp1"(#loc11))
+#loc38 = loc("tmp1"(#loc12))
+#loc39 = loc("tmp1"(#loc13))
+#loc40 = loc("tmp2"(#loc14))
+#loc41 = loc("tmp2"(#loc15))
+#loc42 = loc("tmp2"(#loc16))
+#loc43 = loc("tmp3"(#loc17))
+#loc44 = loc("tmp4"(#loc18))
diff --git a/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.ttgir b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..63e2f8fa8bf6aa63baaa5418d52ef014ce10516d
--- /dev/null
+++ b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.ttgir
@@ -0,0 +1,74 @@
+#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":18:0)
+#loc21 = loc("in_ptr0"(#loc))
+#loc22 = loc("in_ptr1"(#loc))
+#loc23 = loc("in_ptr2"(#loc))
+#loc24 = loc("out_ptr0"(#loc))
+#loc25 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused_add_mul_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4096> : tensor<1024xi32, #blocked> loc(#loc1)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc26)
+    %xoffset_0 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc27)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc28)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<1024xi32, #blocked> loc(#loc29)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<1024xi32, #blocked> loc(#loc29)
+    %x0 = arith.remsi %xindex_2, %cst : tensor<1024xi32, #blocked> loc(#loc30)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc31)
+    %tmp0_3 = tt.addptr %tmp0, %xindex_2 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc31)
+    %tmp0_4 = tt.load %tmp0_3 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc32)
+    %tmp0_5 = arith.extf %tmp0_4 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc33)
+    %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc34)
+    %tmp1_6 = tt.addptr %tmp1, %x0 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc34)
+    %tmp1_7 = tt.load %tmp1_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc35)
+    %tmp1_8 = arith.extf %tmp1_7 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc36)
+    %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc37)
+    %tmp2_9 = tt.addptr %tmp2, %xindex_2 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc37)
+    %tmp2_10 = tt.load %tmp2_9 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc38)
+    %tmp2_11 = arith.extf %tmp2_10 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc39)
+    %tmp3 = arith.mulf %tmp1_8, %tmp2_11 : tensor<1024xf32, #blocked> loc(#loc40)
+    %tmp4 = arith.addf %tmp0_5, %tmp3 : tensor<1024xf32, #blocked> loc(#loc41)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc18)
+    %1 = tt.addptr %0, %xindex_2 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc18)
+    %2 = arith.truncf %tmp4 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> loc(#loc19)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc19)
+    tt.return loc(#loc20)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":24:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:30)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:35)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:44)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:30)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:74)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:30)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:35)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:44)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":28:18)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":29:18)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:25)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:36)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:4)
+#loc26 = loc("xoffset"(#loc2))
+#loc27 = loc("xoffset"(#loc3))
+#loc28 = loc("xindex"(#loc4))
+#loc29 = loc("xindex"(#loc5))
+#loc30 = loc("x0"(#loc6))
+#loc31 = loc("tmp0"(#loc7))
+#loc32 = loc("tmp0"(#loc8))
+#loc33 = loc("tmp0"(#loc9))
+#loc34 = loc("tmp1"(#loc10))
+#loc35 = loc("tmp1"(#loc11))
+#loc36 = loc("tmp1"(#loc12))
+#loc37 = loc("tmp2"(#loc13))
+#loc38 = loc("tmp2"(#loc14))
+#loc39 = loc("tmp2"(#loc15))
+#loc40 = loc("tmp3"(#loc16))
+#loc41 = loc("tmp4"(#loc17))
diff --git a/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.ttir b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..fa71f8cc16761ebc1193fc79c01a58067123235e
--- /dev/null
+++ b/triton/WJMADSUCHEDSCOLWBJQFN3ERNWG6GZG72GNV74XDUGQSSKHQEOCA/triton_poi_fused_add_mul_0.ttir
@@ -0,0 +1,73 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":18:0)
+#loc21 = loc("in_ptr0"(#loc))
+#loc22 = loc("in_ptr1"(#loc))
+#loc23 = loc("in_ptr2"(#loc))
+#loc24 = loc("out_ptr0"(#loc))
+#loc25 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_add_mul_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %x0 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc26)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc2)
+    %xoffset = tt.get_program_id x : i32 loc(#loc27)
+    %xoffset_0 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc28)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc29)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<1024xi32> loc(#loc30)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<1024xi32> loc(#loc30)
+    %x0_3 = arith.remsi %xindex_2, %x0 : tensor<1024xi32> loc(#loc26)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc31)
+    %tmp0_4 = tt.addptr %tmp0, %xindex_2 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc31)
+    %tmp0_5 = tt.load %tmp0_4 : tensor<1024x!tt.ptr<bf16>> loc(#loc32)
+    %tmp0_6 = arith.extf %tmp0_5 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc33)
+    %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc34)
+    %tmp1_7 = tt.addptr %tmp1, %x0_3 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc34)
+    %tmp1_8 = tt.load %tmp1_7 evictionPolicy = evict_last : tensor<1024x!tt.ptr<bf16>> loc(#loc35)
+    %tmp1_9 = arith.extf %tmp1_8 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc36)
+    %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc37)
+    %tmp2_10 = tt.addptr %tmp2, %xindex_2 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc37)
+    %tmp2_11 = tt.load %tmp2_10 : tensor<1024x!tt.ptr<bf16>> loc(#loc38)
+    %tmp2_12 = arith.extf %tmp2_11 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc39)
+    %tmp3 = arith.mulf %tmp1_9, %tmp2_12 : tensor<1024xf32> loc(#loc40)
+    %tmp4 = arith.addf %tmp0_6, %tmp3 : tensor<1024xf32> loc(#loc41)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc18)
+    %1 = tt.addptr %0, %xindex_2 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc18)
+    %2 = arith.truncf %tmp4 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc19)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>> loc(#loc19)
+    tt.return loc(#loc20)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":24:19)
+#loc2 = loc(unknown)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":20:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":20:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":21:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":21:23)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:30)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:35)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:44)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:30)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:74)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:30)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:35)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:44)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":28:18)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":29:18)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:25)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:36)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:4)
+#loc26 = loc("x0"(#loc1))
+#loc27 = loc("xoffset"(#loc3))
+#loc28 = loc("xoffset"(#loc4))
+#loc29 = loc("xindex"(#loc5))
+#loc30 = loc("xindex"(#loc6))
+#loc31 = loc("tmp0"(#loc7))
+#loc32 = loc("tmp0"(#loc8))
+#loc33 = loc("tmp0"(#loc9))
+#loc34 = loc("tmp1"(#loc10))
+#loc35 = loc("tmp1"(#loc11))
+#loc36 = loc("tmp1"(#loc12))
+#loc37 = loc("tmp2"(#loc13))
+#loc38 = loc("tmp2"(#loc14))
+#loc39 = loc("tmp2"(#loc15))
+#loc40 = loc("tmp3"(#loc16))
+#loc41 = loc("tmp4"(#loc17))
diff --git a/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/__grp__triton_poi_fused_clone_permute_1.json b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/__grp__triton_poi_fused_clone_permute_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..304084d10ec2828903eadd2c674840d3469700a4
--- /dev/null
+++ b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/__grp__triton_poi_fused_clone_permute_1.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused_clone_permute_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.source", "triton_poi_fused_clone_permute_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.ttir", "triton_poi_fused_clone_permute_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.ttgir", "triton_poi_fused_clone_permute_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.llir", "triton_poi_fused_clone_permute_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.ptx", "triton_poi_fused_clone_permute_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.cubin", "triton_poi_fused_clone_permute_1.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.json"}}
\ No newline at end of file
diff --git a/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.cubin b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..e4fb21bd4ba1dd64279e04556079967eca7c3a06
Binary files /dev/null and b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.cubin differ
diff --git a/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.json b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..ff624dab70ae2155de636afc17772c34f7f2ddfa
--- /dev/null
+++ b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.json
@@ -0,0 +1 @@
+{"hash": "b58096ccc76c88b9782fa5ae5908130e1b1cb1dbccc89f07b1572d968f5f2557", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_clone_permute_1"}
\ No newline at end of file
diff --git a/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.llir b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.llir
new file mode 100644
index 0000000000000000000000000000000000000000..de514b90391d11c66700ae9478eb24e360effc68
--- /dev/null
+++ b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.llir
@@ -0,0 +1,71 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused_clone_permute_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 {
+  %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %7 = shl i32 %6, 10, !dbg !8
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %9 = shl nuw nsw i32 %8, 3, !dbg !9
+  %10 = and i32 %9, 1016, !dbg !9
+  %11 = or disjoint i32 %10, %7, !dbg !10
+  %12 = sdiv i32 %11, 128, !dbg !11
+  %13 = mul i32 %12, 128, !dbg !12
+  %.decomposed = sub i32 %11, %13, !dbg !12
+  %14 = srem i32 %12, 32, !dbg !13
+  %15 = sdiv i32 %11, 4096, !dbg !14
+  %16 = shl nsw i32 %15, 7, !dbg !15
+  %17 = add nsw i32 %16, %.decomposed, !dbg !16
+  %18 = mul nsw i32 %14, 294912, !dbg !17
+  %19 = add nsw i32 %17, %18, !dbg !18
+  %20 = sext i32 %19 to i64, !dbg !19
+  %21 = getelementptr bfloat, ptr addrspace(1) %0, i64 %20, !dbg !19
+  %22 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %21) #2, !dbg !20
+  %23 = extractvalue { i32, i32, i32, i32 } %22, 0, !dbg !20
+  %24 = extractvalue { i32, i32, i32, i32 } %22, 1, !dbg !20
+  %25 = extractvalue { i32, i32, i32, i32 } %22, 2, !dbg !20
+  %26 = extractvalue { i32, i32, i32, i32 } %22, 3, !dbg !20
+  %27 = sext i32 %11 to i64, !dbg !21
+  %28 = getelementptr bfloat, ptr addrspace(1) %1, i64 %27, !dbg !21
+  tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %23, i32 %24, i32 %25, i32 %26, ptr addrspace(1) %28) #2, !dbg !22
+  ret void, !dbg !23
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused_clone_permute_1", linkageName: "triton_poi_fused_clone_permute_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 20, column: 28, scope: !4)
+!8 = !DILocation(line: 20, column: 33, scope: !4)
+!9 = !DILocation(line: 21, column: 36, scope: !4)
+!10 = !DILocation(line: 21, column: 23, scope: !4)
+!11 = !DILocation(line: 24, column: 21, scope: !4)
+!12 = !DILocation(line: 23, column: 19, scope: !4)
+!13 = !DILocation(line: 24, column: 28, scope: !4)
+!14 = !DILocation(line: 25, column: 19, scope: !4)
+!15 = !DILocation(line: 27, column: 39, scope: !4)
+!16 = !DILocation(line: 27, column: 35, scope: !4)
+!17 = !DILocation(line: 27, column: 51, scope: !4)
+!18 = !DILocation(line: 27, column: 44, scope: !4)
+!19 = !DILocation(line: 27, column: 30, scope: !4)
+!20 = !DILocation(line: 27, column: 56, scope: !4)
+!21 = !DILocation(line: 28, column: 25, scope: !4)
+!22 = !DILocation(line: 28, column: 36, scope: !4)
+!23 = !DILocation(line: 28, column: 4, scope: !4)
diff --git a/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.ptx b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..4460a694435f14748c57ff63fffe06869ac59cee
--- /dev/null
+++ b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.ptx
@@ -0,0 +1,327 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused_clone_permute_1 // -- Begin function triton_poi_fused_clone_permute_1
+                                        // @triton_poi_fused_clone_permute_1
+.visible .entry triton_poi_fused_clone_permute_1(
+	.param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_1_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_1_param_1,
+	.param .u32 triton_poi_fused_clone_permute_1_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_1_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_1_param_4
+)
+.reqntid 128
+{
+	.reg .b32 	%r<27>;
+	.reg .b64 	%rd<5>;
+	.loc	1 18 0                          // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd3, [triton_poi_fused_clone_permute_1_param_0];
+	ld.param.b64 	%rd4, [triton_poi_fused_clone_permute_1_param_1];
+$L__tmp0:
+	.loc	1 20 28                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:20:28
+	mov.u32 	%r5, %ctaid.x;
+	.loc	1 20 33                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:20:33
+	shl.b32 	%r6, %r5, 10;
+	.loc	1 21 36                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:21:36
+	mov.u32 	%r7, %tid.x;
+	shl.b32 	%r8, %r7, 3;
+	and.b32 	%r9, %r8, 1016;
+	.loc	1 21 23                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:21:23
+	or.b32 	%r10, %r9, %r6;
+	.loc	1 24 21                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:24:21
+	bfe.s32 	%r11, %r5, 21, 1;
+	shr.u32 	%r12, %r11, 25;
+	add.s32 	%r13, %r10, %r12;
+	shr.s32 	%r14, %r13, 7;
+	.loc	1 23 19                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:23:19
+	and.b32 	%r15, %r13, -128;
+	sub.s32 	%r16, %r10, %r15;
+	.loc	1 24 28                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:24:28
+	shr.u32 	%r17, %r14, 27;
+	add.s32 	%r18, %r14, %r17;
+	and.b32 	%r19, %r18, 131040;
+	sub.s32 	%r20, %r14, %r19;
+	.loc	1 25 19                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:25:19
+	shr.u32 	%r21, %r11, 20;
+	add.s32 	%r22, %r10, %r21;
+	shr.s32 	%r23, %r22, 12;
+	.loc	1 27 39                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:27:39
+	shl.b32 	%r24, %r23, 7;
+	.loc	1 27 35                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:27:35
+	add.s32 	%r25, %r24, %r16;
+	.loc	1 27 44                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:27:44
+	mad.lo.s32 	%r26, %r20, 294912, %r25;
+	.loc	1 27 30                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:27:30
+	mad.wide.s32 	%rd1, %r26, 2, %rd3;
+	.loc	1 27 56                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:27:56
+	// begin inline asm
+	mov.u32 %r1, 0x0;
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	mov.u32 %r4, 0x0;
+	ld.global.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ];
+	// end inline asm
+	.loc	1 28 25                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:28:25
+	mad.wide.s32 	%rd2, %r10, 2, %rd4;
+	.loc	1 28 36                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:28:36
+	// begin inline asm
+	st.global.v4.b32 [ %rd2 + 0 ], { %r1, %r2, %r3, %r4 };
+	// end inline asm
+	.loc	1 28 4                          // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:28:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 114
+.b8 52
+.b8 99
+.b8 51
+.b8 114
+.b8 108
+.b8 98
+.b8 99
+.b8 54
+.b8 51
+.b8 106
+.b8 50
+.b8 113
+.b8 121
+.b8 111
+.b8 101
+.b8 51
+.b8 108
+.b8 54
+.b8 50
+.b8 109
+.b8 118
+.b8 98
+.b8 99
+.b8 114
+.b8 109
+.b8 116
+.b8 52
+.b8 120
+.b8 53
+.b8 103
+.b8 112
+.b8 100
+.b8 110
+.b8 50
+.b8 55
+.b8 100
+.b8 118
+.b8 112
+.b8 101
+.b8 110
+.b8 108
+.b8 99
+.b8 103
+.b8 116
+.b8 109
+.b8 116
+.b8 55
+.b8 52
+.b8 107
+.b8 53
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 114
+.b8 52
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.source b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.source
new file mode 100644
index 0000000000000000000000000000000000000000..8b82e37670979443529030dc38b4f14210659a70
--- /dev/null
+++ b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.source
@@ -0,0 +1,90 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":18:0)
+#loc21 = loc("in_ptr0"(#loc))
+#loc22 = loc("out_ptr0"(#loc))
+#loc23 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_clone_permute_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 9437184 : i32 loc(#loc24)
+    %xoffset = tt.get_program_id x : i32 loc(#loc25)
+    %xoffset_1 = arith.constant 1024 : i32 loc(#loc26)
+    %xoffset_2 = arith.constant 1024 : i32 loc(#loc26)
+    %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc26)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc27)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc28)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc28)
+    %xmask = arith.constant true loc(#loc29)
+    %xmask_6 = arith.constant dense<true> : tensor<1024xi1> loc(#loc29)
+    %x0 = arith.constant 128 : i32 loc(#loc30)
+    %x0_7 = arith.constant 128 : i32 loc(#loc30)
+    %x0_8 = arith.constant dense<128> : tensor<1024xi32> loc(#loc30)
+    %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<1024xi32> loc(#loc30)
+    %x1 = arith.constant 128 : i32 loc(#loc31)
+    %x1_10 = arith.constant 128 : i32 loc(#loc31)
+    %x1_11 = arith.constant dense<128> : tensor<1024xi32> loc(#loc31)
+    %x1_12 = arith.divsi %xindex_5, %x1_11 : tensor<1024xi32> loc(#loc31)
+    %x1_13 = arith.constant 32 : i32 loc(#loc32)
+    %x1_14 = arith.constant 32 : i32 loc(#loc32)
+    %x1_15 = arith.constant dense<32> : tensor<1024xi32> loc(#loc32)
+    %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<1024xi32> loc(#loc32)
+    %x2 = arith.constant 4096 : i32 loc(#loc33)
+    %x2_17 = arith.constant 4096 : i32 loc(#loc33)
+    %x2_18 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc33)
+    %x2_19 = arith.divsi %xindex_5, %x2_18 : tensor<1024xi32> loc(#loc33)
+    %tmp0 = arith.constant 128 : i32 loc(#loc34)
+    %tmp0_20 = arith.constant 128 : i32 loc(#loc34)
+    %tmp0_21 = arith.constant dense<128> : tensor<1024xi32> loc(#loc34)
+    %tmp0_22 = arith.muli %tmp0_21, %x2_19 : tensor<1024xi32> loc(#loc34)
+    %tmp0_23 = arith.addi %x0_9, %tmp0_22 : tensor<1024xi32> loc(#loc35)
+    %tmp0_24 = arith.constant 294912 : i32 loc(#loc36)
+    %tmp0_25 = arith.constant 294912 : i32 loc(#loc36)
+    %tmp0_26 = arith.constant dense<294912> : tensor<1024xi32> loc(#loc36)
+    %tmp0_27 = arith.muli %tmp0_26, %x1_16 : tensor<1024xi32> loc(#loc36)
+    %tmp0_28 = arith.addi %tmp0_23, %tmp0_27 : tensor<1024xi32> loc(#loc37)
+    %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc38)
+    %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc38)
+    %tmp0_31 = tt.load %tmp0_30 : tensor<1024x!tt.ptr<bf16>> loc(#loc39)
+    %tmp0_32 = arith.extf %tmp0_31 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc40)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc18)
+    %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc18)
+    %2 = arith.truncf %tmp0_32 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc19)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>> loc(#loc19)
+    tt.return loc(#loc20)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":22:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":23:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":24:21)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":24:28)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":25:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:39)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:35)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:51)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:44)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:30)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:56)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:65)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:25)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:36)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:4)
+#loc24 = loc("xnumel"(#loc1))
+#loc25 = loc("xoffset"(#loc2))
+#loc26 = loc("xoffset"(#loc3))
+#loc27 = loc("xindex"(#loc4))
+#loc28 = loc("xindex"(#loc5))
+#loc29 = loc("xmask"(#loc6))
+#loc30 = loc("x0"(#loc7))
+#loc31 = loc("x1"(#loc8))
+#loc32 = loc("x1"(#loc9))
+#loc33 = loc("x2"(#loc10))
+#loc34 = loc("tmp0"(#loc11))
+#loc35 = loc("tmp0"(#loc12))
+#loc36 = loc("tmp0"(#loc13))
+#loc37 = loc("tmp0"(#loc14))
+#loc38 = loc("tmp0"(#loc15))
+#loc39 = loc("tmp0"(#loc16))
+#loc40 = loc("tmp0"(#loc17))
diff --git a/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.ttgir b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..77c612c546d284b64ccc582e7d74da0c22a8602f
--- /dev/null
+++ b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.ttgir
@@ -0,0 +1,66 @@
+#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":18:0)
+#loc19 = loc("in_ptr0"(#loc))
+#loc20 = loc("out_ptr0"(#loc))
+#loc21 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused_clone_permute_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<128> : tensor<1024xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<32> : tensor<1024xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<4096> : tensor<1024xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<294912> : tensor<1024xi32, #blocked> loc(#loc1)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc22)
+    %xoffset_3 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc23)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc24)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32, #blocked> loc(#loc25)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32, #blocked> loc(#loc25)
+    %x0 = arith.remsi %xindex_5, %cst : tensor<1024xi32, #blocked> loc(#loc26)
+    %x1 = arith.divsi %xindex_5, %cst : tensor<1024xi32, #blocked> loc(#loc27)
+    %x1_6 = arith.remsi %x1, %cst_0 : tensor<1024xi32, #blocked> loc(#loc28)
+    %x2 = arith.divsi %xindex_5, %cst_1 : tensor<1024xi32, #blocked> loc(#loc29)
+    %tmp0 = arith.muli %x2, %cst : tensor<1024xi32, #blocked> loc(#loc30)
+    %tmp0_7 = arith.addi %x0, %tmp0 : tensor<1024xi32, #blocked> loc(#loc31)
+    %tmp0_8 = arith.muli %x1_6, %cst_2 : tensor<1024xi32, #blocked> loc(#loc32)
+    %tmp0_9 = arith.addi %tmp0_7, %tmp0_8 : tensor<1024xi32, #blocked> loc(#loc33)
+    %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc34)
+    %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc34)
+    %tmp0_12 = tt.load %tmp0_11 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc35)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc16)
+    %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc16)
+    tt.store %1, %tmp0_12 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc17)
+    tt.return loc(#loc18)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":23:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":24:21)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":24:28)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":25:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:39)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:51)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:44)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:56)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:25)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:36)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:4)
+#loc22 = loc("xoffset"(#loc2))
+#loc23 = loc("xoffset"(#loc3))
+#loc24 = loc("xindex"(#loc4))
+#loc25 = loc("xindex"(#loc5))
+#loc26 = loc("x0"(#loc6))
+#loc27 = loc("x1"(#loc7))
+#loc28 = loc("x1"(#loc8))
+#loc29 = loc("x2"(#loc9))
+#loc30 = loc("tmp0"(#loc10))
+#loc31 = loc("tmp0"(#loc11))
+#loc32 = loc("tmp0"(#loc12))
+#loc33 = loc("tmp0"(#loc13))
+#loc34 = loc("tmp0"(#loc14))
+#loc35 = loc("tmp0"(#loc15))
diff --git a/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.ttir b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..6ae9ccdfcf824b0acaa9121b4dc52364580189f6
--- /dev/null
+++ b/triton/WWAJNTGHNSELS6BPUWXFSCATBYNRZMO3ZTEJ6B5RK4WZND27EVLQ/triton_poi_fused_clone_permute_1.ttir
@@ -0,0 +1,65 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":18:0)
+#loc19 = loc("in_ptr0"(#loc))
+#loc20 = loc("out_ptr0"(#loc))
+#loc21 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_clone_permute_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %tmp0 = arith.constant dense<294912> : tensor<1024xi32> loc(#loc22)
+    %x2 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc23)
+    %x1 = arith.constant dense<32> : tensor<1024xi32> loc(#loc24)
+    %cst = arith.constant dense<128> : tensor<1024xi32> loc(#loc4)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc4)
+    %xoffset = tt.get_program_id x : i32 loc(#loc25)
+    %xoffset_0 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc26)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc27)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<1024xi32> loc(#loc28)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<1024xi32> loc(#loc28)
+    %x0 = arith.remsi %xindex_2, %cst : tensor<1024xi32> loc(#loc29)
+    %x1_3 = arith.divsi %xindex_2, %cst : tensor<1024xi32> loc(#loc30)
+    %x1_4 = arith.remsi %x1_3, %x1 : tensor<1024xi32> loc(#loc24)
+    %x2_5 = arith.divsi %xindex_2, %x2 : tensor<1024xi32> loc(#loc23)
+    %tmp0_6 = arith.muli %x2_5, %cst : tensor<1024xi32> loc(#loc31)
+    %tmp0_7 = arith.addi %x0, %tmp0_6 : tensor<1024xi32> loc(#loc32)
+    %tmp0_8 = arith.muli %x1_4, %tmp0 : tensor<1024xi32> loc(#loc22)
+    %tmp0_9 = arith.addi %tmp0_7, %tmp0_8 : tensor<1024xi32> loc(#loc33)
+    %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc34)
+    %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc34)
+    %tmp0_12 = tt.load %tmp0_11 : tensor<1024x!tt.ptr<bf16>> loc(#loc35)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc16)
+    %1 = tt.addptr %0, %xindex_2 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc16)
+    tt.store %1, %tmp0_12 : tensor<1024x!tt.ptr<bf16>> loc(#loc17)
+    tt.return loc(#loc18)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:51)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":25:19)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":24:28)
+#loc4 = loc(unknown)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":20:28)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":20:33)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":21:36)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":21:23)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":23:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":24:21)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:39)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:35)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:44)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:56)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:25)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:36)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:4)
+#loc22 = loc("tmp0"(#loc1))
+#loc23 = loc("x2"(#loc2))
+#loc24 = loc("x1"(#loc3))
+#loc25 = loc("xoffset"(#loc5))
+#loc26 = loc("xoffset"(#loc6))
+#loc27 = loc("xindex"(#loc7))
+#loc28 = loc("xindex"(#loc8))
+#loc29 = loc("x0"(#loc9))
+#loc30 = loc("x1"(#loc10))
+#loc31 = loc("tmp0"(#loc11))
+#loc32 = loc("tmp0"(#loc12))
+#loc33 = loc("tmp0"(#loc13))
+#loc34 = loc("tmp0"(#loc14))
+#loc35 = loc("tmp0"(#loc15))
diff --git a/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/__grp__triton_poi_fused_clone_permute_1.json b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/__grp__triton_poi_fused_clone_permute_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..1bc54924737d501726f01bb9fd8fdd3951d1ca27
--- /dev/null
+++ b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/__grp__triton_poi_fused_clone_permute_1.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused_clone_permute_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.source", "triton_poi_fused_clone_permute_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.ttir", "triton_poi_fused_clone_permute_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.ttgir", "triton_poi_fused_clone_permute_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.llir", "triton_poi_fused_clone_permute_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.ptx", "triton_poi_fused_clone_permute_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.cubin", "triton_poi_fused_clone_permute_1.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.json"}}
\ No newline at end of file
diff --git a/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.cubin b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..e0ea9623516a2806328d1fe07b2cb84ce5ae810f
Binary files /dev/null and b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.cubin differ
diff --git a/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.json b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..bd4e606c4ff8ace4220b6b8eae55216e47f9533b
--- /dev/null
+++ b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.json
@@ -0,0 +1 @@
+{"hash": "bfb8576bfbee2023d20438237604e2d39df47eddc6b149a5df6c548c361dfe81", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_clone_permute_1"}
\ No newline at end of file
diff --git a/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.llir b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.llir
new file mode 100644
index 0000000000000000000000000000000000000000..f72e59ae2b240fd1d56f3a6041af45c7aa7c26a2
--- /dev/null
+++ b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.llir
@@ -0,0 +1,67 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused_clone_permute_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 {
+  %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %7 = shl i32 %6, 9, !dbg !8
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %9 = shl nuw nsw i32 %8, 1, !dbg !9
+  %10 = and i32 %9, 510, !dbg !9
+  %11 = or disjoint i32 %10, %7, !dbg !10
+  %12 = sdiv i32 %11, 128, !dbg !11
+  %13 = mul i32 %12, 128, !dbg !12
+  %.decomposed = sub i32 %11, %13, !dbg !12
+  %14 = srem i32 %12, 32, !dbg !13
+  %15 = sdiv i32 %11, 4096, !dbg !14
+  %16 = shl nsw i32 %15, 7, !dbg !15
+  %17 = add nsw i32 %16, %.decomposed, !dbg !16
+  %18 = mul nsw i32 %14, 294912, !dbg !17
+  %19 = add nsw i32 %17, %18, !dbg !18
+  %20 = sext i32 %19 to i64, !dbg !19
+  %21 = getelementptr bfloat, ptr addrspace(1) %0, i64 %20, !dbg !19
+  %22 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %21) #2, !dbg !20
+  %23 = sext i32 %11 to i64, !dbg !21
+  %24 = getelementptr bfloat, ptr addrspace(1) %1, i64 %23, !dbg !21
+  tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %22, ptr addrspace(1) %24) #2, !dbg !22
+  ret void, !dbg !23
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+attributes #0 = { nounwind "nvvm.reqntid"="256" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused_clone_permute_1", linkageName: "triton_poi_fused_clone_permute_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 20, column: 28, scope: !4)
+!8 = !DILocation(line: 20, column: 33, scope: !4)
+!9 = !DILocation(line: 21, column: 36, scope: !4)
+!10 = !DILocation(line: 21, column: 23, scope: !4)
+!11 = !DILocation(line: 24, column: 21, scope: !4)
+!12 = !DILocation(line: 23, column: 19, scope: !4)
+!13 = !DILocation(line: 24, column: 28, scope: !4)
+!14 = !DILocation(line: 25, column: 19, scope: !4)
+!15 = !DILocation(line: 27, column: 39, scope: !4)
+!16 = !DILocation(line: 27, column: 35, scope: !4)
+!17 = !DILocation(line: 27, column: 51, scope: !4)
+!18 = !DILocation(line: 27, column: 44, scope: !4)
+!19 = !DILocation(line: 27, column: 30, scope: !4)
+!20 = !DILocation(line: 27, column: 56, scope: !4)
+!21 = !DILocation(line: 28, column: 25, scope: !4)
+!22 = !DILocation(line: 28, column: 36, scope: !4)
+!23 = !DILocation(line: 28, column: 4, scope: !4)
diff --git a/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.ptx b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..fbe0faf8e74f8ef93c602bdbf09045b2dd8a40ec
--- /dev/null
+++ b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.ptx
@@ -0,0 +1,324 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused_clone_permute_1 // -- Begin function triton_poi_fused_clone_permute_1
+                                        // @triton_poi_fused_clone_permute_1
+.visible .entry triton_poi_fused_clone_permute_1(
+	.param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_1_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_1_param_1,
+	.param .u32 triton_poi_fused_clone_permute_1_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_1_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_clone_permute_1_param_4
+)
+.reqntid 256
+{
+	.reg .b32 	%r<24>;
+	.reg .b64 	%rd<5>;
+	.loc	1 18 0                          // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd3, [triton_poi_fused_clone_permute_1_param_0];
+	ld.param.b64 	%rd4, [triton_poi_fused_clone_permute_1_param_1];
+$L__tmp0:
+	.loc	1 20 28                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:20:28
+	mov.u32 	%r2, %ctaid.x;
+	.loc	1 20 33                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:20:33
+	shl.b32 	%r3, %r2, 9;
+	.loc	1 21 36                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:21:36
+	mov.u32 	%r4, %tid.x;
+	shl.b32 	%r5, %r4, 1;
+	and.b32 	%r6, %r5, 510;
+	.loc	1 21 23                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:21:23
+	or.b32 	%r7, %r6, %r3;
+	.loc	1 24 21                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:24:21
+	bfe.s32 	%r8, %r2, 22, 1;
+	shr.u32 	%r9, %r8, 25;
+	add.s32 	%r10, %r7, %r9;
+	shr.s32 	%r11, %r10, 7;
+	.loc	1 23 19                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:23:19
+	and.b32 	%r12, %r10, -128;
+	sub.s32 	%r13, %r7, %r12;
+	.loc	1 24 28                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:24:28
+	shr.u32 	%r14, %r11, 27;
+	add.s32 	%r15, %r11, %r14;
+	and.b32 	%r16, %r15, 131040;
+	sub.s32 	%r17, %r11, %r16;
+	.loc	1 25 19                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:25:19
+	shr.u32 	%r18, %r8, 20;
+	add.s32 	%r19, %r7, %r18;
+	shr.s32 	%r20, %r19, 12;
+	.loc	1 27 39                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:27:39
+	shl.b32 	%r21, %r20, 7;
+	.loc	1 27 35                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:27:35
+	add.s32 	%r22, %r21, %r13;
+	.loc	1 27 44                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:27:44
+	mad.lo.s32 	%r23, %r17, 294912, %r22;
+	.loc	1 27 30                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:27:30
+	mad.wide.s32 	%rd1, %r23, 2, %rd3;
+	.loc	1 27 56                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:27:56
+	// begin inline asm
+	mov.u32 %r1, 0x0;
+	ld.global.b32 { %r1 }, [ %rd1 + 0 ];
+	// end inline asm
+	.loc	1 28 25                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:28:25
+	mad.wide.s32 	%rd2, %r7, 2, %rd4;
+	.loc	1 28 36                         // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:28:36
+	// begin inline asm
+	st.global.b32 [ %rd2 + 0 ], { %r1 };
+	// end inline asm
+	.loc	1 28 4                          // cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py:28:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 114
+.b8 52
+.b8 99
+.b8 51
+.b8 114
+.b8 108
+.b8 98
+.b8 99
+.b8 54
+.b8 51
+.b8 106
+.b8 50
+.b8 113
+.b8 121
+.b8 111
+.b8 101
+.b8 51
+.b8 108
+.b8 54
+.b8 50
+.b8 109
+.b8 118
+.b8 98
+.b8 99
+.b8 114
+.b8 109
+.b8 116
+.b8 52
+.b8 120
+.b8 53
+.b8 103
+.b8 112
+.b8 100
+.b8 110
+.b8 50
+.b8 55
+.b8 100
+.b8 118
+.b8 112
+.b8 101
+.b8 110
+.b8 108
+.b8 99
+.b8 103
+.b8 116
+.b8 109
+.b8 116
+.b8 55
+.b8 52
+.b8 107
+.b8 53
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 114
+.b8 52
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.source b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.source
new file mode 100644
index 0000000000000000000000000000000000000000..8fd19f3ef7b5aaad73a872613ef77c41c55f73cd
--- /dev/null
+++ b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.source
@@ -0,0 +1,90 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":18:0)
+#loc21 = loc("in_ptr0"(#loc))
+#loc22 = loc("out_ptr0"(#loc))
+#loc23 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_clone_permute_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 9437184 : i32 loc(#loc24)
+    %xoffset = tt.get_program_id x : i32 loc(#loc25)
+    %xoffset_1 = arith.constant 512 : i32 loc(#loc26)
+    %xoffset_2 = arith.constant 512 : i32 loc(#loc26)
+    %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc26)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc27)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc28)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc28)
+    %xmask = arith.constant true loc(#loc29)
+    %xmask_6 = arith.constant dense<true> : tensor<512xi1> loc(#loc29)
+    %x0 = arith.constant 128 : i32 loc(#loc30)
+    %x0_7 = arith.constant 128 : i32 loc(#loc30)
+    %x0_8 = arith.constant dense<128> : tensor<512xi32> loc(#loc30)
+    %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<512xi32> loc(#loc30)
+    %x1 = arith.constant 128 : i32 loc(#loc31)
+    %x1_10 = arith.constant 128 : i32 loc(#loc31)
+    %x1_11 = arith.constant dense<128> : tensor<512xi32> loc(#loc31)
+    %x1_12 = arith.divsi %xindex_5, %x1_11 : tensor<512xi32> loc(#loc31)
+    %x1_13 = arith.constant 32 : i32 loc(#loc32)
+    %x1_14 = arith.constant 32 : i32 loc(#loc32)
+    %x1_15 = arith.constant dense<32> : tensor<512xi32> loc(#loc32)
+    %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<512xi32> loc(#loc32)
+    %x2 = arith.constant 4096 : i32 loc(#loc33)
+    %x2_17 = arith.constant 4096 : i32 loc(#loc33)
+    %x2_18 = arith.constant dense<4096> : tensor<512xi32> loc(#loc33)
+    %x2_19 = arith.divsi %xindex_5, %x2_18 : tensor<512xi32> loc(#loc33)
+    %tmp0 = arith.constant 128 : i32 loc(#loc34)
+    %tmp0_20 = arith.constant 128 : i32 loc(#loc34)
+    %tmp0_21 = arith.constant dense<128> : tensor<512xi32> loc(#loc34)
+    %tmp0_22 = arith.muli %tmp0_21, %x2_19 : tensor<512xi32> loc(#loc34)
+    %tmp0_23 = arith.addi %x0_9, %tmp0_22 : tensor<512xi32> loc(#loc35)
+    %tmp0_24 = arith.constant 294912 : i32 loc(#loc36)
+    %tmp0_25 = arith.constant 294912 : i32 loc(#loc36)
+    %tmp0_26 = arith.constant dense<294912> : tensor<512xi32> loc(#loc36)
+    %tmp0_27 = arith.muli %tmp0_26, %x1_16 : tensor<512xi32> loc(#loc36)
+    %tmp0_28 = arith.addi %tmp0_23, %tmp0_27 : tensor<512xi32> loc(#loc37)
+    %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc38)
+    %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc38)
+    %tmp0_31 = tt.load %tmp0_30 : tensor<512x!tt.ptr<bf16>> loc(#loc39)
+    %tmp0_32 = arith.extf %tmp0_31 : tensor<512xbf16> to tensor<512xf32> loc(#loc40)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc18)
+    %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc18)
+    %2 = arith.truncf %tmp0_32 : tensor<512xf32> to tensor<512xbf16> loc(#loc19)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>> loc(#loc19)
+    tt.return loc(#loc20)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":22:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":23:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":24:21)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":24:28)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":25:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:39)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:35)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:51)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:44)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:30)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:56)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:65)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:25)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:36)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:4)
+#loc24 = loc("xnumel"(#loc1))
+#loc25 = loc("xoffset"(#loc2))
+#loc26 = loc("xoffset"(#loc3))
+#loc27 = loc("xindex"(#loc4))
+#loc28 = loc("xindex"(#loc5))
+#loc29 = loc("xmask"(#loc6))
+#loc30 = loc("x0"(#loc7))
+#loc31 = loc("x1"(#loc8))
+#loc32 = loc("x1"(#loc9))
+#loc33 = loc("x2"(#loc10))
+#loc34 = loc("tmp0"(#loc11))
+#loc35 = loc("tmp0"(#loc12))
+#loc36 = loc("tmp0"(#loc13))
+#loc37 = loc("tmp0"(#loc14))
+#loc38 = loc("tmp0"(#loc15))
+#loc39 = loc("tmp0"(#loc16))
+#loc40 = loc("tmp0"(#loc17))
diff --git a/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.ttgir b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..c5e0fe984cd01748c1fff1ba3394b9b47e4c0d7d
--- /dev/null
+++ b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.ttgir
@@ -0,0 +1,66 @@
+#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":18:0)
+#loc19 = loc("in_ptr0"(#loc))
+#loc20 = loc("out_ptr0"(#loc))
+#loc21 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused_clone_permute_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<128> : tensor<512xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<32> : tensor<512xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<4096> : tensor<512xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<294912> : tensor<512xi32, #blocked> loc(#loc1)
+    %c512_i32 = arith.constant 512 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc22)
+    %xoffset_3 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc23)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc24)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32, #blocked> loc(#loc25)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32, #blocked> loc(#loc25)
+    %x0 = arith.remsi %xindex_5, %cst : tensor<512xi32, #blocked> loc(#loc26)
+    %x1 = arith.divsi %xindex_5, %cst : tensor<512xi32, #blocked> loc(#loc27)
+    %x1_6 = arith.remsi %x1, %cst_0 : tensor<512xi32, #blocked> loc(#loc28)
+    %x2 = arith.divsi %xindex_5, %cst_1 : tensor<512xi32, #blocked> loc(#loc29)
+    %tmp0 = arith.muli %x2, %cst : tensor<512xi32, #blocked> loc(#loc30)
+    %tmp0_7 = arith.addi %x0, %tmp0 : tensor<512xi32, #blocked> loc(#loc31)
+    %tmp0_8 = arith.muli %x1_6, %cst_2 : tensor<512xi32, #blocked> loc(#loc32)
+    %tmp0_9 = arith.addi %tmp0_7, %tmp0_8 : tensor<512xi32, #blocked> loc(#loc33)
+    %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc34)
+    %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc34)
+    %tmp0_12 = tt.load %tmp0_11 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc35)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc16)
+    %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc16)
+    tt.store %1, %tmp0_12 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc17)
+    tt.return loc(#loc18)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":23:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":24:21)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":24:28)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":25:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:39)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:51)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:44)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:56)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:25)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:36)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:4)
+#loc22 = loc("xoffset"(#loc2))
+#loc23 = loc("xoffset"(#loc3))
+#loc24 = loc("xindex"(#loc4))
+#loc25 = loc("xindex"(#loc5))
+#loc26 = loc("x0"(#loc6))
+#loc27 = loc("x1"(#loc7))
+#loc28 = loc("x1"(#loc8))
+#loc29 = loc("x2"(#loc9))
+#loc30 = loc("tmp0"(#loc10))
+#loc31 = loc("tmp0"(#loc11))
+#loc32 = loc("tmp0"(#loc12))
+#loc33 = loc("tmp0"(#loc13))
+#loc34 = loc("tmp0"(#loc14))
+#loc35 = loc("tmp0"(#loc15))
diff --git a/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.ttir b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..00b50056d5c6bc28caa71748fc7614cb1a9852ec
--- /dev/null
+++ b/triton/X64FO2735YQCHUQEHARXMBHC2OO7I7W5Y2YUTJO7NRKIYNQ572AQ/triton_poi_fused_clone_permute_1.ttir
@@ -0,0 +1,65 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":18:0)
+#loc19 = loc("in_ptr0"(#loc))
+#loc20 = loc("out_ptr0"(#loc))
+#loc21 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_clone_permute_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %tmp0 = arith.constant dense<294912> : tensor<512xi32> loc(#loc22)
+    %x2 = arith.constant dense<4096> : tensor<512xi32> loc(#loc23)
+    %x1 = arith.constant dense<32> : tensor<512xi32> loc(#loc24)
+    %cst = arith.constant dense<128> : tensor<512xi32> loc(#loc4)
+    %c512_i32 = arith.constant 512 : i32 loc(#loc4)
+    %xoffset = tt.get_program_id x : i32 loc(#loc25)
+    %xoffset_0 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc26)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc27)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<512xi32> loc(#loc28)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<512xi32> loc(#loc28)
+    %x0 = arith.remsi %xindex_2, %cst : tensor<512xi32> loc(#loc29)
+    %x1_3 = arith.divsi %xindex_2, %cst : tensor<512xi32> loc(#loc30)
+    %x1_4 = arith.remsi %x1_3, %x1 : tensor<512xi32> loc(#loc24)
+    %x2_5 = arith.divsi %xindex_2, %x2 : tensor<512xi32> loc(#loc23)
+    %tmp0_6 = arith.muli %x2_5, %cst : tensor<512xi32> loc(#loc31)
+    %tmp0_7 = arith.addi %x0, %tmp0_6 : tensor<512xi32> loc(#loc32)
+    %tmp0_8 = arith.muli %x1_4, %tmp0 : tensor<512xi32> loc(#loc22)
+    %tmp0_9 = arith.addi %tmp0_7, %tmp0_8 : tensor<512xi32> loc(#loc33)
+    %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc34)
+    %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc34)
+    %tmp0_12 = tt.load %tmp0_11 : tensor<512x!tt.ptr<bf16>> loc(#loc35)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc16)
+    %1 = tt.addptr %0, %xindex_2 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc16)
+    tt.store %1, %tmp0_12 : tensor<512x!tt.ptr<bf16>> loc(#loc17)
+    tt.return loc(#loc18)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:51)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":25:19)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":24:28)
+#loc4 = loc(unknown)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":20:28)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":20:33)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":21:36)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":21:23)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":23:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":24:21)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:39)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:35)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:44)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":27:56)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:25)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:36)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/r4/cr4c3rlbc63j2qyoe3l62mvbcrmt4x5gpdn27dvpenlcgtmt74k5.py":28:4)
+#loc22 = loc("tmp0"(#loc1))
+#loc23 = loc("x2"(#loc2))
+#loc24 = loc("x1"(#loc3))
+#loc25 = loc("xoffset"(#loc5))
+#loc26 = loc("xoffset"(#loc6))
+#loc27 = loc("xindex"(#loc7))
+#loc28 = loc("xindex"(#loc8))
+#loc29 = loc("x0"(#loc9))
+#loc30 = loc("x1"(#loc10))
+#loc31 = loc("tmp0"(#loc11))
+#loc32 = loc("tmp0"(#loc12))
+#loc33 = loc("tmp0"(#loc13))
+#loc34 = loc("tmp0"(#loc14))
+#loc35 = loc("tmp0"(#loc15))
diff --git a/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/__grp__triton_poi_fused_cat_view_4.json b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/__grp__triton_poi_fused_cat_view_4.json
new file mode 100644
index 0000000000000000000000000000000000000000..c23fdadb3fe3d400c2bcb118bb2c3c7178d54586
--- /dev/null
+++ b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/__grp__triton_poi_fused_cat_view_4.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused_cat_view_4.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.source", "triton_poi_fused_cat_view_4.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.ttir", "triton_poi_fused_cat_view_4.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.ttgir", "triton_poi_fused_cat_view_4.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.llir", "triton_poi_fused_cat_view_4.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.ptx", "triton_poi_fused_cat_view_4.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.cubin", "triton_poi_fused_cat_view_4.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.json"}}
\ No newline at end of file
diff --git a/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.cubin b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..2908ced25ce7f2d66da17fab9904d83c3a104833
Binary files /dev/null and b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.cubin differ
diff --git a/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.json b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.json
new file mode 100644
index 0000000000000000000000000000000000000000..ed23af6fb2dfd7a453b4a9b8a48b3333d4511692
--- /dev/null
+++ b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.json
@@ -0,0 +1 @@
+{"hash": "bb69f753eede91770f9498b43d000dbd5de341f220a53769748065b725226b3b", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_cat_view_4"}
\ No newline at end of file
diff --git a/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.llir b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.llir
new file mode 100644
index 0000000000000000000000000000000000000000..7b6e85f5406c06c3962d20369415547d3b235ad5
--- /dev/null
+++ b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.llir
@@ -0,0 +1,119 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused_cat_view_4(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 {
+  %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %8 = shl i32 %7, 10, !dbg !8
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %10 = shl nuw nsw i32 %9, 3, !dbg !9
+  %11 = and i32 %10, 1016, !dbg !9
+  %12 = or disjoint i32 %11, %8, !dbg !10
+  %13 = sdiv i32 %12, 4096, !dbg !11
+  %14 = icmp slt i32 %12, 1048576, !dbg !12
+  %15 = shl i32 %13, 13, !dbg !13
+  %16 = add i32 %15, %12, !dbg !13
+  %17 = sext i32 %16 to i64, !dbg !14
+  %18 = getelementptr bfloat, ptr addrspace(1) %0, i64 %17, !dbg !14
+  %19 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %18, i1 %14) #2, !dbg !15
+  %20 = extractvalue { i32, i32, i32, i32 } %19, 0, !dbg !15
+  %21 = bitcast i32 %20 to <2 x bfloat>, !dbg !15
+  %22 = extractvalue { i32, i32, i32, i32 } %19, 1, !dbg !15
+  %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !15
+  %24 = extractvalue { i32, i32, i32, i32 } %19, 2, !dbg !15
+  %25 = bitcast i32 %24 to <2 x bfloat>, !dbg !15
+  %26 = extractvalue { i32, i32, i32, i32 } %19, 3, !dbg !15
+  %27 = bitcast i32 %26 to <2 x bfloat>, !dbg !15
+  %28 = extractelement <2 x bfloat> %21, i64 0, !dbg !15
+  %29 = extractelement <2 x bfloat> %21, i64 1, !dbg !15
+  %30 = extractelement <2 x bfloat> %23, i64 0, !dbg !15
+  %31 = extractelement <2 x bfloat> %23, i64 1, !dbg !15
+  %32 = extractelement <2 x bfloat> %25, i64 0, !dbg !15
+  %33 = extractelement <2 x bfloat> %25, i64 1, !dbg !15
+  %34 = extractelement <2 x bfloat> %27, i64 0, !dbg !15
+  %35 = extractelement <2 x bfloat> %27, i64 1, !dbg !15
+  %36 = icmp sgt i32 %12, 1048575, !dbg !16
+  %37 = add i32 %16, -3145728, !dbg !17
+  %38 = sext i32 %37 to i64, !dbg !18
+  %39 = getelementptr bfloat, ptr addrspace(1) %1, i64 %38, !dbg !18
+  %40 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %39, i1 %36) #2, !dbg !19
+  %41 = extractvalue { i32, i32, i32, i32 } %40, 0, !dbg !19
+  %42 = bitcast i32 %41 to <2 x bfloat>, !dbg !19
+  %43 = extractvalue { i32, i32, i32, i32 } %40, 1, !dbg !19
+  %44 = bitcast i32 %43 to <2 x bfloat>, !dbg !19
+  %45 = extractvalue { i32, i32, i32, i32 } %40, 2, !dbg !19
+  %46 = bitcast i32 %45 to <2 x bfloat>, !dbg !19
+  %47 = extractvalue { i32, i32, i32, i32 } %40, 3, !dbg !19
+  %48 = bitcast i32 %47 to <2 x bfloat>, !dbg !19
+  %49 = extractelement <2 x bfloat> %42, i64 0, !dbg !19
+  %50 = extractelement <2 x bfloat> %42, i64 1, !dbg !19
+  %51 = extractelement <2 x bfloat> %44, i64 0, !dbg !19
+  %52 = extractelement <2 x bfloat> %44, i64 1, !dbg !19
+  %53 = extractelement <2 x bfloat> %46, i64 0, !dbg !19
+  %54 = extractelement <2 x bfloat> %46, i64 1, !dbg !19
+  %55 = extractelement <2 x bfloat> %48, i64 0, !dbg !19
+  %56 = extractelement <2 x bfloat> %48, i64 1, !dbg !19
+  %.v = select i1 %14, bfloat %28, bfloat %49, !dbg !20
+  %.v1 = select i1 %14, bfloat %29, bfloat %50, !dbg !20
+  %.v2 = select i1 %14, bfloat %30, bfloat %51, !dbg !20
+  %.v3 = select i1 %14, bfloat %31, bfloat %52, !dbg !20
+  %.v4 = select i1 %14, bfloat %32, bfloat %53, !dbg !20
+  %.v5 = select i1 %14, bfloat %33, bfloat %54, !dbg !20
+  %.v6 = select i1 %14, bfloat %34, bfloat %55, !dbg !20
+  %.v7 = select i1 %14, bfloat %35, bfloat %56, !dbg !20
+  %57 = sext i32 %12 to i64, !dbg !21
+  %58 = getelementptr bfloat, ptr addrspace(1) %2, i64 %57, !dbg !21
+  %59 = insertelement <2 x bfloat> poison, bfloat %.v, i64 0, !dbg !22
+  %60 = insertelement <2 x bfloat> %59, bfloat %.v1, i64 1, !dbg !22
+  %61 = bitcast <2 x bfloat> %60 to i32, !dbg !22
+  %62 = insertelement <2 x bfloat> poison, bfloat %.v2, i64 0, !dbg !22
+  %63 = insertelement <2 x bfloat> %62, bfloat %.v3, i64 1, !dbg !22
+  %64 = bitcast <2 x bfloat> %63 to i32, !dbg !22
+  %65 = insertelement <2 x bfloat> poison, bfloat %.v4, i64 0, !dbg !22
+  %66 = insertelement <2 x bfloat> %65, bfloat %.v5, i64 1, !dbg !22
+  %67 = bitcast <2 x bfloat> %66 to i32, !dbg !22
+  %68 = insertelement <2 x bfloat> poison, bfloat %.v6, i64 0, !dbg !22
+  %69 = insertelement <2 x bfloat> %68, bfloat %.v7, i64 1, !dbg !22
+  %70 = bitcast <2 x bfloat> %69 to i32, !dbg !22
+  tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %61, i32 %64, i32 %67, i32 %70, ptr addrspace(1) %58) #2, !dbg !22
+  ret void, !dbg !23
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused_cat_view_4", linkageName: "triton_poi_fused_cat_view_4", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 20, column: 28, scope: !4)
+!8 = !DILocation(line: 20, column: 33, scope: !4)
+!9 = !DILocation(line: 21, column: 36, scope: !4)
+!10 = !DILocation(line: 21, column: 23, scope: !4)
+!11 = !DILocation(line: 23, column: 19, scope: !4)
+!12 = !DILocation(line: 30, column: 18, scope: !4)
+!13 = !DILocation(line: 31, column: 35, scope: !4)
+!14 = !DILocation(line: 31, column: 30, scope: !4)
+!15 = !DILocation(line: 31, column: 48, scope: !4)
+!16 = !DILocation(line: 32, column: 19, scope: !4)
+!17 = !DILocation(line: 35, column: 35, scope: !4)
+!18 = !DILocation(line: 35, column: 30, scope: !4)
+!19 = !DILocation(line: 35, column: 57, scope: !4)
+!20 = !DILocation(line: 36, column: 33, scope: !4)
+!21 = !DILocation(line: 37, column: 25, scope: !4)
+!22 = !DILocation(line: 37, column: 37, scope: !4)
+!23 = !DILocation(line: 37, column: 4, scope: !4)
diff --git a/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.ptx b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..c00f6a727cc6b5d9f092c2aafd7d2b85748cd004
--- /dev/null
+++ b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.ptx
@@ -0,0 +1,354 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused_cat_view_4 // -- Begin function triton_poi_fused_cat_view_4
+                                        // @triton_poi_fused_cat_view_4
+.visible .entry triton_poi_fused_cat_view_4(
+	.param .u64 .ptr .global .align 1 triton_poi_fused_cat_view_4_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_cat_view_4_param_1,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_cat_view_4_param_2,
+	.param .u32 triton_poi_fused_cat_view_4_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_cat_view_4_param_4,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_cat_view_4_param_5
+)
+.reqntid 128
+{
+	.reg .pred 	%p<3>;
+	.reg .b16 	%rs<25>;
+	.reg .b32 	%r<27>;
+	.reg .b64 	%rd<7>;
+	.loc	1 18 0                          // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd4, [triton_poi_fused_cat_view_4_param_0];
+	ld.param.b64 	%rd5, [triton_poi_fused_cat_view_4_param_1];
+$L__tmp0:
+	.loc	1 20 28                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:20:28
+	mov.u32 	%r14, %ctaid.x;
+	.loc	1 20 33                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:20:33
+	shl.b32 	%r15, %r14, 10;
+	ld.param.b64 	%rd6, [triton_poi_fused_cat_view_4_param_2];
+	.loc	1 21 36                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:21:36
+	mov.u32 	%r16, %tid.x;
+	shl.b32 	%r17, %r16, 3;
+	and.b32 	%r18, %r17, 1016;
+	.loc	1 21 23                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:21:23
+	or.b32 	%r19, %r18, %r15;
+	.loc	1 23 19                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:23:19
+	bfe.s32 	%r20, %r14, 21, 1;
+	shr.u32 	%r21, %r20, 20;
+	add.s32 	%r22, %r19, %r21;
+	.loc	1 30 18                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:30:18
+	setp.lt.s32 	%p1, %r19, 1048576;
+	.loc	1 31 35                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:31:35
+	shl.b32 	%r23, %r22, 1;
+	and.b32 	%r24, %r23, -8192;
+	add.s32 	%r25, %r24, %r19;
+	.loc	1 31 30                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:31:30
+	mad.wide.s32 	%rd1, %r25, 2, %rd4;
+	mov.b32 	%r5, 0;
+	.loc	1 31 48                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:31:48
+	// begin inline asm
+	mov.u32 %r1, %r5;
+	mov.u32 %r2, %r5;
+	mov.u32 %r3, %r5;
+	mov.u32 %r4, %r5;
+	@%p1 ld.global.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd1 + 0 ];
+	// end inline asm
+	mov.b32 	{%rs1, %rs2}, %r1;
+	mov.b32 	{%rs3, %rs4}, %r2;
+	mov.b32 	{%rs5, %rs6}, %r3;
+	mov.b32 	{%rs7, %rs8}, %r4;
+	.loc	1 32 19                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:32:19
+	setp.gt.s32 	%p2, %r19, 1048575;
+	.loc	1 35 35                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:35:35
+	add.s32 	%r26, %r25, -3145728;
+	.loc	1 35 30                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:35:30
+	mad.wide.s32 	%rd2, %r26, 2, %rd5;
+	.loc	1 35 57                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:35:57
+	// begin inline asm
+	mov.u32 %r6, %r5;
+	mov.u32 %r7, %r5;
+	mov.u32 %r8, %r5;
+	mov.u32 %r9, %r5;
+	@%p2 ld.global.v4.b32 { %r6, %r7, %r8, %r9 }, [ %rd2 + 0 ];
+	// end inline asm
+	mov.b32 	{%rs9, %rs10}, %r6;
+	mov.b32 	{%rs11, %rs12}, %r7;
+	mov.b32 	{%rs13, %rs14}, %r8;
+	mov.b32 	{%rs15, %rs16}, %r9;
+	.loc	1 36 33                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:36:33
+	selp.b16 	%rs17, %rs1, %rs9, %p1;
+	selp.b16 	%rs18, %rs2, %rs10, %p1;
+	selp.b16 	%rs19, %rs3, %rs11, %p1;
+	selp.b16 	%rs20, %rs4, %rs12, %p1;
+	selp.b16 	%rs21, %rs5, %rs13, %p1;
+	selp.b16 	%rs22, %rs6, %rs14, %p1;
+	selp.b16 	%rs23, %rs7, %rs15, %p1;
+	selp.b16 	%rs24, %rs8, %rs16, %p1;
+	.loc	1 37 25                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:37:25
+	mad.wide.s32 	%rd3, %r19, 2, %rd6;
+	.loc	1 37 37                         // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:37:37
+	mov.b32 	%r10, {%rs17, %rs18};
+	mov.b32 	%r11, {%rs19, %rs20};
+	mov.b32 	%r12, {%rs21, %rs22};
+	mov.b32 	%r13, {%rs23, %rs24};
+	// begin inline asm
+	st.global.v4.b32 [ %rd3 + 0 ], { %r10, %r11, %r12, %r13 };
+	// end inline asm
+	.loc	1 37 4                          // clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py:37:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 108
+.b8 112
+.b8 102
+.b8 52
+.b8 108
+.b8 111
+.b8 111
+.b8 104
+.b8 102
+.b8 115
+.b8 103
+.b8 119
+.b8 113
+.b8 104
+.b8 50
+.b8 103
+.b8 105
+.b8 50
+.b8 120
+.b8 111
+.b8 118
+.b8 111
+.b8 100
+.b8 112
+.b8 109
+.b8 55
+.b8 104
+.b8 122
+.b8 118
+.b8 53
+.b8 117
+.b8 50
+.b8 114
+.b8 118
+.b8 110
+.b8 103
+.b8 98
+.b8 55
+.b8 99
+.b8 104
+.b8 106
+.b8 103
+.b8 121
+.b8 119
+.b8 120
+.b8 53
+.b8 53
+.b8 103
+.b8 116
+.b8 117
+.b8 100
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 108
+.b8 112
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.source b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.source
new file mode 100644
index 0000000000000000000000000000000000000000..2fb5cfd9779313c8bed0c64788bf0ef349e6fcee
--- /dev/null
+++ b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.source
@@ -0,0 +1,136 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":18:0)
+#loc31 = loc("in_ptr0"(#loc))
+#loc32 = loc("in_ptr1"(#loc))
+#loc33 = loc("out_ptr0"(#loc))
+#loc34 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_cat_view_4(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 9437184 : i32 loc(#loc35)
+    %xoffset = tt.get_program_id x : i32 loc(#loc36)
+    %xoffset_1 = arith.constant 1024 : i32 loc(#loc37)
+    %xoffset_2 = arith.constant 1024 : i32 loc(#loc37)
+    %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc37)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc38)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc39)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc39)
+    %xmask = arith.constant true loc(#loc40)
+    %xmask_6 = arith.constant dense<true> : tensor<1024xi1> loc(#loc40)
+    %x1 = arith.constant 4096 : i32 loc(#loc41)
+    %x1_7 = arith.constant 4096 : i32 loc(#loc41)
+    %x1_8 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc41)
+    %x1_9 = arith.divsi %xindex_5, %x1_8 : tensor<1024xi32> loc(#loc41)
+    %x0 = arith.constant 4096 : i32 loc(#loc42)
+    %x0_10 = arith.constant 4096 : i32 loc(#loc42)
+    %x0_11 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc42)
+    %x0_12 = arith.remsi %xindex_5, %x0_11 : tensor<1024xi32> loc(#loc42)
+    %tmp1 = arith.constant 0 : i64 loc(#loc43)
+    %tmp1_13 = arith.constant dense<0> : tensor<1xi64> loc(#loc43)
+    %tmp2 = arith.extsi %x1_9 : tensor<1024xi32> to tensor<1024xi64> loc(#loc44)
+    %tmp2_14 = arith.constant dense<0> : tensor<1024xi64> loc(#loc44)
+    %tmp2_15 = arith.cmpi sge, %tmp2, %tmp2_14 : tensor<1024xi64> loc(#loc44)
+    %tmp3 = arith.constant 256 : i64 loc(#loc45)
+    %tmp3_16 = arith.constant dense<256> : tensor<1xi64> loc(#loc45)
+    %tmp4 = arith.extsi %x1_9 : tensor<1024xi32> to tensor<1024xi64> loc(#loc46)
+    %tmp4_17 = arith.constant dense<256> : tensor<1024xi64> loc(#loc46)
+    %tmp4_18 = arith.cmpi slt, %tmp4, %tmp4_17 : tensor<1024xi64> loc(#loc46)
+    %tmp5 = arith.constant 12288 : i32 loc(#loc47)
+    %tmp5_19 = arith.constant 12288 : i32 loc(#loc47)
+    %tmp5_20 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc47)
+    %tmp5_21 = arith.muli %tmp5_20, %x1_9 : tensor<1024xi32> loc(#loc47)
+    %tmp5_22 = arith.addi %x0_12, %tmp5_21 : tensor<1024xi32> loc(#loc48)
+    %tmp5_23 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc49)
+    %tmp5_24 = tt.addptr %tmp5_23, %tmp5_22 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc49)
+    %tmp5_25 = arith.constant 0.000000e+00 : f32 loc(#loc50)
+    %tmp5_26 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc50)
+    %tmp5_27 = arith.truncf %tmp5_26 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc50)
+    %tmp5_28 = tt.load %tmp5_24, %tmp4_18, %tmp5_27 : tensor<1024x!tt.ptr<bf16>> loc(#loc50)
+    %tmp5_29 = arith.extf %tmp5_28 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc51)
+    %tmp6 = arith.extsi %x1_9 : tensor<1024xi32> to tensor<1024xi64> loc(#loc52)
+    %tmp6_30 = arith.constant dense<256> : tensor<1024xi64> loc(#loc52)
+    %tmp6_31 = arith.cmpi sge, %tmp6, %tmp6_30 : tensor<1024xi64> loc(#loc52)
+    %tmp7 = arith.constant 2304 : i64 loc(#loc53)
+    %tmp7_32 = arith.constant dense<2304> : tensor<1xi64> loc(#loc53)
+    %tmp8 = arith.extsi %x1_9 : tensor<1024xi32> to tensor<1024xi64> loc(#loc54)
+    %tmp8_33 = arith.constant dense<2304> : tensor<1024xi64> loc(#loc54)
+    %tmp8_34 = arith.cmpi slt, %tmp8, %tmp8_33 : tensor<1024xi64> loc(#loc54)
+    %tmp9 = arith.constant -256 : i32 loc(#loc55)
+    %tmp9_35 = arith.constant -256 : i32 loc(#loc55)
+    %tmp9_36 = arith.constant dense<-256> : tensor<1024xi32> loc(#loc55)
+    %tmp9_37 = arith.addi %tmp9_36, %x1_9 : tensor<1024xi32> loc(#loc55)
+    %tmp9_38 = arith.constant 12288 : i32 loc(#loc56)
+    %tmp9_39 = arith.constant 12288 : i32 loc(#loc56)
+    %tmp9_40 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc56)
+    %tmp9_41 = arith.muli %tmp9_40, %tmp9_37 : tensor<1024xi32> loc(#loc56)
+    %tmp9_42 = arith.addi %x0_12, %tmp9_41 : tensor<1024xi32> loc(#loc57)
+    %tmp9_43 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc58)
+    %tmp9_44 = tt.addptr %tmp9_43, %tmp9_42 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc58)
+    %tmp9_45 = arith.constant 0.000000e+00 : f32 loc(#loc59)
+    %tmp9_46 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc59)
+    %tmp9_47 = arith.truncf %tmp9_46 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc59)
+    %tmp9_48 = tt.load %tmp9_44, %tmp6_31, %tmp9_47 : tensor<1024x!tt.ptr<bf16>> loc(#loc59)
+    %tmp9_49 = arith.extf %tmp9_48 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc60)
+    %tmp10 = arith.select %tmp4_18, %tmp5_29, %tmp9_49 : tensor<1024xi1>, tensor<1024xf32> loc(#loc61)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc28)
+    %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc28)
+    %2 = arith.truncf %tmp10 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc29)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>> loc(#loc29)
+    tt.return loc(#loc30)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":22:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":23:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":24:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":27:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":28:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":29:29)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":30:18)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:42)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:35)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:30)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:48)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:68)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":32:19)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":33:30)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":34:18)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:51)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:42)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:35)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:30)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:57)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:77)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":36:33)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:25)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:37)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:4)
+#loc35 = loc("xnumel"(#loc1))
+#loc36 = loc("xoffset"(#loc2))
+#loc37 = loc("xoffset"(#loc3))
+#loc38 = loc("xindex"(#loc4))
+#loc39 = loc("xindex"(#loc5))
+#loc40 = loc("xmask"(#loc6))
+#loc41 = loc("x1"(#loc7))
+#loc42 = loc("x0"(#loc8))
+#loc43 = loc("tmp1"(#loc9))
+#loc44 = loc("tmp2"(#loc10))
+#loc45 = loc("tmp3"(#loc11))
+#loc46 = loc("tmp4"(#loc12))
+#loc47 = loc("tmp5"(#loc13))
+#loc48 = loc("tmp5"(#loc14))
+#loc49 = loc("tmp5"(#loc15))
+#loc50 = loc("tmp5"(#loc16))
+#loc51 = loc("tmp5"(#loc17))
+#loc52 = loc("tmp6"(#loc18))
+#loc53 = loc("tmp7"(#loc19))
+#loc54 = loc("tmp8"(#loc20))
+#loc55 = loc("tmp9"(#loc21))
+#loc56 = loc("tmp9"(#loc22))
+#loc57 = loc("tmp9"(#loc23))
+#loc58 = loc("tmp9"(#loc24))
+#loc59 = loc("tmp9"(#loc25))
+#loc60 = loc("tmp9"(#loc26))
+#loc61 = loc("tmp10"(#loc27))
diff --git a/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.ttgir b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..100ff20cefc58e82bde6e7d20523caec17d3427b
--- /dev/null
+++ b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.ttgir
@@ -0,0 +1,89 @@
+#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":18:0)
+#loc25 = loc("in_ptr0"(#loc))
+#loc26 = loc("in_ptr1"(#loc))
+#loc27 = loc("out_ptr0"(#loc))
+#loc28 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused_cat_view_4(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4096> : tensor<1024xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<256> : tensor<1024xi64, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<12288> : tensor<1024xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<-256> : tensor<1024xi32, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<1024xbf16, #blocked> loc(#loc1)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc29)
+    %xoffset_4 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc30)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc31)
+    %xindex_5 = tt.splat %xoffset_4 : i32 -> tensor<1024xi32, #blocked> loc(#loc32)
+    %xindex_6 = arith.addi %xindex_5, %xindex : tensor<1024xi32, #blocked> loc(#loc32)
+    %x1 = arith.divsi %xindex_6, %cst : tensor<1024xi32, #blocked> loc(#loc33)
+    %x0 = arith.remsi %xindex_6, %cst : tensor<1024xi32, #blocked> loc(#loc34)
+    %tmp4 = arith.extsi %x1 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked> loc(#loc35)
+    %tmp4_7 = arith.cmpi slt, %tmp4, %cst_0 : tensor<1024xi64, #blocked> loc(#loc35)
+    %tmp5 = arith.muli %x1, %cst_1 : tensor<1024xi32, #blocked> loc(#loc36)
+    %tmp5_8 = arith.addi %x0, %tmp5 : tensor<1024xi32, #blocked> loc(#loc37)
+    %tmp5_9 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc38)
+    %tmp5_10 = tt.addptr %tmp5_9, %tmp5_8 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc38)
+    %tmp5_11 = tt.load %tmp5_10, %tmp4_7, %cst_3 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc39)
+    %tmp5_12 = arith.extf %tmp5_11 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc40)
+    %tmp6 = arith.cmpi sge, %tmp4, %cst_0 : tensor<1024xi64, #blocked> loc(#loc41)
+    %tmp9 = arith.addi %x1, %cst_2 : tensor<1024xi32, #blocked> loc(#loc42)
+    %tmp9_13 = arith.muli %tmp9, %cst_1 : tensor<1024xi32, #blocked> loc(#loc43)
+    %tmp9_14 = arith.addi %x0, %tmp9_13 : tensor<1024xi32, #blocked> loc(#loc44)
+    %tmp9_15 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc45)
+    %tmp9_16 = tt.addptr %tmp9_15, %tmp9_14 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc45)
+    %tmp9_17 = tt.load %tmp9_16, %tmp6, %cst_3 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc46)
+    %tmp9_18 = arith.extf %tmp9_17 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc47)
+    %tmp10 = arith.select %tmp4_7, %tmp5_12, %tmp9_18 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked> loc(#loc48)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc22)
+    %1 = tt.addptr %0, %xindex_6 : tensor<1024x!tt.ptr<bf16>, #blocked>, tensor<1024xi32, #blocked> loc(#loc22)
+    %2 = arith.truncf %tmp10 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> loc(#loc23)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>, #blocked> loc(#loc23)
+    tt.return loc(#loc24)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":23:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":24:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":30:18)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:42)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:35)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:30)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:48)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:68)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":32:19)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:51)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:42)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:35)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:30)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:57)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:77)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":36:33)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:25)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:37)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:4)
+#loc29 = loc("xoffset"(#loc2))
+#loc30 = loc("xoffset"(#loc3))
+#loc31 = loc("xindex"(#loc4))
+#loc32 = loc("xindex"(#loc5))
+#loc33 = loc("x1"(#loc6))
+#loc34 = loc("x0"(#loc7))
+#loc35 = loc("tmp4"(#loc8))
+#loc36 = loc("tmp5"(#loc9))
+#loc37 = loc("tmp5"(#loc10))
+#loc38 = loc("tmp5"(#loc11))
+#loc39 = loc("tmp5"(#loc12))
+#loc40 = loc("tmp5"(#loc13))
+#loc41 = loc("tmp6"(#loc14))
+#loc42 = loc("tmp9"(#loc15))
+#loc43 = loc("tmp9"(#loc16))
+#loc44 = loc("tmp9"(#loc17))
+#loc45 = loc("tmp9"(#loc18))
+#loc46 = loc("tmp9"(#loc19))
+#loc47 = loc("tmp9"(#loc20))
+#loc48 = loc("tmp10"(#loc21))
diff --git a/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.ttir b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..511d76bc0ef4e2fd5a4e58f2fe591d6653997ca7
--- /dev/null
+++ b/triton/XNU7OU7O32IXOD4UTC2D2AANXVO6GQPSECSTO2LUQBS3OJJCNM5Q/triton_poi_fused_cat_view_4.ttir
@@ -0,0 +1,88 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":18:0)
+#loc25 = loc("in_ptr0"(#loc))
+#loc26 = loc("in_ptr1"(#loc))
+#loc27 = loc("out_ptr0"(#loc))
+#loc28 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_cat_view_4(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<1024xbf16> loc(#loc1)
+    %tmp9 = arith.constant dense<-256> : tensor<1024xi32> loc(#loc29)
+    %cst_0 = arith.constant dense<12288> : tensor<1024xi32> loc(#loc1)
+    %cst_1 = arith.constant dense<256> : tensor<1024xi64> loc(#loc1)
+    %cst_2 = arith.constant dense<4096> : tensor<1024xi32> loc(#loc1)
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc30)
+    %xoffset_3 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc31)
+    %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc32)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc33)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc33)
+    %x1 = arith.divsi %xindex_5, %cst_2 : tensor<1024xi32> loc(#loc34)
+    %x0 = arith.remsi %xindex_5, %cst_2 : tensor<1024xi32> loc(#loc35)
+    %tmp4 = arith.extsi %x1 : tensor<1024xi32> to tensor<1024xi64> loc(#loc36)
+    %tmp4_6 = arith.cmpi slt, %tmp4, %cst_1 : tensor<1024xi64> loc(#loc36)
+    %tmp5 = arith.muli %x1, %cst_0 : tensor<1024xi32> loc(#loc37)
+    %tmp5_7 = arith.addi %x0, %tmp5 : tensor<1024xi32> loc(#loc38)
+    %tmp5_8 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc39)
+    %tmp5_9 = tt.addptr %tmp5_8, %tmp5_7 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc39)
+    %tmp5_10 = tt.load %tmp5_9, %tmp4_6, %cst : tensor<1024x!tt.ptr<bf16>> loc(#loc40)
+    %tmp5_11 = arith.extf %tmp5_10 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc41)
+    %tmp6 = arith.cmpi sge, %tmp4, %cst_1 : tensor<1024xi64> loc(#loc42)
+    %tmp9_12 = arith.addi %x1, %tmp9 : tensor<1024xi32> loc(#loc29)
+    %tmp9_13 = arith.muli %tmp9_12, %cst_0 : tensor<1024xi32> loc(#loc43)
+    %tmp9_14 = arith.addi %x0, %tmp9_13 : tensor<1024xi32> loc(#loc44)
+    %tmp9_15 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc45)
+    %tmp9_16 = tt.addptr %tmp9_15, %tmp9_14 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc45)
+    %tmp9_17 = tt.load %tmp9_16, %tmp6, %cst : tensor<1024x!tt.ptr<bf16>> loc(#loc46)
+    %tmp9_18 = arith.extf %tmp9_17 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc47)
+    %tmp10 = arith.select %tmp4_6, %tmp5_11, %tmp9_18 : tensor<1024xi1>, tensor<1024xf32> loc(#loc48)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<1024x!tt.ptr<bf16>> loc(#loc22)
+    %1 = tt.addptr %0, %xindex_5 : tensor<1024x!tt.ptr<bf16>>, tensor<1024xi32> loc(#loc22)
+    %2 = arith.truncf %tmp10 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc23)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<bf16>> loc(#loc23)
+    tt.return loc(#loc24)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:51)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":20:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":20:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":21:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":21:23)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":23:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":24:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":30:18)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:42)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:30)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:48)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":31:68)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":32:19)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:42)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:35)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:30)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:57)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":35:77)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":36:33)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:25)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:37)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/lp/clpf4loohfsgwqh2gi2xovodpm7hzv5u2rvngb7chjgywx55gtud.py":37:4)
+#loc29 = loc("tmp9"(#loc2))
+#loc30 = loc("xoffset"(#loc3))
+#loc31 = loc("xoffset"(#loc4))
+#loc32 = loc("xindex"(#loc5))
+#loc33 = loc("xindex"(#loc6))
+#loc34 = loc("x1"(#loc7))
+#loc35 = loc("x0"(#loc8))
+#loc36 = loc("tmp4"(#loc9))
+#loc37 = loc("tmp5"(#loc10))
+#loc38 = loc("tmp5"(#loc11))
+#loc39 = loc("tmp5"(#loc12))
+#loc40 = loc("tmp5"(#loc13))
+#loc41 = loc("tmp5"(#loc14))
+#loc42 = loc("tmp6"(#loc15))
+#loc43 = loc("tmp9"(#loc16))
+#loc44 = loc("tmp9"(#loc17))
+#loc45 = loc("tmp9"(#loc18))
+#loc46 = loc("tmp9"(#loc19))
+#loc47 = loc("tmp9"(#loc20))
+#loc48 = loc("tmp10"(#loc21))
diff --git a/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/__grp__triton_poi_fused_add_mul_0.json b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/__grp__triton_poi_fused_add_mul_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..c8826882dc0439d11122b9daf69d9e95446804a6
--- /dev/null
+++ b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/__grp__triton_poi_fused_add_mul_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused_add_mul_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.source", "triton_poi_fused_add_mul_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.ttir", "triton_poi_fused_add_mul_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.ttgir", "triton_poi_fused_add_mul_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.llir", "triton_poi_fused_add_mul_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.ptx", "triton_poi_fused_add_mul_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.cubin", "triton_poi_fused_add_mul_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.json"}}
\ No newline at end of file
diff --git a/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.cubin b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..a9ae9b5a27246b4efc6e824f67772a5b40c12fe6
Binary files /dev/null and b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.cubin differ
diff --git a/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.json b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..5f1ab544c1f04c148f96f7d84db10e643858f6e5
--- /dev/null
+++ b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.json
@@ -0,0 +1 @@
+{"hash": "bbe70e424e252b7b1c0af0b2ecd3da03e9e9fb20155d2cbdcdcf8f1405431e6f", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_add_mul_0"}
\ No newline at end of file
diff --git a/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.llir b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..4860bccd1529052ec12391a5aad030f864c00594
--- /dev/null
+++ b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.llir
@@ -0,0 +1,76 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused_add_mul_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 {
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %9 = shl i32 %8, 9, !dbg !8
+  %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %11 = shl nuw nsw i32 %10, 1, !dbg !9
+  %12 = and i32 %11, 510, !dbg !9
+  %13 = or disjoint i32 %12, %9, !dbg !10
+  %14 = srem i32 %13, 4096, !dbg !11
+  %15 = sext i32 %13 to i64, !dbg !12
+  %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !12
+  %17 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %16) #2, !dbg !13
+  %18 = bitcast i32 %17 to <2 x bfloat>, !dbg !13
+  %19 = sext i32 %14 to i64, !dbg !14
+  %20 = getelementptr bfloat, ptr addrspace(1) %1, i64 %19, !dbg !14
+  %21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !15
+  %22 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l"(ptr addrspace(1) %20, i64 %21) #2, !dbg !15
+  %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !15
+  %24 = getelementptr bfloat, ptr addrspace(1) %2, i64 %15, !dbg !16
+  %25 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %24) #2, !dbg !17
+  %26 = bitcast i32 %25 to <2 x bfloat>, !dbg !17
+  %27 = getelementptr bfloat, ptr addrspace(1) %3, i64 %15, !dbg !18
+  %28 = fpext <2 x bfloat> %18 to <2 x float>, !dbg !19
+  %29 = fpext <2 x bfloat> %23 to <2 x float>, !dbg !20
+  %30 = fpext <2 x bfloat> %26 to <2 x float>, !dbg !21
+  %31 = fmul <2 x float> %29, %30, !dbg !22
+  %32 = fadd <2 x float> %31, %28, !dbg !23
+  %33 = fptrunc <2 x float> %32 to <2 x bfloat>, !dbg !24
+  %34 = bitcast <2 x bfloat> %33 to i32, !dbg !24
+  tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %34, ptr addrspace(1) %27) #2, !dbg !24
+  ret void, !dbg !25
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+attributes #0 = { nounwind "nvvm.reqntid"="256" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused_add_mul_0", linkageName: "triton_poi_fused_add_mul_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 20, column: 28, scope: !4)
+!8 = !DILocation(line: 20, column: 33, scope: !4)
+!9 = !DILocation(line: 21, column: 36, scope: !4)
+!10 = !DILocation(line: 21, column: 23, scope: !4)
+!11 = !DILocation(line: 24, column: 19, scope: !4)
+!12 = !DILocation(line: 25, column: 30, scope: !4)
+!13 = !DILocation(line: 25, column: 35, scope: !4)
+!14 = !DILocation(line: 26, column: 30, scope: !4)
+!15 = !DILocation(line: 26, column: 35, scope: !4)
+!16 = !DILocation(line: 27, column: 30, scope: !4)
+!17 = !DILocation(line: 27, column: 35, scope: !4)
+!18 = !DILocation(line: 30, column: 25, scope: !4)
+!19 = !DILocation(line: 25, column: 44, scope: !4)
+!20 = !DILocation(line: 26, column: 74, scope: !4)
+!21 = !DILocation(line: 27, column: 44, scope: !4)
+!22 = !DILocation(line: 28, column: 18, scope: !4)
+!23 = !DILocation(line: 29, column: 18, scope: !4)
+!24 = !DILocation(line: 30, column: 36, scope: !4)
+!25 = !DILocation(line: 30, column: 4, scope: !4)
diff --git a/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.ptx b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..377378ae7ef2d5c43af9668539bd36bcd47de934
--- /dev/null
+++ b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.ptx
@@ -0,0 +1,347 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused_add_mul_0 // -- Begin function triton_poi_fused_add_mul_0
+                                        // @triton_poi_fused_add_mul_0
+.visible .entry triton_poi_fused_add_mul_0(
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_1,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_3,
+	.param .u32 triton_poi_fused_add_mul_0_param_4,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_5,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_add_mul_0_param_6
+)
+.reqntid 256
+{
+	.reg .b16 	%rs<7>;
+	.reg .b32 	%r<24>;
+	.reg .b64 	%rd<11>;
+	.loc	1 18 0                          // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd6, [triton_poi_fused_add_mul_0_param_0];
+	ld.param.b64 	%rd7, [triton_poi_fused_add_mul_0_param_1];
+$L__tmp0:
+	.loc	1 20 28                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:20:28
+	mov.u32 	%r5, %ctaid.x;
+	.loc	1 20 33                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:20:33
+	shl.b32 	%r6, %r5, 9;
+	ld.param.b64 	%rd8, [triton_poi_fused_add_mul_0_param_2];
+	ld.param.b64 	%rd9, [triton_poi_fused_add_mul_0_param_3];
+	.loc	1 21 36                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:21:36
+	mov.u32 	%r7, %tid.x;
+	shl.b32 	%r8, %r7, 1;
+	and.b32 	%r9, %r8, 510;
+	.loc	1 21 23                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:21:23
+	or.b32 	%r10, %r9, %r6;
+	.loc	1 24 19                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:24:19
+	bfe.s32 	%r11, %r5, 22, 1;
+	shr.u32 	%r12, %r11, 20;
+	add.s32 	%r13, %r10, %r12;
+	and.b32 	%r14, %r13, -4096;
+	sub.s32 	%r15, %r10, %r14;
+	.loc	1 25 30                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:25:30
+	mul.wide.s32 	%rd10, %r10, 2;
+	add.s64 	%rd1, %rd6, %rd10;
+	.loc	1 25 35                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:25:35
+	// begin inline asm
+	mov.u32 %r1, 0x0;
+	ld.global.b32 { %r1 }, [ %rd1 + 0 ];
+	// end inline asm
+	.loc	1 26 30                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:26:30
+	mad.wide.s32 	%rd2, %r15, 2, %rd7;
+	.loc	1 26 35                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:26:35
+	// begin inline asm
+	mov.u64 %rd3, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd3, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r2, 0x0;
+	ld.global.L1::evict_last.L2::cache_hint.b32 { %r2 }, [ %rd2 + 0 ], %rd3;
+	// end inline asm
+	.loc	1 27 30                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:27:30
+	add.s64 	%rd4, %rd8, %rd10;
+	.loc	1 27 35                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:27:35
+	// begin inline asm
+	mov.u32 %r3, 0x0;
+	ld.global.b32 { %r3 }, [ %rd4 + 0 ];
+	// end inline asm
+	.loc	1 30 25                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:30:25
+	add.s64 	%rd5, %rd9, %rd10;
+	.loc	1 25 44                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:25:44
+	mov.b32 	{%rs1, %rs2}, %r1;
+	cvt.f32.bf16 	%r16, %rs2;
+	cvt.f32.bf16 	%r17, %rs1;
+	.loc	1 26 74                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:26:74
+	mov.b32 	{%rs3, %rs4}, %r2;
+	cvt.f32.bf16 	%r18, %rs4;
+	cvt.f32.bf16 	%r19, %rs3;
+	.loc	1 27 44                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:27:44
+	mov.b32 	{%rs5, %rs6}, %r3;
+	cvt.f32.bf16 	%r20, %rs6;
+	cvt.f32.bf16 	%r21, %rs5;
+	.loc	1 29 18                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:29:18
+	fma.rn.f32 	%r22, %r19, %r21, %r17;
+	fma.rn.f32 	%r23, %r18, %r20, %r16;
+	.loc	1 30 36                         // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:30:36
+	cvt.rn.bf16x2.f32 	%r4, %r23, %r22;
+	// begin inline asm
+	st.global.b32 [ %rd5 + 0 ], { %r4 };
+	// end inline asm
+	.loc	1 30 4                          // cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py:30:4
+	ret;
+$L__tmp1:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 0                                   // DW_CHILDREN_no
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 224                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0xd9 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 120
+.b8 106
+.b8 52
+.b8 112
+.b8 53
+.b8 51
+.b8 104
+.b8 111
+.b8 116
+.b8 118
+.b8 119
+.b8 51
+.b8 51
+.b8 54
+.b8 119
+.b8 52
+.b8 106
+.b8 54
+.b8 106
+.b8 54
+.b8 110
+.b8 108
+.b8 121
+.b8 100
+.b8 119
+.b8 120
+.b8 122
+.b8 114
+.b8 115
+.b8 52
+.b8 104
+.b8 104
+.b8 107
+.b8 106
+.b8 52
+.b8 50
+.b8 104
+.b8 111
+.b8 102
+.b8 108
+.b8 111
+.b8 116
+.b8 50
+.b8 110
+.b8 115
+.b8 122
+.b8 113
+.b8 122
+.b8 113
+.b8 51
+.b8 117
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 120
+.b8 106
+.b8 0
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.source b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..e852b1ad9f9b3d0f898a7172708dc2db1a0de9ac
--- /dev/null
+++ b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.source
@@ -0,0 +1,82 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":18:0)
+#loc22 = loc("in_ptr0"(#loc))
+#loc23 = loc("in_ptr1"(#loc))
+#loc24 = loc("in_ptr2"(#loc))
+#loc25 = loc("out_ptr0"(#loc))
+#loc26 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_add_mul_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 9437184 : i32 loc(#loc27)
+    %xoffset = tt.get_program_id x : i32 loc(#loc28)
+    %xoffset_1 = arith.constant 512 : i32 loc(#loc29)
+    %xoffset_2 = arith.constant 512 : i32 loc(#loc29)
+    %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc29)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc30)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc31)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc31)
+    %xmask = arith.constant true loc(#loc32)
+    %xmask_6 = arith.constant dense<true> : tensor<512xi1> loc(#loc32)
+    %x0 = arith.constant 4096 : i32 loc(#loc33)
+    %x0_7 = arith.constant 4096 : i32 loc(#loc33)
+    %x0_8 = arith.constant dense<4096> : tensor<512xi32> loc(#loc33)
+    %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<512xi32> loc(#loc33)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc34)
+    %tmp0_10 = tt.addptr %tmp0, %xindex_5 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc34)
+    %tmp0_11 = tt.load %tmp0_10 : tensor<512x!tt.ptr<bf16>> loc(#loc35)
+    %tmp0_12 = arith.extf %tmp0_11 : tensor<512xbf16> to tensor<512xf32> loc(#loc36)
+    %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc37)
+    %tmp1_13 = tt.addptr %tmp1, %x0_9 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc37)
+    %tmp1_14 = tt.load %tmp1_13 evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>> loc(#loc38)
+    %tmp1_15 = arith.extf %tmp1_14 : tensor<512xbf16> to tensor<512xf32> loc(#loc39)
+    %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc40)
+    %tmp2_16 = tt.addptr %tmp2, %xindex_5 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc40)
+    %tmp2_17 = tt.load %tmp2_16 : tensor<512x!tt.ptr<bf16>> loc(#loc41)
+    %tmp2_18 = arith.extf %tmp2_17 : tensor<512xbf16> to tensor<512xf32> loc(#loc42)
+    %tmp3 = arith.mulf %tmp1_15, %tmp2_18 : tensor<512xf32> loc(#loc43)
+    %tmp4 = arith.addf %tmp0_12, %tmp3 : tensor<512xf32> loc(#loc44)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc19)
+    %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc19)
+    %2 = arith.truncf %tmp4 : tensor<512xf32> to tensor<512xbf16> loc(#loc20)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>> loc(#loc20)
+    tt.return loc(#loc21)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":22:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":24:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:30)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:35)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:44)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:30)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:35)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:74)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:30)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:35)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:44)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":28:18)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":29:18)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:25)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:36)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:4)
+#loc27 = loc("xnumel"(#loc1))
+#loc28 = loc("xoffset"(#loc2))
+#loc29 = loc("xoffset"(#loc3))
+#loc30 = loc("xindex"(#loc4))
+#loc31 = loc("xindex"(#loc5))
+#loc32 = loc("xmask"(#loc6))
+#loc33 = loc("x0"(#loc7))
+#loc34 = loc("tmp0"(#loc8))
+#loc35 = loc("tmp0"(#loc9))
+#loc36 = loc("tmp0"(#loc10))
+#loc37 = loc("tmp1"(#loc11))
+#loc38 = loc("tmp1"(#loc12))
+#loc39 = loc("tmp1"(#loc13))
+#loc40 = loc("tmp2"(#loc14))
+#loc41 = loc("tmp2"(#loc15))
+#loc42 = loc("tmp2"(#loc16))
+#loc43 = loc("tmp3"(#loc17))
+#loc44 = loc("tmp4"(#loc18))
diff --git a/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.ttgir b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..3aefc0741e1a21f89c015f7b92831a9bb9619c24
--- /dev/null
+++ b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.ttgir
@@ -0,0 +1,74 @@
+#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":18:0)
+#loc21 = loc("in_ptr0"(#loc))
+#loc22 = loc("in_ptr1"(#loc))
+#loc23 = loc("in_ptr2"(#loc))
+#loc24 = loc("out_ptr0"(#loc))
+#loc25 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused_add_mul_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4096> : tensor<512xi32, #blocked> loc(#loc1)
+    %c512_i32 = arith.constant 512 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc26)
+    %xoffset_0 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc27)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc28)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<512xi32, #blocked> loc(#loc29)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<512xi32, #blocked> loc(#loc29)
+    %x0 = arith.remsi %xindex_2, %cst : tensor<512xi32, #blocked> loc(#loc30)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc31)
+    %tmp0_3 = tt.addptr %tmp0, %xindex_2 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc31)
+    %tmp0_4 = tt.load %tmp0_3 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc32)
+    %tmp0_5 = arith.extf %tmp0_4 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc33)
+    %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc34)
+    %tmp1_6 = tt.addptr %tmp1, %x0 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc34)
+    %tmp1_7 = tt.load %tmp1_6 evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc35)
+    %tmp1_8 = arith.extf %tmp1_7 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc36)
+    %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc37)
+    %tmp2_9 = tt.addptr %tmp2, %xindex_2 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc37)
+    %tmp2_10 = tt.load %tmp2_9 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc38)
+    %tmp2_11 = arith.extf %tmp2_10 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc39)
+    %tmp3 = arith.mulf %tmp1_8, %tmp2_11 : tensor<512xf32, #blocked> loc(#loc40)
+    %tmp4 = arith.addf %tmp0_5, %tmp3 : tensor<512xf32, #blocked> loc(#loc41)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc18)
+    %1 = tt.addptr %0, %xindex_2 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc18)
+    %2 = arith.truncf %tmp4 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> loc(#loc19)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc19)
+    tt.return loc(#loc20)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":24:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:30)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:35)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:44)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:30)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:74)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:30)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:35)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:44)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":28:18)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":29:18)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:25)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:36)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:4)
+#loc26 = loc("xoffset"(#loc2))
+#loc27 = loc("xoffset"(#loc3))
+#loc28 = loc("xindex"(#loc4))
+#loc29 = loc("xindex"(#loc5))
+#loc30 = loc("x0"(#loc6))
+#loc31 = loc("tmp0"(#loc7))
+#loc32 = loc("tmp0"(#loc8))
+#loc33 = loc("tmp0"(#loc9))
+#loc34 = loc("tmp1"(#loc10))
+#loc35 = loc("tmp1"(#loc11))
+#loc36 = loc("tmp1"(#loc12))
+#loc37 = loc("tmp2"(#loc13))
+#loc38 = loc("tmp2"(#loc14))
+#loc39 = loc("tmp2"(#loc15))
+#loc40 = loc("tmp3"(#loc16))
+#loc41 = loc("tmp4"(#loc17))
diff --git a/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.ttir b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..ea1a5ed49ca40edb75f2e0618a05ff023f5b2322
--- /dev/null
+++ b/triton/XPTQ4QSOEUVXWHAK6CZOZU62APU6T6ZACVOSZPONZ6HRIBKDDZXQ/triton_poi_fused_add_mul_0.ttir
@@ -0,0 +1,73 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":18:0)
+#loc21 = loc("in_ptr0"(#loc))
+#loc22 = loc("in_ptr1"(#loc))
+#loc23 = loc("in_ptr2"(#loc))
+#loc24 = loc("out_ptr0"(#loc))
+#loc25 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_add_mul_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %x0 = arith.constant dense<4096> : tensor<512xi32> loc(#loc26)
+    %c512_i32 = arith.constant 512 : i32 loc(#loc2)
+    %xoffset = tt.get_program_id x : i32 loc(#loc27)
+    %xoffset_0 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc28)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc29)
+    %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<512xi32> loc(#loc30)
+    %xindex_2 = arith.addi %xindex_1, %xindex : tensor<512xi32> loc(#loc30)
+    %x0_3 = arith.remsi %xindex_2, %x0 : tensor<512xi32> loc(#loc26)
+    %tmp0 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc31)
+    %tmp0_4 = tt.addptr %tmp0, %xindex_2 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc31)
+    %tmp0_5 = tt.load %tmp0_4 : tensor<512x!tt.ptr<bf16>> loc(#loc32)
+    %tmp0_6 = arith.extf %tmp0_5 : tensor<512xbf16> to tensor<512xf32> loc(#loc33)
+    %tmp1 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc34)
+    %tmp1_7 = tt.addptr %tmp1, %x0_3 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc34)
+    %tmp1_8 = tt.load %tmp1_7 evictionPolicy = evict_last : tensor<512x!tt.ptr<bf16>> loc(#loc35)
+    %tmp1_9 = arith.extf %tmp1_8 : tensor<512xbf16> to tensor<512xf32> loc(#loc36)
+    %tmp2 = tt.splat %in_ptr2 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc37)
+    %tmp2_10 = tt.addptr %tmp2, %xindex_2 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc37)
+    %tmp2_11 = tt.load %tmp2_10 : tensor<512x!tt.ptr<bf16>> loc(#loc38)
+    %tmp2_12 = arith.extf %tmp2_11 : tensor<512xbf16> to tensor<512xf32> loc(#loc39)
+    %tmp3 = arith.mulf %tmp1_9, %tmp2_12 : tensor<512xf32> loc(#loc40)
+    %tmp4 = arith.addf %tmp0_6, %tmp3 : tensor<512xf32> loc(#loc41)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc18)
+    %1 = tt.addptr %0, %xindex_2 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc18)
+    %2 = arith.truncf %tmp4 : tensor<512xf32> to tensor<512xbf16> loc(#loc19)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>> loc(#loc19)
+    tt.return loc(#loc20)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":24:19)
+#loc2 = loc(unknown)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":20:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":20:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":21:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":21:23)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:30)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:35)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":25:44)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:30)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":26:74)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:30)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:35)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":27:44)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":28:18)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":29:18)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:25)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:36)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/xj/cxj4p53hotvw336w4j6j6nlydwxzrs4hhkj42hoflot2nszqzq3u.py":30:4)
+#loc26 = loc("x0"(#loc1))
+#loc27 = loc("xoffset"(#loc3))
+#loc28 = loc("xoffset"(#loc4))
+#loc29 = loc("xindex"(#loc5))
+#loc30 = loc("xindex"(#loc6))
+#loc31 = loc("tmp0"(#loc7))
+#loc32 = loc("tmp0"(#loc8))
+#loc33 = loc("tmp0"(#loc9))
+#loc34 = loc("tmp1"(#loc10))
+#loc35 = loc("tmp1"(#loc11))
+#loc36 = loc("tmp1"(#loc12))
+#loc37 = loc("tmp2"(#loc13))
+#loc38 = loc("tmp2"(#loc14))
+#loc39 = loc("tmp2"(#loc15))
+#loc40 = loc("tmp3"(#loc16))
+#loc41 = loc("tmp4"(#loc17))
diff --git a/triton/XU5DT2AO5BD5AEHEYGLPP5LRDFHHCUEJT4LGDVLB4STXUGVGHFPA/cuda_utils.cpython-312-x86_64-linux-gnu.so b/triton/XU5DT2AO5BD5AEHEYGLPP5LRDFHHCUEJT4LGDVLB4STXUGVGHFPA/cuda_utils.cpython-312-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..344c2b22830c5f156cc59b0e20ab9b830ae50702
Binary files /dev/null and b/triton/XU5DT2AO5BD5AEHEYGLPP5LRDFHHCUEJT4LGDVLB4STXUGVGHFPA/cuda_utils.cpython-312-x86_64-linux-gnu.so differ
diff --git a/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/__grp__triton_poi_fused_mul_silu_split_0.json b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/__grp__triton_poi_fused_mul_silu_split_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..0337de1b10392975c7443b05a10d95409f20e9f7
--- /dev/null
+++ b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/__grp__triton_poi_fused_mul_silu_split_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_poi_fused_mul_silu_split_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.source", "triton_poi_fused_mul_silu_split_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.ttir", "triton_poi_fused_mul_silu_split_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.ttgir", "triton_poi_fused_mul_silu_split_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.llir", "triton_poi_fused_mul_silu_split_0.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.ptx", "triton_poi_fused_mul_silu_split_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.cubin", "triton_poi_fused_mul_silu_split_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.json"}}
\ No newline at end of file
diff --git a/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.cubin b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..c0c97225d45c76ada78d2df90ec7162cd0aba037
Binary files /dev/null and b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.cubin differ
diff --git a/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.json b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..b9701a1a7f1da59c13c48181716cc878e0c24145
--- /dev/null
+++ b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.json
@@ -0,0 +1 @@
+{"hash": "c6f3131bd32cb47a6710d13823be1ffa9aa7e102a269832edc2d37a1ea3b751d", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_mul_silu_split_0"}
\ No newline at end of file
diff --git a/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.llir b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..340f5c2a2f3c09dbf34df6f738929c582a15fd58
--- /dev/null
+++ b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.llir
@@ -0,0 +1,102 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_poi_fused_mul_silu_split_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, ptr addrspace(1) readnone captures(none) %3, ptr addrspace(1) readnone captures(none) %4) local_unnamed_addr #0 !dbg !4 {
+  %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %7 = shl i32 %6, 9, !dbg !8
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %9 = shl nuw nsw i32 %8, 1, !dbg !9
+  %10 = and i32 %9, 510, !dbg !9
+  %11 = or disjoint i32 %10, %7, !dbg !10
+  %12 = srem i32 %11, 12288, !dbg !11
+  %13 = sub nsw i32 %11, %12, !dbg !11
+  %14 = add i32 %13, %11, !dbg !11
+  %15 = sext i32 %14 to i64, !dbg !12
+  %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !12
+  %17 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %16) #3, !dbg !13
+  %18 = bitcast i32 %17 to <2 x bfloat>, !dbg !13
+  %19 = add i32 %14, 12288, !dbg !14
+  %20 = sext i32 %19 to i64, !dbg !15
+  %21 = getelementptr bfloat, ptr addrspace(1) %0, i64 %20, !dbg !15
+  %22 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %21) #3, !dbg !16
+  %23 = bitcast i32 %22 to <2 x bfloat>, !dbg !16
+  %24 = sext i32 %11 to i64, !dbg !17
+  %25 = getelementptr bfloat, ptr addrspace(1) %1, i64 %24, !dbg !17
+  %26 = fpext <2 x bfloat> %18 to <2 x float>, !dbg !18
+  %27 = fpext <2 x bfloat> %23 to <2 x float>, !dbg !19
+  %28 = extractelement <2 x float> %26, i64 0, !dbg !20
+  %29 = fsub float 0.000000e+00, %28, !dbg !20
+  %30 = extractelement <2 x float> %26, i64 1, !dbg !20
+  %31 = fsub float 0.000000e+00, %30, !dbg !20
+  %32 = fmul float %29, 0x3FF7154760000000, !dbg !25
+  %33 = tail call float @llvm.nvvm.ex2.approx.f(float %32), !dbg !25
+  %34 = fmul float %31, 0x3FF7154760000000, !dbg !25
+  %35 = tail call float @llvm.nvvm.ex2.approx.f(float %34), !dbg !25
+  %36 = fadd float %33, 1.000000e+00, !dbg !26
+  %37 = fadd float %35, 1.000000e+00, !dbg !26
+  %38 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %36), !dbg !27
+  %39 = tail call float @llvm.nvvm.div.full(float 1.000000e+00, float %37), !dbg !27
+  %40 = insertelement <2 x float> poison, float %38, i64 0, !dbg !28
+  %41 = insertelement <2 x float> %40, float %39, i64 1, !dbg !28
+  %42 = fmul <2 x float> %41, %26, !dbg !28
+  %43 = fmul <2 x float> %42, %27, !dbg !29
+  %44 = fptrunc <2 x float> %43 to <2 x bfloat>, !dbg !30
+  %45 = bitcast <2 x bfloat> %44 to i32, !dbg !30
+  tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %45, ptr addrspace(1) %25) #3, !dbg !30
+  ret void, !dbg !31
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.ex2.approx.f(float) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #2
+
+attributes #0 = { nounwind "nvvm.reqntid"="256" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #3 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_poi_fused_mul_silu_split_0", linkageName: "triton_poi_fused_mul_silu_split_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 20, column: 28, scope: !4)
+!8 = !DILocation(line: 20, column: 33, scope: !4)
+!9 = !DILocation(line: 21, column: 36, scope: !4)
+!10 = !DILocation(line: 21, column: 23, scope: !4)
+!11 = !DILocation(line: 26, column: 35, scope: !4)
+!12 = !DILocation(line: 26, column: 30, scope: !4)
+!13 = !DILocation(line: 26, column: 46, scope: !4)
+!14 = !DILocation(line: 27, column: 43, scope: !4)
+!15 = !DILocation(line: 27, column: 30, scope: !4)
+!16 = !DILocation(line: 27, column: 54, scope: !4)
+!17 = !DILocation(line: 33, column: 25, scope: !4)
+!18 = !DILocation(line: 26, column: 55, scope: !4)
+!19 = !DILocation(line: 27, column: 63, scope: !4)
+!20 = !DILocation(line: 50, column: 30, scope: !21, inlinedAt: !23)
+!21 = distinct !DILexicalBlockFile(scope: !4, file: !22, discriminator: 0)
+!22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!23 = !DILocation(line: 29, column: 22, scope: !24)
+!24 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0)
+!25 = !DILocation(line: 50, column: 29, scope: !21, inlinedAt: !23)
+!26 = !DILocation(line: 50, column: 20, scope: !21, inlinedAt: !23)
+!27 = !DILocation(line: 50, column: 16, scope: !21, inlinedAt: !23)
+!28 = !DILocation(line: 30, column: 18, scope: !4)
+!29 = !DILocation(line: 32, column: 18, scope: !4)
+!30 = !DILocation(line: 33, column: 36, scope: !4)
+!31 = !DILocation(line: 33, column: 4, scope: !4)
diff --git a/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.ptx b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..ce38fb817fb5836290e537af53a923fd6aab8b0f
--- /dev/null
+++ b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.ptx
@@ -0,0 +1,437 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_poi_fused_mul_silu_split_0 // -- Begin function triton_poi_fused_mul_silu_split_0
+                                        // @triton_poi_fused_mul_silu_split_0
+.visible .entry triton_poi_fused_mul_silu_split_0(
+	.param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_1,
+	.param .u32 triton_poi_fused_mul_silu_split_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_poi_fused_mul_silu_split_0_param_4
+)
+.reqntid 256
+{
+	.reg .b16 	%rs<5>;
+	.reg .b32 	%r<36>;
+	.reg .b64 	%rd<6>;
+	.loc	1 18 0                          // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd4, [triton_poi_fused_mul_silu_split_0_param_0];
+	ld.param.b64 	%rd5, [triton_poi_fused_mul_silu_split_0_param_1];
+$L__tmp0:
+	.loc	1 20 28                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:20:28
+	mov.u32 	%r4, %ctaid.x;
+	.loc	1 20 33                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:20:33
+	shl.b32 	%r5, %r4, 9;
+	.loc	1 21 36                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:21:36
+	mov.u32 	%r6, %tid.x;
+	shl.b32 	%r7, %r6, 1;
+	and.b32 	%r8, %r7, 510;
+	.loc	1 21 23                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:21:23
+	or.b32 	%r9, %r8, %r5;
+	.loc	1 26 35                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:26:35
+	mul.hi.s32 	%r10, %r9, 715827883;
+	shr.u32 	%r11, %r10, 31;
+	shr.u32 	%r12, %r10, 11;
+	add.s32 	%r13, %r12, %r11;
+	mad.lo.s32 	%r14, %r13, 12288, %r9;
+	.loc	1 26 30                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:26:30
+	mad.wide.s32 	%rd1, %r14, 2, %rd4;
+	.loc	1 26 46                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:26:46
+	// begin inline asm
+	mov.u32 %r1, 0x0;
+	ld.global.b32 { %r1 }, [ %rd1 + 0 ];
+	// end inline asm
+	.loc	1 27 43                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:27:43
+	add.s32 	%r15, %r14, 12288;
+	.loc	1 27 30                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:27:30
+	mad.wide.s32 	%rd2, %r15, 2, %rd4;
+	.loc	1 27 54                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:27:54
+	// begin inline asm
+	mov.u32 %r2, 0x0;
+	ld.global.b32 { %r2 }, [ %rd2 + 0 ];
+	// end inline asm
+	.loc	1 33 25                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:33:25
+	mad.wide.s32 	%rd3, %r9, 2, %rd5;
+	.loc	1 26 55                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:26:55
+	mov.b32 	{%rs1, %rs2}, %r1;
+	cvt.f32.bf16 	%r16, %rs2;
+	cvt.f32.bf16 	%r17, %rs1;
+	.loc	1 27 63                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:27:63
+	mov.b32 	{%rs3, %rs4}, %r2;
+	cvt.f32.bf16 	%r18, %rs4;
+	cvt.f32.bf16 	%r19, %rs3;
+	mov.b32 	%r20, 0f00000000;
+$L__tmp1:
+	.loc	2 50 30                         // standard.py:50:30 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ]
+	sub.f32 	%r21, %r20, %r17;
+	sub.f32 	%r22, %r20, %r16;
+	.loc	2 50 29                         // standard.py:50:29 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ]
+	mul.f32 	%r23, %r21, 0f3FB8AA3B;
+	ex2.approx.f32 	%r24, %r23;
+	mul.f32 	%r25, %r22, 0f3FB8AA3B;
+	ex2.approx.f32 	%r26, %r25;
+	.loc	2 50 20                         // standard.py:50:20 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ]
+	add.f32 	%r27, %r24, 0f3F800000;
+	add.f32 	%r28, %r26, 0f3F800000;
+	mov.b32 	%r29, 0f3F800000;
+	.loc	2 50 16                         // standard.py:50:16 @[ c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:29:22 ]
+	div.full.f32 	%r30, %r29, %r27;
+	div.full.f32 	%r31, %r29, %r28;
+$L__tmp2:
+	.loc	1 30 18                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:30:18
+	mul.f32 	%r32, %r31, %r16;
+	mul.f32 	%r33, %r30, %r17;
+	.loc	1 32 18                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:32:18
+	mul.f32 	%r34, %r33, %r19;
+	mul.f32 	%r35, %r32, %r18;
+	.loc	1 33 36                         // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:33:36
+	cvt.rn.bf16x2.f32 	%r3, %r35, %r34;
+	// begin inline asm
+	st.global.b32 [ %rd3 + 0 ], { %r3 };
+	// end inline asm
+	.loc	1 33 4                          // c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py:33:4
+	ret;
+$L__tmp3:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 307                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x12c DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 54
+.b8 119
+.b8 54
+.b8 115
+.b8 103
+.b8 52
+.b8 118
+.b8 51
+.b8 98
+.b8 99
+.b8 105
+.b8 103
+.b8 104
+.b8 119
+.b8 111
+.b8 107
+.b8 122
+.b8 113
+.b8 54
+.b8 105
+.b8 52
+.b8 51
+.b8 116
+.b8 108
+.b8 53
+.b8 120
+.b8 107
+.b8 53
+.b8 118
+.b8 122
+.b8 55
+.b8 122
+.b8 101
+.b8 118
+.b8 117
+.b8 107
+.b8 55
+.b8 106
+.b8 104
+.b8 118
+.b8 108
+.b8 113
+.b8 121
+.b8 114
+.b8 121
+.b8 121
+.b8 117
+.b8 104
+.b8 117
+.b8 101
+.b8 111
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 54
+.b8 119
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x24 DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 112
+.b8 111
+.b8 105
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 115
+.b8 105
+.b8 108
+.b8 117
+.b8 95
+.b8 115
+.b8 112
+.b8 108
+.b8 105
+.b8 116
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x108:0x2e DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x11d:0x18 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp2                           // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 29                                  // DW_AT_call_line
+.b8 22                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.source b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..6103d581e7ff1cb7c9a2381f8ec66e8a755f808a
--- /dev/null
+++ b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.source
@@ -0,0 +1,129 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":18:0)
+#loc26 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":49:0)
+#loc33 = loc("in_ptr0"(#loc))
+#loc34 = loc("out_ptr0"(#loc))
+#loc35 = loc("xnumel"(#loc))
+#loc58 = loc("x"(#loc26))
+module {
+  tt.func public @triton_poi_fused_mul_silu_split_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 25165824 : i32 loc(#loc36)
+    %xoffset = tt.get_program_id x : i32 loc(#loc37)
+    %xoffset_1 = arith.constant 512 : i32 loc(#loc38)
+    %xoffset_2 = arith.constant 512 : i32 loc(#loc38)
+    %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc38)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc39)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc40)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc40)
+    %xmask = arith.constant true loc(#loc41)
+    %xmask_6 = arith.constant dense<true> : tensor<512xi1> loc(#loc41)
+    %x0 = arith.constant 12288 : i32 loc(#loc42)
+    %x0_7 = arith.constant 12288 : i32 loc(#loc42)
+    %x0_8 = arith.constant dense<12288> : tensor<512xi32> loc(#loc42)
+    %x0_9 = arith.remsi %xindex_5, %x0_8 : tensor<512xi32> loc(#loc42)
+    %x1 = arith.constant 12288 : i32 loc(#loc43)
+    %x1_10 = arith.constant 12288 : i32 loc(#loc43)
+    %x1_11 = arith.constant dense<12288> : tensor<512xi32> loc(#loc43)
+    %x1_12 = arith.divsi %xindex_5, %x1_11 : tensor<512xi32> loc(#loc43)
+    %tmp0 = arith.constant 24576 : i32 loc(#loc44)
+    %tmp0_13 = arith.constant 24576 : i32 loc(#loc44)
+    %tmp0_14 = arith.constant dense<24576> : tensor<512xi32> loc(#loc44)
+    %tmp0_15 = arith.muli %tmp0_14, %x1_12 : tensor<512xi32> loc(#loc44)
+    %tmp0_16 = arith.addi %x0_9, %tmp0_15 : tensor<512xi32> loc(#loc45)
+    %tmp0_17 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc46)
+    %tmp0_18 = tt.addptr %tmp0_17, %tmp0_16 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc46)
+    %tmp0_19 = tt.load %tmp0_18 : tensor<512x!tt.ptr<bf16>> loc(#loc47)
+    %tmp0_20 = arith.extf %tmp0_19 : tensor<512xbf16> to tensor<512xf32> loc(#loc48)
+    %tmp5 = arith.constant 12288 : i32 loc(#loc49)
+    %tmp5_21 = arith.constant 12288 : i32 loc(#loc49)
+    %tmp5_22 = arith.constant dense<12288> : tensor<512xi32> loc(#loc49)
+    %tmp5_23 = arith.addi %tmp5_22, %x0_9 : tensor<512xi32> loc(#loc49)
+    %tmp5_24 = arith.constant 24576 : i32 loc(#loc50)
+    %tmp5_25 = arith.constant 24576 : i32 loc(#loc50)
+    %tmp5_26 = arith.constant dense<24576> : tensor<512xi32> loc(#loc50)
+    %tmp5_27 = arith.muli %tmp5_26, %x1_12 : tensor<512xi32> loc(#loc50)
+    %tmp5_28 = arith.addi %tmp5_23, %tmp5_27 : tensor<512xi32> loc(#loc51)
+    %tmp5_29 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc52)
+    %tmp5_30 = tt.addptr %tmp5_29, %tmp5_28 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc52)
+    %tmp5_31 = tt.load %tmp5_30 : tensor<512x!tt.ptr<bf16>> loc(#loc53)
+    %tmp5_32 = arith.extf %tmp5_31 : tensor<512xbf16> to tensor<512xf32> loc(#loc54)
+    %tmp2 = tt.call @triton.language.standard.sigmoid__fp32S512S__(%tmp0_20) : (tensor<512xf32>) -> tensor<512xf32> loc(#loc55)
+    %tmp3 = arith.mulf %tmp0_20, %tmp2 : tensor<512xf32> loc(#loc56)
+    %tmp6 = arith.mulf %tmp3, %tmp5_32 : tensor<512xf32> loc(#loc57)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc23)
+    %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc23)
+    %2 = arith.truncf %tmp6 : tensor<512xf32> to tensor<512xbf16> loc(#loc24)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>> loc(#loc24)
+    tt.return loc(#loc25)
+  } loc(#loc)
+  tt.func private @triton.language.standard.sigmoid__fp32S512S__(%x: tensor<512xf32> loc("x"(#loc26))) -> tensor<512xf32> attributes {noinline = false} {
+    %cst = arith.constant 0.000000e+00 : f32 loc(#loc27)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc27)
+    %0 = arith.subf %cst_0, %x : tensor<512xf32> loc(#loc27)
+    %1 = math.exp %0 : tensor<512xf32> loc(#loc28)
+    %c1_i32 = arith.constant 1 : i32 loc(#loc29)
+    %cst_1 = arith.constant 1.000000e+00 : f32 loc(#loc29)
+    %cst_2 = arith.constant dense<1.000000e+00> : tensor<512xf32> loc(#loc29)
+    %2 = arith.addf %cst_2, %1 : tensor<512xf32> loc(#loc29)
+    %c1_i32_3 = arith.constant 1 : i32 loc(#loc30)
+    %cst_4 = arith.constant 1.000000e+00 : f32 loc(#loc30)
+    %cst_5 = arith.constant dense<1.000000e+00> : tensor<512xf32> loc(#loc30)
+    %3 = arith.divf %cst_5, %2 : tensor<512xf32> loc(#loc30)
+    tt.return %3 : tensor<512xf32> loc(#loc31)
+  ^bb1:  // no predecessors
+    %4 = ub.poison : tensor<512xf32> loc(#loc32)
+    tt.return %4 : tensor<512xf32> loc(#loc32)
+  } loc(#loc26)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":22:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":23:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":24:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:41)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:35)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:30)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:46)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:55)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:38)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:49)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:43)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:30)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:54)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:63)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":29:22)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":30:18)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":32:18)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:25)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:36)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:4)
+#loc27 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30)
+#loc28 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29)
+#loc29 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20)
+#loc30 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16)
+#loc31 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:11)
+#loc32 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:4)
+#loc36 = loc("xnumel"(#loc1))
+#loc37 = loc("xoffset"(#loc2))
+#loc38 = loc("xoffset"(#loc3))
+#loc39 = loc("xindex"(#loc4))
+#loc40 = loc("xindex"(#loc5))
+#loc41 = loc("xmask"(#loc6))
+#loc42 = loc("x0"(#loc7))
+#loc43 = loc("x1"(#loc8))
+#loc44 = loc("tmp0"(#loc9))
+#loc45 = loc("tmp0"(#loc10))
+#loc46 = loc("tmp0"(#loc11))
+#loc47 = loc("tmp0"(#loc12))
+#loc48 = loc("tmp0"(#loc13))
+#loc49 = loc("tmp5"(#loc14))
+#loc50 = loc("tmp5"(#loc15))
+#loc51 = loc("tmp5"(#loc16))
+#loc52 = loc("tmp5"(#loc17))
+#loc53 = loc("tmp5"(#loc18))
+#loc54 = loc("tmp5"(#loc19))
+#loc55 = loc("tmp2"(#loc20))
+#loc56 = loc("tmp3"(#loc21))
+#loc57 = loc("tmp6"(#loc22))
diff --git a/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.ttgir b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..8e0600ab4273e4a88e1c06e6a65318b4a7965298
--- /dev/null
+++ b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.ttgir
@@ -0,0 +1,93 @@
+#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":18:0)
+#loc28 = loc("in_ptr0"(#loc))
+#loc29 = loc("out_ptr0"(#loc))
+#loc30 = loc("xnumel"(#loc))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_poi_fused_mul_silu_split_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<24576> : tensor<512xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<12288> : tensor<512xi32, #blocked> loc(#loc1)
+    %c512_i32 = arith.constant 512 : i32 loc(#loc1)
+    %cst_1 = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<1.000000e+00> : tensor<512xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc31)
+    %xoffset_3 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc32)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc33)
+    %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32, #blocked> loc(#loc34)
+    %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32, #blocked> loc(#loc34)
+    %x0 = arith.remsi %xindex_5, %cst_0 : tensor<512xi32, #blocked> loc(#loc35)
+    %x1 = arith.divsi %xindex_5, %cst_0 : tensor<512xi32, #blocked> loc(#loc36)
+    %tmp0 = arith.muli %x1, %cst : tensor<512xi32, #blocked> loc(#loc37)
+    %tmp0_6 = arith.addi %x0, %tmp0 : tensor<512xi32, #blocked> loc(#loc38)
+    %tmp0_7 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc39)
+    %tmp0_8 = tt.addptr %tmp0_7, %tmp0_6 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc39)
+    %tmp0_9 = tt.load %tmp0_8 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc40)
+    %tmp0_10 = arith.extf %tmp0_9 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc41)
+    %tmp5 = arith.addi %x0, %cst_0 : tensor<512xi32, #blocked> loc(#loc42)
+    %tmp5_11 = arith.addi %tmp5, %tmp0 : tensor<512xi32, #blocked> loc(#loc43)
+    %tmp5_12 = tt.addptr %tmp0_7, %tmp5_11 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc44)
+    %tmp5_13 = tt.load %tmp5_12 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc45)
+    %tmp5_14 = arith.extf %tmp5_13 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc46)
+    %tmp2 = arith.subf %cst_1, %tmp0_10 : tensor<512xf32, #blocked> loc(#loc50)
+    %tmp2_15 = math.exp %tmp2 : tensor<512xf32, #blocked> loc(#loc51)
+    %tmp2_16 = arith.addf %tmp2_15, %cst_2 : tensor<512xf32, #blocked> loc(#loc52)
+    %tmp2_17 = arith.divf %cst_2, %tmp2_16 : tensor<512xf32, #blocked> loc(#loc53)
+    %tmp3 = arith.mulf %tmp0_10, %tmp2_17 : tensor<512xf32, #blocked> loc(#loc48)
+    %tmp6 = arith.mulf %tmp3, %tmp5_14 : tensor<512xf32, #blocked> loc(#loc49)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc25)
+    %1 = tt.addptr %0, %xindex_5 : tensor<512x!tt.ptr<bf16>, #blocked>, tensor<512xi32, #blocked> loc(#loc25)
+    %2 = arith.truncf %tmp6 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> loc(#loc26)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>, #blocked> loc(#loc26)
+    tt.return loc(#loc27)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":20:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":20:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":21:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":21:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":23:19)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":24:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:41)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:35)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:30)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:46)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:55)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:38)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:30)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:54)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:63)
+#loc18 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":29:22)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29)
+#loc21 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":30:18)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":32:18)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:25)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:36)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:4)
+#loc31 = loc("xoffset"(#loc2))
+#loc32 = loc("xoffset"(#loc3))
+#loc33 = loc("xindex"(#loc4))
+#loc34 = loc("xindex"(#loc5))
+#loc35 = loc("x0"(#loc6))
+#loc36 = loc("x1"(#loc7))
+#loc37 = loc("tmp0"(#loc8))
+#loc38 = loc("tmp0"(#loc9))
+#loc39 = loc("tmp0"(#loc10))
+#loc40 = loc("tmp0"(#loc11))
+#loc41 = loc("tmp0"(#loc12))
+#loc42 = loc("tmp5"(#loc13))
+#loc43 = loc("tmp5"(#loc14))
+#loc44 = loc("tmp5"(#loc15))
+#loc45 = loc("tmp5"(#loc16))
+#loc46 = loc("tmp5"(#loc17))
+#loc47 = loc("tmp2"(#loc19))
+#loc48 = loc("tmp3"(#loc23))
+#loc49 = loc("tmp6"(#loc24))
+#loc50 = loc(callsite(#loc18 at #loc47))
+#loc51 = loc(callsite(#loc20 at #loc47))
+#loc52 = loc(callsite(#loc21 at #loc47))
+#loc53 = loc(callsite(#loc22 at #loc47))
diff --git a/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.ttir b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..afe4785041347dbb9318e32fa3fc56ae1cc88555
--- /dev/null
+++ b/triton/Y3ZRGG6TFS2HUZYQ2E4CHPQ77KNKPYICUJUYGLW4FU32D2R3OUOQ/triton_poi_fused_mul_silu_split_0.ttir
@@ -0,0 +1,93 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":18:0)
+#loc28 = loc("in_ptr0"(#loc))
+#loc29 = loc("out_ptr0"(#loc))
+#loc30 = loc("xnumel"(#loc))
+module {
+  tt.func public @triton_poi_fused_mul_silu_split_0(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} {
+    %tmp2 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc50)
+    %tmp2_0 = arith.constant dense<1.000000e+00> : tensor<512xf32> loc(#loc51)
+    %cst = arith.constant dense<24576> : tensor<512xi32> loc(#loc3)
+    %cst_1 = arith.constant dense<12288> : tensor<512xi32> loc(#loc3)
+    %c512_i32 = arith.constant 512 : i32 loc(#loc3)
+    %xoffset = tt.get_program_id x : i32 loc(#loc32)
+    %xoffset_2 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc33)
+    %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc34)
+    %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<512xi32> loc(#loc35)
+    %xindex_4 = arith.addi %xindex_3, %xindex : tensor<512xi32> loc(#loc35)
+    %x0 = arith.remsi %xindex_4, %cst_1 : tensor<512xi32> loc(#loc36)
+    %x1 = arith.divsi %xindex_4, %cst_1 : tensor<512xi32> loc(#loc37)
+    %tmp0 = arith.muli %x1, %cst : tensor<512xi32> loc(#loc38)
+    %tmp0_5 = arith.addi %x0, %tmp0 : tensor<512xi32> loc(#loc39)
+    %tmp0_6 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc40)
+    %tmp0_7 = tt.addptr %tmp0_6, %tmp0_5 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc40)
+    %tmp0_8 = tt.load %tmp0_7 : tensor<512x!tt.ptr<bf16>> loc(#loc41)
+    %tmp0_9 = arith.extf %tmp0_8 : tensor<512xbf16> to tensor<512xf32> loc(#loc42)
+    %tmp5 = arith.addi %x0, %cst_1 : tensor<512xi32> loc(#loc43)
+    %tmp5_10 = arith.addi %tmp5, %tmp0 : tensor<512xi32> loc(#loc44)
+    %tmp5_11 = tt.addptr %tmp0_6, %tmp5_10 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc45)
+    %tmp5_12 = tt.load %tmp5_11 : tensor<512x!tt.ptr<bf16>> loc(#loc46)
+    %tmp5_13 = arith.extf %tmp5_12 : tensor<512xbf16> to tensor<512xf32> loc(#loc47)
+    %tmp2_14 = arith.subf %tmp2, %tmp0_9 : tensor<512xf32> loc(#loc50)
+    %tmp2_15 = math.exp %tmp2_14 : tensor<512xf32> loc(#loc52)
+    %tmp2_16 = arith.addf %tmp2_15, %tmp2_0 : tensor<512xf32> loc(#loc53)
+    %tmp2_17 = arith.divf %tmp2_0, %tmp2_16 : tensor<512xf32> loc(#loc54)
+    %tmp3 = arith.mulf %tmp0_9, %tmp2_17 : tensor<512xf32> loc(#loc48)
+    %tmp6 = arith.mulf %tmp3, %tmp5_13 : tensor<512xf32> loc(#loc49)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<bf16> -> tensor<512x!tt.ptr<bf16>> loc(#loc25)
+    %1 = tt.addptr %0, %xindex_4 : tensor<512x!tt.ptr<bf16>>, tensor<512xi32> loc(#loc25)
+    %2 = arith.truncf %tmp6 : tensor<512xf32> to tensor<512xbf16> loc(#loc26)
+    tt.store %1, %2 : tensor<512x!tt.ptr<bf16>> loc(#loc26)
+    tt.return loc(#loc27)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:30)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":29:22)
+#loc3 = loc(unknown)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":20:28)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":20:33)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":21:36)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":21:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":23:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":24:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:41)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:35)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:30)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:46)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":26:55)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:38)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:43)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:30)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:54)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":27:63)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:29)
+#loc21 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:20)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":50:16)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":30:18)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":32:18)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:25)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:36)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/6w/c6w6sg4v3bcighwokzq6i43tl5xk5vz7zevuk7jhvlqyryyuhueo.py":33:4)
+#loc31 = loc("tmp2"(#loc2))
+#loc32 = loc("xoffset"(#loc4))
+#loc33 = loc("xoffset"(#loc5))
+#loc34 = loc("xindex"(#loc6))
+#loc35 = loc("xindex"(#loc7))
+#loc36 = loc("x0"(#loc8))
+#loc37 = loc("x1"(#loc9))
+#loc38 = loc("tmp0"(#loc10))
+#loc39 = loc("tmp0"(#loc11))
+#loc40 = loc("tmp0"(#loc12))
+#loc41 = loc("tmp0"(#loc13))
+#loc42 = loc("tmp0"(#loc14))
+#loc43 = loc("tmp5"(#loc15))
+#loc44 = loc("tmp5"(#loc16))
+#loc45 = loc("tmp5"(#loc17))
+#loc46 = loc("tmp5"(#loc18))
+#loc47 = loc("tmp5"(#loc19))
+#loc48 = loc("tmp3"(#loc23))
+#loc49 = loc("tmp6"(#loc24))
+#loc50 = loc(callsite(#loc1 at #loc31))
+#loc51 = loc(callsite(#loc3 at #loc31))
+#loc52 = loc(callsite(#loc20 at #loc31))
+#loc53 = loc(callsite(#loc21 at #loc31))
+#loc54 = loc(callsite(#loc22 at #loc31))
diff --git a/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/__grp__triton_red_fused__fused_rms_norm_view_1.json b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/__grp__triton_red_fused__fused_rms_norm_view_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..08b02f890088594cfc35b1134a496ca1b1895c58
--- /dev/null
+++ b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/__grp__triton_red_fused__fused_rms_norm_view_1.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused__fused_rms_norm_view_1.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.source", "triton_red_fused__fused_rms_norm_view_1.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.ttir", "triton_red_fused__fused_rms_norm_view_1.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.ttgir", "triton_red_fused__fused_rms_norm_view_1.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.llir", "triton_red_fused__fused_rms_norm_view_1.ptx": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.ptx", "triton_red_fused__fused_rms_norm_view_1.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.cubin", "triton_red_fused__fused_rms_norm_view_1.json": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.json"}}
\ No newline at end of file
diff --git a/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.cubin b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..a4b30fba142c80477a85634e7287900bd3e88627
Binary files /dev/null and b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.cubin differ
diff --git a/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.json b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..2040ecea201391605f9d5162a87576c64e851aa3
--- /dev/null
+++ b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.json
@@ -0,0 +1 @@
+{"hash": "c4cc8035fbb91452869d41f5db1b945e18a038a0db3b2fb2ef809320338cf27a", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 16, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm_view_1"}
\ No newline at end of file
diff --git a/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.llir b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.llir
new file mode 100644
index 0000000000000000000000000000000000000000..ef2490914e89973042e5ef7b80b78646f89473c2
--- /dev/null
+++ b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.llir
@@ -0,0 +1,136 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused__fused_rms_norm_view_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 {
+  %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7
+  %8 = shl i32 %7, 2, !dbg !8
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9
+  %10 = and i32 %9, 96, !dbg !9
+  %11 = lshr exact i32 %10, 5, !dbg !9
+  %12 = and i32 %9, 3, !dbg !9
+  %13 = or disjoint i32 %11, %8, !dbg !10
+  %14 = or disjoint i32 %8, %12, !dbg !10
+  %15 = shl nuw nsw i32 %9, 2, !dbg !11
+  %16 = and i32 %15, 124, !dbg !11
+  %17 = sdiv i32 %13, 32, !dbg !12
+  %18 = mul i32 %17, 32, !dbg !13
+  %.decomposed = sub i32 %13, %18, !dbg !13
+  %19 = shl nsw i32 %.decomposed, 7, !dbg !14
+  %20 = or disjoint i32 %19, %16, !dbg !15
+  %21 = mul i32 %17, 12288, !dbg !16
+  %22 = add i32 %20, %21, !dbg !17
+  %23 = sext i32 %22 to i64, !dbg !18
+  %24 = getelementptr bfloat, ptr addrspace(1) %0, i64 %23, !dbg !18
+  %25 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !19
+  %26 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %24, i64 %25, i1 true) #4, !dbg !19
+  %27 = extractvalue { i32, i32 } %26, 0, !dbg !19
+  %28 = bitcast i32 %27 to <2 x bfloat>, !dbg !19
+  %29 = extractvalue { i32, i32 } %26, 1, !dbg !19
+  %30 = bitcast i32 %29 to <2 x bfloat>, !dbg !19
+  %31 = extractelement <2 x bfloat> %28, i64 0, !dbg !19
+  %32 = extractelement <2 x bfloat> %28, i64 1, !dbg !19
+  %33 = extractelement <2 x bfloat> %30, i64 0, !dbg !19
+  %34 = extractelement <2 x bfloat> %30, i64 1, !dbg !19
+  %35 = fpext bfloat %31 to float, !dbg !20
+  %36 = fpext bfloat %32 to float, !dbg !20
+  %37 = fpext bfloat %33 to float, !dbg !20
+  %38 = fpext bfloat %34 to float, !dbg !20
+  %39 = fmul float %35, %35, !dbg !21
+  %40 = fmul float %36, %36, !dbg !21
+  %41 = fmul float %37, %37, !dbg !21
+  %42 = fmul float %38, %38, !dbg !21
+  %43 = fadd float %39, %40, !dbg !22
+  %44 = fadd float %41, %43, !dbg !22
+  %45 = fadd float %42, %44, !dbg !22
+  %46 = bitcast float %45 to i32, !dbg !25
+  %47 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %46, i32 16, i32 31), !dbg !25
+  %48 = bitcast i32 %47 to float, !dbg !25
+  %49 = fadd float %45, %48, !dbg !22
+  %50 = bitcast float %49 to i32, !dbg !25
+  %51 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %50, i32 8, i32 31), !dbg !25
+  %52 = bitcast i32 %51 to float, !dbg !25
+  %53 = fadd float %49, %52, !dbg !22
+  %54 = bitcast float %53 to i32, !dbg !25
+  %55 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %54, i32 4, i32 31), !dbg !25
+  %56 = bitcast i32 %55 to float, !dbg !25
+  %57 = fadd float %53, %56, !dbg !22
+  %58 = bitcast float %57 to i32, !dbg !25
+  %59 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %58, i32 2, i32 31), !dbg !25
+  %60 = bitcast i32 %59 to float, !dbg !25
+  %61 = fadd float %57, %60, !dbg !22
+  %62 = bitcast float %61 to i32, !dbg !25
+  %63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %62, i32 1, i32 31), !dbg !25
+  %64 = bitcast i32 %63 to float, !dbg !25
+  %65 = fadd float %61, %64, !dbg !22
+  %66 = lshr exact i32 %10, 3, !dbg !28
+  %67 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %66, !dbg !28
+  store float %65, ptr addrspace(3) %67, align 4, !dbg !28
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28
+  %68 = shl nuw nsw i32 %12, 2, !dbg !28
+  %69 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %68, !dbg !28
+  %70 = load i32, ptr addrspace(3) %69, align 4, !dbg !28
+  %71 = sext i32 %14 to i64, !dbg !29
+  %72 = getelementptr float, ptr addrspace(1) %1, i64 %71, !dbg !29
+  %73 = and i32 %9, 124, !dbg !30
+  %74 = icmp eq i32 %73, 0, !dbg !30
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %70, ptr addrspace(1) %72, i1 %74) #4, !dbg !30
+  ret void, !dbg !31
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3
+
+attributes #0 = { nounwind "nvvm.reqntid"="128" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { convergent nocallback nounwind }
+attributes #4 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm_view_1", linkageName: "triton_red_fused__fused_rms_norm_view_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 23, column: 28, scope: !4)
+!8 = !DILocation(line: 23, column: 33, scope: !4)
+!9 = !DILocation(line: 24, column: 44, scope: !4)
+!10 = !DILocation(line: 24, column: 23, scope: !4)
+!11 = !DILocation(line: 26, column: 37, scope: !4)
+!12 = !DILocation(line: 29, column: 19, scope: !4)
+!13 = !DILocation(line: 28, column: 19, scope: !4)
+!14 = !DILocation(line: 38, column: 45, scope: !4)
+!15 = !DILocation(line: 38, column: 41, scope: !4)
+!16 = !DILocation(line: 38, column: 56, scope: !4)
+!17 = !DILocation(line: 38, column: 50, scope: !4)
+!18 = !DILocation(line: 38, column: 34, scope: !4)
+!19 = !DILocation(line: 38, column: 61, scope: !4)
+!20 = !DILocation(line: 38, column: 115, scope: !4)
+!21 = !DILocation(line: 40, column: 22, scope: !4)
+!22 = !DILocation(line: 263, column: 15, scope: !23, inlinedAt: !25)
+!23 = distinct !DILexicalBlockFile(scope: !4, file: !24, discriminator: 0)
+!24 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!25 = !DILocation(line: 293, column: 36, scope: !23, inlinedAt: !26)
+!26 = !DILocation(line: 44, column: 25, scope: !27)
+!27 = distinct !DILexicalBlockFile(scope: !4, file: !1, discriminator: 0)
+!28 = !DILocation(line: 44, column: 28, scope: !4)
+!29 = !DILocation(line: 45, column: 25, scope: !4)
+!30 = !DILocation(line: 45, column: 36, scope: !4)
+!31 = !DILocation(line: 45, column: 4, scope: !4)
diff --git a/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.ptx b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..3cccf1d41c7734e1ec2434a7569069bd3403d5ab
--- /dev/null
+++ b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.ptx
@@ -0,0 +1,506 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused__fused_rms_norm_view_1 // -- Begin function triton_red_fused__fused_rms_norm_view_1
+.extern .shared .align 16 .b8 global_smem[];
+                                        // @triton_red_fused__fused_rms_norm_view_1
+.visible .entry triton_red_fused__fused_rms_norm_view_1(
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_1,
+	.param .u32 triton_red_fused__fused_rms_norm_view_1_param_2,
+	.param .u32 triton_red_fused__fused_rms_norm_view_1_param_3,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_4,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm_view_1_param_5
+)
+.reqntid 128
+{
+	.reg .pred 	%p<3>;
+	.reg .b16 	%rs<5>;
+	.reg .b32 	%r<48>;
+	.reg .b64 	%rd<6>;
+	.loc	1 18 0                          // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:18:0
+
+// %bb.0:
+	ld.param.b64 	%rd4, [triton_red_fused__fused_rms_norm_view_1_param_0];
+	ld.param.b64 	%rd5, [triton_red_fused__fused_rms_norm_view_1_param_1];
+$L__tmp0:
+	.loc	1 23 28                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:23:28
+	mov.u32 	%r5, %ctaid.x;
+	.loc	1 23 33                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:23:33
+	shl.b32 	%r6, %r5, 2;
+	.loc	1 24 44                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:44
+	mov.u32 	%r7, %tid.x;
+	and.b32 	%r8, %r7, 96;
+	bfe.u32 	%r9, %r7, 5, 2;
+	and.b32 	%r10, %r7, 3;
+	.loc	1 24 23                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:24:23
+	or.b32 	%r11, %r9, %r6;
+	or.b32 	%r12, %r6, %r10;
+	.loc	1 26 37                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:26:37
+	shl.b32 	%r13, %r7, 2;
+	and.b32 	%r14, %r13, 124;
+	.loc	1 29 19                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:29:19
+	bfe.s32 	%r15, %r5, 29, 1;
+	shr.u32 	%r16, %r15, 27;
+	add.s32 	%r17, %r11, %r16;
+	shr.u32 	%r18, %r17, 5;
+	.loc	1 28 19                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:28:19
+	and.b32 	%r19, %r17, 33554400;
+	sub.s32 	%r20, %r11, %r19;
+	.loc	1 38 45                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:45
+	shl.b32 	%r21, %r20, 7;
+	.loc	1 38 41                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:41
+	or.b32 	%r22, %r21, %r14;
+	.loc	1 38 50                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:50
+	mad.lo.s32 	%r23, %r18, 12288, %r22;
+	.loc	1 38 34                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:34
+	mad.wide.s32 	%rd1, %r23, 2, %rd4;
+	.loc	1 38 61                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:61
+	// begin inline asm
+	mov.u64 %rd2, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd2, 1.0;
+	// end inline asm
+	mov.b32 	%r3, 0;
+	mov.pred 	%p1, -1;
+	// begin inline asm
+	mov.u32 %r1, %r3;
+	mov.u32 %r2, %r3;
+	@%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd1 + 0 ], %rd2;
+	// end inline asm
+	mov.b32 	{%rs1, %rs2}, %r1;
+	mov.b32 	{%rs3, %rs4}, %r2;
+	.loc	1 38 115                        // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:38:115
+	cvt.f32.bf16 	%r24, %rs1;
+	cvt.f32.bf16 	%r25, %rs2;
+	cvt.f32.bf16 	%r26, %rs3;
+	cvt.f32.bf16 	%r27, %rs4;
+	.loc	1 40 22                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:40:22
+	mul.f32 	%r28, %r25, %r25;
+$L__tmp1:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	fma.rn.f32 	%r29, %r24, %r24, %r28;
+	fma.rn.f32 	%r30, %r26, %r26, %r29;
+	fma.rn.f32 	%r31, %r27, %r27, %r30;
+$L__tmp2:
+	.loc	2 293 36                        // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ]
+	shfl.sync.bfly.b32 	%r32, %r31, 16, 31, -1;
+$L__tmp3:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	add.f32 	%r33, %r31, %r32;
+$L__tmp4:
+	.loc	2 293 36                        // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ]
+	shfl.sync.bfly.b32 	%r34, %r33, 8, 31, -1;
+$L__tmp5:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	add.f32 	%r35, %r33, %r34;
+$L__tmp6:
+	.loc	2 293 36                        // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ]
+	shfl.sync.bfly.b32 	%r36, %r35, 4, 31, -1;
+$L__tmp7:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	add.f32 	%r37, %r35, %r36;
+$L__tmp8:
+	.loc	2 293 36                        // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ]
+	shfl.sync.bfly.b32 	%r38, %r37, 2, 31, -1;
+$L__tmp9:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	add.f32 	%r39, %r37, %r38;
+$L__tmp10:
+	.loc	2 293 36                        // standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ]
+	shfl.sync.bfly.b32 	%r40, %r39, 1, 31, -1;
+$L__tmp11:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:25 ] ]
+	add.f32 	%r41, %r39, %r40;
+$L__tmp12:
+	.loc	1 44 28                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:44:28
+	shr.u32 	%r42, %r8, 3;
+	mov.b32 	%r43, global_smem;
+	add.s32 	%r44, %r43, %r42;
+	st.shared.b32 	[%r44], %r41;
+	bar.sync 	0;
+	shl.b32 	%r45, %r10, 2;
+	add.s32 	%r46, %r43, %r45;
+	ld.shared.b32 	%r4, [%r46];
+	.loc	1 45 25                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:25
+	mad.wide.s32 	%rd3, %r12, 4, %rd5;
+	.loc	1 45 36                         // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:36
+	and.b32 	%r47, %r7, 124;
+	setp.eq.b32 	%p2, %r47, 0;
+	// begin inline asm
+	@%p2 st.global.b32 [ %rd3 + 0 ], { %r4 };
+	// end inline asm
+	.loc	1 45 4                          // cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py:45:4
+	ret;
+$L__tmp13:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 5                                   // DW_FORM_data2
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 339                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x14c DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 113
+.b8 105
+.b8 116
+.b8 120
+.b8 53
+.b8 104
+.b8 119
+.b8 117
+.b8 112
+.b8 107
+.b8 98
+.b8 106
+.b8 109
+.b8 99
+.b8 115
+.b8 111
+.b8 121
+.b8 107
+.b8 113
+.b8 101
+.b8 112
+.b8 122
+.b8 113
+.b8 99
+.b8 55
+.b8 122
+.b8 99
+.b8 120
+.b8 106
+.b8 99
+.b8 98
+.b8 53
+.b8 97
+.b8 99
+.b8 113
+.b8 107
+.b8 105
+.b8 55
+.b8 122
+.b8 99
+.b8 115
+.b8 106
+.b8 105
+.b8 102
+.b8 114
+.b8 110
+.b8 114
+.b8 122
+.b8 99
+.b8 114
+.b8 114
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 113
+.b8 105
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x2a DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 114
+.b8 109
+.b8 115
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 118
+.b8 105
+.b8 101
+.b8 119
+.b8 95
+.b8 49
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x10e:0x48 DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x123:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp12                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 44                                  // DW_AT_call_line
+.b8 25                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x13b:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp12                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.source b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.source
new file mode 100644
index 0000000000000000000000000000000000000000..d008d1290cf63a8a576c3fae2b114c150f51189b
--- /dev/null
+++ b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.source
@@ -0,0 +1,167 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0)
+#loc33 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0)
+#loc35 = loc(unknown)
+#loc38 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0)
+#loc42 = loc("in_ptr0"(#loc))
+#loc43 = loc("out_ptr0"(#loc))
+#loc44 = loc("xnumel"(#loc))
+#loc45 = loc("r0_numel"(#loc))
+#loc74 = loc("input"(#loc33))
+#loc75 = loc("a"(#loc38))
+#loc76 = loc("b"(#loc38))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 65536 : i32 loc(#loc46)
+    %r0_numel_1 = arith.constant 128 : i32 loc(#loc47)
+    %xoffset = tt.get_program_id x : i32 loc(#loc48)
+    %xoffset_2 = arith.constant 4 : i32 loc(#loc49)
+    %xoffset_3 = arith.constant 4 : i32 loc(#loc49)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc49)
+    %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc50)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc51)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<4x1xi32> loc(#loc52)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<4x1xi32> loc(#loc52)
+    %xmask = arith.constant true loc(#loc53)
+    %xmask_8 = arith.constant dense<true> : tensor<4x128xi1> loc(#loc53)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc54)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc55)
+    %x0 = arith.constant 32 : i32 loc(#loc56)
+    %x0_10 = arith.constant 32 : i32 loc(#loc56)
+    %x0_11 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc56)
+    %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<4x1xi32> loc(#loc56)
+    %x1 = arith.constant 32 : i32 loc(#loc57)
+    %x1_13 = arith.constant 32 : i32 loc(#loc57)
+    %x1_14 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc57)
+    %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<4x1xi32> loc(#loc57)
+    %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc58)
+    %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc58)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc14)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc14)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc14)
+    %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc14)
+    %3 = ub.poison : i32 loc(#loc14)
+    %_tmp4_17 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_19 = %_tmp4_16) -> (tensor<4x128xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc60)
+      %r0_index_20 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc60)
+      %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc61)
+      %r0_mask_21 = arith.cmpi slt, %r0_index_20, %r0_mask : tensor<1x128xi32> loc(#loc61)
+      %tmp0 = arith.constant 128 : i32 loc(#loc62)
+      %tmp0_22 = arith.constant 128 : i32 loc(#loc62)
+      %tmp0_23 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc62)
+      %tmp0_24 = arith.muli %tmp0_23, %x0_12 : tensor<4x1xi32> loc(#loc62)
+      %tmp0_25 = tt.broadcast %r0_index_20 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc63)
+      %tmp0_26 = tt.broadcast %tmp0_24 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc63)
+      %tmp0_27 = arith.addi %tmp0_25, %tmp0_26 : tensor<4x128xi32> loc(#loc63)
+      %tmp0_28 = arith.constant 12288 : i32 loc(#loc64)
+      %tmp0_29 = arith.constant 12288 : i32 loc(#loc64)
+      %tmp0_30 = arith.constant dense<12288> : tensor<4x1xi32> loc(#loc64)
+      %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<4x1xi32> loc(#loc64)
+      %tmp0_32 = tt.broadcast %tmp0_31 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc65)
+      %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<4x128xi32> loc(#loc65)
+      %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc66)
+      %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc66)
+      %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc67)
+      %tmp0_37 = tt.broadcast %r0_mask_21 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc67)
+      %tmp0_38 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc67)
+      %tmp0_39 = arith.truncf %tmp0_38 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc67)
+      %tmp0_40 = tt.load %tmp0_35, %tmp0_37, %tmp0_39 evictionPolicy = evict_first : tensor<4x128x!tt.ptr<bf16>> loc(#loc67)
+      %tmp0_41 = arith.extf %tmp0_40 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc68)
+      %tmp2 = arith.mulf %tmp0_41, %tmp0_41 : tensor<4x128xf32> loc(#loc69)
+      %tmp5 = arith.addf %_tmp4_19, %tmp2 : tensor<4x128xf32> loc(#loc70)
+      %_tmp4_42 = tt.broadcast %r0_mask_21 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc71)
+      %_tmp4_43 = arith.select %_tmp4_42, %tmp5, %_tmp4_19 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc71)
+      scf.yield %_tmp4_43 : tensor<4x128xf32> loc(#loc27)
+    } loc(#loc59)
+    %tmp4 = tt.call @"triton.language.standard.sum__fp32S4_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_17) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc72)
+    %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc73)
+    %4 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<4x1x!tt.ptr<f32>> loc(#loc30)
+    %5 = tt.addptr %4, %xindex_7 : tensor<4x1x!tt.ptr<f32>>, tensor<4x1xi32> loc(#loc30)
+    tt.store %5, %tmp4_18 : tensor<4x1x!tt.ptr<f32>> loc(#loc31)
+    tt.return loc(#loc32)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.sum__fp32S4_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<4x128xf32> loc("input"(#loc33))) -> tensor<4xf32> attributes {noinline = false} {
+    %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc34)
+      tt.reduce.return %2 : f32 loc(#loc34)
+    }) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc34)
+    tt.return %0 : tensor<4xf32> loc(#loc36)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<4xf32> loc(#loc37)
+    tt.return %1 : tensor<4xf32> loc(#loc37)
+  } loc(#loc33)
+  tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc38)), %b: f32 loc("b"(#loc38))) -> f32 attributes {noinline = false} {
+    %0 = arith.addf %a, %b : f32 loc(#loc39)
+    tt.return %0 : f32 loc(#loc40)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : f32 loc(#loc41)
+    tt.return %1 : f32 loc(#loc41)
+  } loc(#loc38)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":25:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":30:43)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":32:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":33:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:8)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11)
+#loc37 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4)
+#loc39 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc40 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11)
+#loc41 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4)
+#loc46 = loc("xnumel"(#loc1))
+#loc47 = loc("r0_numel"(#loc2))
+#loc48 = loc("xoffset"(#loc3))
+#loc49 = loc("xoffset"(#loc4))
+#loc50 = loc("xindex"(#loc5))
+#loc51 = loc("xindex"(#loc6))
+#loc52 = loc("xindex"(#loc7))
+#loc53 = loc("xmask"(#loc8))
+#loc54 = loc("r0_base"(#loc9))
+#loc55 = loc("r0_base"(#loc10))
+#loc56 = loc("x0"(#loc11))
+#loc57 = loc("x1"(#loc12))
+#loc58 = loc("_tmp4"(#loc13))
+#loc59 = loc("_tmp4"(#loc14))
+#loc60 = loc("r0_index"(#loc15))
+#loc61 = loc("r0_mask"(#loc16))
+#loc62 = loc("tmp0"(#loc17))
+#loc63 = loc("tmp0"(#loc18))
+#loc64 = loc("tmp0"(#loc19))
+#loc65 = loc("tmp0"(#loc20))
+#loc66 = loc("tmp0"(#loc21))
+#loc67 = loc("tmp0"(#loc22))
+#loc68 = loc("tmp0"(#loc23))
+#loc69 = loc("tmp2"(#loc24))
+#loc70 = loc("tmp5"(#loc25))
+#loc71 = loc("_tmp4"(#loc26))
+#loc72 = loc("tmp4"(#loc28))
+#loc73 = loc("tmp4"(#loc29))
diff --git a/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.ttgir b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..c51f2eca3243f828f06f4f54c84c586d715a2d14
--- /dev/null
+++ b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.ttgir
@@ -0,0 +1,108 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 4], order = [0, 1]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0)
+#loc1 = loc(unknown)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25)
+#loc27 = loc("in_ptr0"(#loc))
+#loc28 = loc("out_ptr0"(#loc))
+#loc29 = loc("xnumel"(#loc))
+#loc30 = loc("r0_numel"(#loc))
+#loc49 = loc("tmp4"(#loc21))
+#loc52 = loc(callsite(#loc1 at #loc49))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<128> : tensor<4x1xi32, #blocked> loc(#loc1)
+    %cst_1 = arith.constant dense<12288> : tensor<4x1xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<32> : tensor<4x1xi32, #blocked> loc(#loc1)
+    %c4_i32 = arith.constant 4 : i32 loc(#loc1)
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<4x128xbf16, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<4x128xf32, #blocked> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc31)
+    %xoffset_5 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc32)
+    %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc33)
+    %xindex_6 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc33)
+    %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4x1xi32, #blocked> loc(#loc33)
+    %xindex_8 = tt.expand_dims %xindex_6 {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc33)
+    %xindex_9 = tt.splat %xoffset_5 : i32 -> tensor<4x1xi32, #blocked> loc(#loc34)
+    %xindex_10 = tt.splat %xoffset_5 : i32 -> tensor<4x1xi32, #blocked1> loc(#loc34)
+    %xindex_11 = arith.addi %xindex_9, %xindex_7 : tensor<4x1xi32, #blocked> loc(#loc34)
+    %xindex_12 = arith.addi %xindex_10, %xindex_8 : tensor<4x1xi32, #blocked1> loc(#loc34)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc35)
+    %r0_base_13 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc35)
+    %x0 = arith.remsi %xindex_11, %cst_2 : tensor<4x1xi32, #blocked> loc(#loc36)
+    %x1 = arith.divsi %xindex_11, %cst_2 : tensor<4x1xi32, #blocked> loc(#loc37)
+    %r0_mask = arith.cmpi slt, %r0_base_13, %cst : tensor<1x128xi32, #blocked> loc(#loc38)
+    %tmp0 = arith.muli %x0, %cst_0 : tensor<4x1xi32, #blocked> loc(#loc39)
+    %tmp0_14 = tt.broadcast %r0_base_13 : tensor<1x128xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc40)
+    %tmp0_15 = tt.broadcast %tmp0 : tensor<4x1xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc40)
+    %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<4x128xi32, #blocked> loc(#loc40)
+    %tmp0_17 = arith.muli %x1, %cst_1 : tensor<4x1xi32, #blocked> loc(#loc41)
+    %tmp0_18 = tt.broadcast %tmp0_17 : tensor<4x1xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc42)
+    %tmp0_19 = arith.addi %tmp0_16, %tmp0_18 : tensor<4x128xi32, #blocked> loc(#loc42)
+    %tmp0_20 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>, #blocked> loc(#loc43)
+    %tmp0_21 = tt.addptr %tmp0_20, %tmp0_19 : tensor<4x128x!tt.ptr<bf16>, #blocked>, tensor<4x128xi32, #blocked> loc(#loc43)
+    %tmp0_22 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked> -> tensor<4x128xi1, #blocked> loc(#loc44)
+    %tmp0_23 = tt.load %tmp0_21, %tmp0_22, %cst_3 evictionPolicy = evict_first : tensor<4x128x!tt.ptr<bf16>, #blocked> loc(#loc44)
+    %tmp0_24 = arith.extf %tmp0_23 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc45)
+    %tmp2 = arith.mulf %tmp0_24, %tmp0_24 : tensor<4x128xf32, #blocked> loc(#loc46)
+    %tmp5 = arith.addf %tmp2, %cst_4 : tensor<4x128xf32, #blocked> loc(#loc47)
+    %_tmp4 = arith.select %tmp0_22, %tmp5, %cst_4 : tensor<4x128xi1, #blocked>, tensor<4x128xf32, #blocked> loc(#loc48)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_27: f32 loc(callsite(#loc1 at #loc49)), %tmp4_28: f32 loc(callsite(#loc1 at #loc49))):
+      %tmp4_29 = arith.addf %tmp4_27, %tmp4_28 : f32 loc(#loc53)
+      tt.reduce.return %tmp4_29 : f32 loc(#loc51)
+    }) : (tensor<4x128xf32, #blocked>) -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc51)
+    %tmp4_25 = ttg.convert_layout %tmp4 : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc50)
+    %tmp4_26 = tt.expand_dims %tmp4_25 {axis = 1 : i32} : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xf32, #blocked1> loc(#loc50)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<4x1x!tt.ptr<f32>, #blocked1> loc(#loc24)
+    %1 = tt.addptr %0, %xindex_12 : tensor<4x1x!tt.ptr<f32>, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc24)
+    tt.store %1, %tmp4_26 : tensor<4x1x!tt.ptr<f32>, #blocked1> loc(#loc25)
+    tt.return loc(#loc26)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40)
+#loc20 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4)
+#loc31 = loc("xoffset"(#loc2))
+#loc32 = loc("xoffset"(#loc3))
+#loc33 = loc("xindex"(#loc4))
+#loc34 = loc("xindex"(#loc5))
+#loc35 = loc("r0_base"(#loc6))
+#loc36 = loc("x0"(#loc7))
+#loc37 = loc("x1"(#loc8))
+#loc38 = loc("r0_mask"(#loc9))
+#loc39 = loc("tmp0"(#loc10))
+#loc40 = loc("tmp0"(#loc11))
+#loc41 = loc("tmp0"(#loc12))
+#loc42 = loc("tmp0"(#loc13))
+#loc43 = loc("tmp0"(#loc14))
+#loc44 = loc("tmp0"(#loc15))
+#loc45 = loc("tmp0"(#loc16))
+#loc46 = loc("tmp2"(#loc17))
+#loc47 = loc("tmp5"(#loc18))
+#loc48 = loc("_tmp4"(#loc19))
+#loc50 = loc("tmp4"(#loc23))
+#loc51 = loc(callsite(#loc20 at #loc49))
+#loc53 = loc(callsite(#loc22 at #loc51))
diff --git a/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.ttir b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..a4e88a203c59a9ec64867fc191d00a05bfdcc09b
--- /dev/null
+++ b/triton/YTGIANP3XEKFFBU5IH25WG4ULYMKAOFA3M5S7MXPQCJSAM4M6J5A/triton_red_fused__fused_rms_norm_view_1.ttir
@@ -0,0 +1,105 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":18:0)
+#loc2 = loc(unknown)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:25)
+#loc29 = loc("in_ptr0"(#loc))
+#loc30 = loc("out_ptr0"(#loc))
+#loc31 = loc("xnumel"(#loc))
+#loc32 = loc("r0_numel"(#loc))
+#loc53 = loc("tmp4"(#loc23))
+#loc56 = loc(callsite(#loc2 at #loc53))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm_view_1(%in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %tmp0 = arith.constant dense<0.000000e+00> : tensor<4x128xbf16> loc(#loc33)
+    %cst = arith.constant dense<12288> : tensor<4x1xi32> loc(#loc2)
+    %cst_0 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc2)
+    %cst_1 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc2)
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc2)
+    %cst_3 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc2)
+    %c4_i32 = arith.constant 4 : i32 loc(#loc2)
+    %xoffset = tt.get_program_id x : i32 loc(#loc34)
+    %xoffset_4 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc35)
+    %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc36)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc37)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<4x1xi32> loc(#loc38)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<4x1xi32> loc(#loc38)
+    %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc39)
+    %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc40)
+    %x0 = arith.remsi %xindex_7, %cst_3 : tensor<4x1xi32> loc(#loc41)
+    %x1 = arith.divsi %xindex_7, %cst_3 : tensor<4x1xi32> loc(#loc42)
+    %r0_mask = arith.cmpi slt, %r0_base_8, %cst_1 : tensor<1x128xi32> loc(#loc43)
+    %tmp0_9 = arith.muli %x0, %cst_0 : tensor<4x1xi32> loc(#loc44)
+    %tmp0_10 = tt.broadcast %r0_base_8 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc45)
+    %tmp0_11 = tt.broadcast %tmp0_9 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc45)
+    %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<4x128xi32> loc(#loc45)
+    %tmp0_13 = arith.muli %x1, %cst : tensor<4x1xi32> loc(#loc46)
+    %tmp0_14 = tt.broadcast %tmp0_13 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc47)
+    %tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<4x128xi32> loc(#loc47)
+    %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<4x128x!tt.ptr<bf16>> loc(#loc48)
+    %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<4x128x!tt.ptr<bf16>>, tensor<4x128xi32> loc(#loc48)
+    %tmp0_18 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc33)
+    %tmp0_19 = tt.load %tmp0_17, %tmp0_18, %tmp0 evictionPolicy = evict_first : tensor<4x128x!tt.ptr<bf16>> loc(#loc33)
+    %tmp0_20 = arith.extf %tmp0_19 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc49)
+    %tmp2 = arith.mulf %tmp0_20, %tmp0_20 : tensor<4x128xf32> loc(#loc50)
+    %tmp5 = arith.addf %tmp2, %cst_2 : tensor<4x128xf32> loc(#loc51)
+    %_tmp4 = arith.select %tmp0_18, %tmp5, %cst_2 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc52)
+    %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_22: f32 loc(callsite(#loc2 at #loc53)), %tmp4_23: f32 loc(callsite(#loc2 at #loc53))):
+      %tmp4_24 = arith.addf %tmp4_22, %tmp4_23 : f32 loc(#loc57)
+      tt.reduce.return %tmp4_24 : f32 loc(#loc55)
+    }) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc55)
+    %tmp4_21 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc54)
+    %0 = tt.splat %out_ptr0 : !tt.ptr<f32> -> tensor<4x1x!tt.ptr<f32>> loc(#loc26)
+    %1 = tt.addptr %0, %xindex_7 : tensor<4x1x!tt.ptr<f32>>, tensor<4x1xi32> loc(#loc26)
+    tt.store %1, %tmp4_21 : tensor<4x1x!tt.ptr<f32>> loc(#loc27)
+    tt.return loc(#loc28)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:61)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:27)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":26:37)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":28:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":29:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":34:29)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:45)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:41)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:56)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:50)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:34)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":38:115)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":40:22)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":42:23)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":43:40)
+#loc22 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc24 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":44:28)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:25)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:36)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/qi/cqitx5hwupkbjmcsoykqepzqc7zcxjcb5acqki7zcsjifrnrzcrr.py":45:4)
+#loc33 = loc("tmp0"(#loc1))
+#loc34 = loc("xoffset"(#loc3))
+#loc35 = loc("xoffset"(#loc4))
+#loc36 = loc("xindex"(#loc5))
+#loc37 = loc("xindex"(#loc6))
+#loc38 = loc("xindex"(#loc7))
+#loc39 = loc("r0_base"(#loc8))
+#loc40 = loc("r0_base"(#loc9))
+#loc41 = loc("x0"(#loc10))
+#loc42 = loc("x1"(#loc11))
+#loc43 = loc("r0_mask"(#loc12))
+#loc44 = loc("tmp0"(#loc13))
+#loc45 = loc("tmp0"(#loc14))
+#loc46 = loc("tmp0"(#loc15))
+#loc47 = loc("tmp0"(#loc16))
+#loc48 = loc("tmp0"(#loc17))
+#loc49 = loc("tmp0"(#loc18))
+#loc50 = loc("tmp2"(#loc19))
+#loc51 = loc("tmp5"(#loc20))
+#loc52 = loc("_tmp4"(#loc21))
+#loc54 = loc("tmp4"(#loc25))
+#loc55 = loc(callsite(#loc22 at #loc53))
+#loc57 = loc(callsite(#loc24 at #loc55))
diff --git a/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..7c23b6862e6f9974cc6a5ac9efa3c882e9db6f61
--- /dev/null
+++ b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/__grp__triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
@@ -0,0 +1 @@
+{"child_paths": {"triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx": 
"/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin", "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json": "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json"}}
\ No newline at end of file
diff --git a/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..d5cc87e55e0fff582530c02def97753686a3694b
Binary files /dev/null and b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.cubin differ
diff --git a/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..9696e3636d7e53ec7dfac85e4e4291c7dd2fc6e7
--- /dev/null
+++ b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.json
@@ -0,0 +1 @@
+{"hash": "c4d2c743a45695d953ff668c5df14e4d1263c6479b49b1fd4aa8f2c88a3f17f7", "target": {"backend": "cuda", "arch": 89, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "enable_reflect_ftz": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": [], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee", "bf16x3", "bf16x6"], "max_num_imprecise_acc_default": 0, "extern_libs": [["libdevice", "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm89", "instrumentation_mode": "", "triton_version": "3.6.0", "tensordesc_meta": [], "shared": 2048, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0"}
\ No newline at end of file
diff --git a/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir
new file mode 100644
index 0000000000000000000000000000000000000000..df248f684e0ede3c641c5469aa4e3e660c212bcd
--- /dev/null
+++ b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.llir
@@ -0,0 +1,666 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+; Function Attrs: nounwind
+define ptx_kernel void @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 {
+  %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8
+  %13 = shl i32 %12, 3, !dbg !9
+  %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %15 = and i32 %14, 224, !dbg !10
+  %16 = lshr exact i32 %15, 5, !dbg !10
+  %17 = or disjoint i32 %16, %13, !dbg !11
+  %18 = shl nuw nsw i32 %14, 1, !dbg !12
+  %19 = and i32 %18, 62, !dbg !12
+  %20 = sdiv i32 %17, 32, !dbg !13
+  %21 = shl i32 %17, 7
+  %22 = shl i32 %20, 15
+  %23 = add i32 %22, %21
+  %24 = add i32 %23, 4096
+  %25 = zext nneg i32 %19 to i64, !dbg !14
+  %26 = or disjoint i32 %24, %19, !dbg !15
+  %27 = sext i32 %26 to i64, !dbg !16
+  %28 = getelementptr bfloat, ptr addrspace(1) %2, i64 %27, !dbg !16
+  %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17
+  %30 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %28, i64 %29, i1 true) #6, !dbg !17
+  %31 = bitcast i32 %30 to <2 x bfloat>, !dbg !17
+  %32 = extractelement <2 x bfloat> %31, i64 0, !dbg !17
+  %33 = extractelement <2 x bfloat> %31, i64 1, !dbg !17
+  %34 = fpext bfloat %32 to float, !dbg !18
+  %35 = fpext bfloat %33 to float, !dbg !18
+  %36 = or disjoint i32 %23, %19, !dbg !19
+  %37 = sext i32 %36 to i64, !dbg !20
+  %38 = getelementptr bfloat, ptr addrspace(1) %2, i64 %37, !dbg !20
+  %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !21
+  %40 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %38, i64 %39, i1 true) #6, !dbg !21
+  %41 = bitcast i32 %40 to <2 x bfloat>, !dbg !21
+  %42 = extractelement <2 x bfloat> %41, i64 0, !dbg !21
+  %43 = extractelement <2 x bfloat> %41, i64 1, !dbg !21
+  %44 = fpext bfloat %42 to float, !dbg !22
+  %45 = fpext bfloat %43 to float, !dbg !22
+  %46 = fmul float %34, %34, !dbg !23
+  %47 = fmul float %35, %35, !dbg !23
+  %48 = fmul float %44, %44, !dbg !24
+  %49 = fmul float %45, %45, !dbg !24
+  %50 = or disjoint i32 %19, 64, !dbg !25
+  %51 = or disjoint i32 %24, %50, !dbg !15
+  %52 = sext i32 %51 to i64, !dbg !16
+  %53 = getelementptr bfloat, ptr addrspace(1) %2, i64 %52, !dbg !16
+  %54 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !17
+  %55 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %53, i64 %54, i1 true) #6, !dbg !17
+  %56 = bitcast i32 %55 to <2 x bfloat>, !dbg !17
+  %57 = extractelement <2 x bfloat> %56, i64 0, !dbg !17
+  %58 = extractelement <2 x bfloat> %56, i64 1, !dbg !17
+  %59 = fpext bfloat %57 to float, !dbg !18
+  %60 = fpext bfloat %58 to float, !dbg !18
+  %61 = or disjoint i32 %23, %50, !dbg !19
+  %62 = sext i32 %61 to i64, !dbg !20
+  %63 = getelementptr bfloat, ptr addrspace(1) %2, i64 %62, !dbg !20
+  %64 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !21
+  %65 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %63, i64 %64, i1 true) #6, !dbg !21
+  %66 = bitcast i32 %65 to <2 x bfloat>, !dbg !21
+  %67 = extractelement <2 x bfloat> %66, i64 0, !dbg !21
+  %68 = extractelement <2 x bfloat> %66, i64 1, !dbg !21
+  %69 = fpext bfloat %67 to float, !dbg !22
+  %70 = fpext bfloat %68 to float, !dbg !22
+  %71 = fmul float %59, %59, !dbg !23
+  %72 = fmul float %60, %60, !dbg !23
+  %73 = fadd float %46, %71, !dbg !26
+  %74 = fadd float %47, %72, !dbg !26
+  %75 = fmul float %69, %69, !dbg !24
+  %76 = fmul float %70, %70, !dbg !24
+  %77 = fadd float %48, %75, !dbg !27
+  %78 = fadd float %49, %76, !dbg !27
+  %79 = and i32 %14, 7, !dbg !10
+  %80 = or disjoint i32 %13, %79, !dbg !11
+  %81 = and i32 %14, 248, !dbg !12
+  %82 = lshr exact i32 %81, 3, !dbg !12
+  %83 = sdiv i32 %80, 32, !dbg !13
+  %84 = fadd float %73, %74, !dbg !28
+  %85 = bitcast float %84 to i32, !dbg !31
+  %86 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %85, i32 16, i32 31), !dbg !31
+  %87 = bitcast i32 %86 to float, !dbg !31
+  %88 = fadd float %84, %87, !dbg !28
+  %89 = bitcast float %88 to i32, !dbg !31
+  %90 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %89, i32 8, i32 31), !dbg !31
+  %91 = bitcast i32 %90 to float, !dbg !31
+  %92 = fadd float %88, %91, !dbg !28
+  %93 = bitcast float %92 to i32, !dbg !31
+  %94 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %93, i32 4, i32 31), !dbg !31
+  %95 = bitcast i32 %94 to float, !dbg !31
+  %96 = fadd float %92, %95, !dbg !28
+  %97 = bitcast float %96 to i32, !dbg !31
+  %98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %97, i32 2, i32 31), !dbg !31
+  %99 = bitcast i32 %98 to float, !dbg !31
+  %100 = fadd float %96, %99, !dbg !28
+  %101 = bitcast float %100 to i32, !dbg !31
+  %102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 1, i32 31), !dbg !31
+  %103 = bitcast i32 %102 to float, !dbg !31
+  %104 = fadd float %100, %103, !dbg !28
+  %105 = fadd float %77, %78, !dbg !34
+  %106 = bitcast float %105 to i32, !dbg !35
+  %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 16, i32 31), !dbg !35
+  %108 = bitcast i32 %107 to float, !dbg !35
+  %109 = fadd float %105, %108, !dbg !34
+  %110 = bitcast float %109 to i32, !dbg !35
+  %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 8, i32 31), !dbg !35
+  %112 = bitcast i32 %111 to float, !dbg !35
+  %113 = fadd float %109, %112, !dbg !34
+  %114 = bitcast float %113 to i32, !dbg !35
+  %115 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %114, i32 4, i32 31), !dbg !35
+  %116 = bitcast i32 %115 to float, !dbg !35
+  %117 = fadd float %113, %116, !dbg !34
+  %118 = bitcast float %117 to i32, !dbg !35
+  %119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %118, i32 2, i32 31), !dbg !35
+  %120 = bitcast i32 %119 to float, !dbg !35
+  %121 = fadd float %117, %120, !dbg !34
+  %122 = bitcast float %121 to i32, !dbg !35
+  %123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 1, i32 31), !dbg !35
+  %124 = bitcast i32 %123 to float, !dbg !35
+  %125 = fadd float %121, %124, !dbg !34
+  %126 = shl i32 %20, 7, !dbg !37
+  %127 = tail call float @llvm.nvvm.div.full(float %125, float 1.280000e+02), !dbg !38
+  %128 = fadd float %127, 0x3EB0C6F7A0000000, !dbg !39
+  %129 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40
+  %.not.i = icmp eq i32 %129, 0, !dbg !40
+  br i1 %.not.i, label %132, label %130, !dbg !40
+
+130:                                              ; preds = %11
+  %131 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %128), !dbg !40
+  br label %__nv_rsqrtf.exit, !dbg !40
+
+132:                                              ; preds = %11
+  %133 = tail call float @llvm.nvvm.rsqrt.approx.f(float %128), !dbg !40
+  br label %__nv_rsqrtf.exit, !dbg !40
+
+__nv_rsqrtf.exit:                                 ; preds = %130, %132
+  %.0.i = phi float [ %131, %130 ], [ %133, %132 ], !dbg !40
+  %134 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !40
+  %.not.i3 = icmp eq i32 %134, 0, !dbg !40
+  br i1 %.not.i3, label %137, label %135, !dbg !40
+
+135:                                              ; preds = %__nv_rsqrtf.exit
+  %136 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %128), !dbg !40
+  br label %__nv_rsqrtf.exit5, !dbg !40
+
+137:                                              ; preds = %__nv_rsqrtf.exit
+  %138 = tail call float @llvm.nvvm.rsqrt.approx.f(float %128), !dbg !40
+  br label %__nv_rsqrtf.exit5, !dbg !40
+
+__nv_rsqrtf.exit5:                                ; preds = %135, %137
+  %.0.i4 = phi float [ %136, %135 ], [ %138, %137 ], !dbg !40
+  %139 = lshr exact i32 %15, 3, !dbg !41
+  %140 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %139, !dbg !41
+  store float %.0.i, ptr addrspace(3) %140, align 4, !dbg !41
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !41
+  %141 = shl nuw nsw i32 %79, 2, !dbg !41
+  %142 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %141, !dbg !41
+  %143 = load float, ptr addrspace(3) %142, align 4, !dbg !41
+  %144 = tail call float @llvm.nvvm.div.full(float %104, float 1.280000e+02), !dbg !42
+  %145 = fadd float %144, 0x3EB0C6F7A0000000, !dbg !43
+  %146 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
+  %.not.i6 = icmp eq i32 %146, 0, !dbg !44
+  br i1 %.not.i6, label %149, label %147, !dbg !44
+
+147:                                              ; preds = %__nv_rsqrtf.exit5
+  %148 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %145), !dbg !44
+  br label %__nv_rsqrtf.exit8, !dbg !44
+
+149:                                              ; preds = %__nv_rsqrtf.exit5
+  %150 = tail call float @llvm.nvvm.rsqrt.approx.f(float %145), !dbg !44
+  br label %__nv_rsqrtf.exit8, !dbg !44
+
+__nv_rsqrtf.exit8:                                ; preds = %147, %149
+  %.0.i7 = phi float [ %148, %147 ], [ %150, %149 ], !dbg !44
+  %151 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44
+  %.not.i9 = icmp eq i32 %151, 0, !dbg !44
+  br i1 %.not.i9, label %154, label %152, !dbg !44
+
+152:                                              ; preds = %__nv_rsqrtf.exit8
+  %153 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %145), !dbg !44
+  br label %__nv_rsqrtf.exit11, !dbg !44
+
+154:                                              ; preds = %__nv_rsqrtf.exit8
+  %155 = tail call float @llvm.nvvm.rsqrt.approx.f(float %145), !dbg !44
+  br label %__nv_rsqrtf.exit11, !dbg !44
+
+__nv_rsqrtf.exit11:                               ; preds = %152, %154
+  %.0.i10 = phi float [ %153, %152 ], [ %155, %154 ], !dbg !44
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45
+  store float %.0.i7, ptr addrspace(3) %140, align 4, !dbg !45
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45
+  %156 = load float, ptr addrspace(3) %142, align 4, !dbg !45
+  %157 = shl i32 %17, 7, !dbg !46
+  %158 = and i32 %82, 1
+  %.masked = and i32 %82, 30
+  %159 = and i32 %14, 15
+  %160 = shl nuw nsw i32 %159, 3
+  %161 = shl nuw nsw i32 %15, 2
+  %162 = lshr exact i32 %15, 1
+  %163 = lshr i32 %14, 2
+  %164 = and i32 %163, 4
+  %165 = or disjoint i32 %160, %161
+  %166 = xor i32 %165, %162
+  %167 = or disjoint i32 %166, %164
+  %168 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %167
+  %169 = xor i32 %167, 1028
+  %170 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %169
+  %171 = shl nuw nsw i32 %159, 7
+  %172 = shl nuw nsw i32 %79, 4
+  %173 = lshr exact i32 %81, 1
+  %174 = xor i32 %172, %173
+  %175 = or disjoint i32 %174, %171
+  %176 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %175
+  %177 = xor i32 %175, 4
+  %178 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %177
+  %179 = icmp eq i32 %158, 0
+  %180 = shl i32 %80, 7
+  %181 = shl i32 %83, 15
+  %182 = add i32 %181, %180
+  %183 = icmp ne i32 %158, 0
+  %184 = add i32 %182, 4097
+  %185 = add i32 %182, 4096
+  %186 = shl nuw nsw i32 %79, 7
+  %187 = lshr i32 %14, 1
+  %188 = and i32 %187, 12
+  %189 = and i32 %163, 48
+  %190 = lshr i32 %14, 4
+  %191 = and i32 %190, 2
+  %192 = or disjoint i32 %186, %191
+  %193 = or disjoint i32 %172, %188
+  %194 = xor i32 %193, %189
+  %195 = or disjoint i32 %194, %192
+  %196 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %195
+  %197 = xor i32 %195, 64
+  %198 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %197
+  %199 = shl nuw nsw i32 %14, 2
+  %200 = and i32 %199, 1008
+  %201 = shl nuw nsw i32 %14, 3
+  %202 = and i32 %201, 8
+  %203 = and i32 %14, 2
+  %204 = xor i32 %200, %162
+  %205 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %202
+  %206 = getelementptr inbounds nuw i8, ptr addrspace(3) %205, i32 %203
+  %207 = getelementptr inbounds nuw i8, ptr addrspace(3) %206, i32 %204
+  %208 = getelementptr inbounds nuw i8, ptr addrspace(3) %207, i32 4
+  %209 = zext nneg i32 %.masked to i64, !dbg !47
+  %210 = sext i32 %126 to i64, !dbg !47
+  %211 = sext i32 %157 to i64, !dbg !47
+  br label %212, !dbg !47
+
+212:                                              ; preds = %__nv_rsqrtf.exit11, %212
+  %213 = phi i1 [ true, %__nv_rsqrtf.exit11 ], [ false, %212 ]
+  %indvars.iv = phi i64 [ 0, %__nv_rsqrtf.exit11 ], [ 64, %212 ]
+  %214 = or disjoint i64 %indvars.iv, %25, !dbg !48
+  %215 = or disjoint i64 %indvars.iv, %209, !dbg !49
+  %216 = or disjoint i64 %215, 32, !dbg !49
+  %217 = trunc nuw nsw i64 %214 to i32, !dbg !50
+  %218 = or disjoint i32 %23, %217, !dbg !50
+  %219 = sext i32 %218 to i64, !dbg !51
+  %220 = getelementptr bfloat, ptr addrspace(1) %2, i64 %219, !dbg !51
+  %221 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !52
+  %222 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %220, i64 %221, i1 true) #6, !dbg !52
+  %223 = bitcast i32 %222 to <2 x bfloat>, !dbg !52
+  %224 = extractelement <2 x bfloat> %223, i64 0, !dbg !52
+  %225 = extractelement <2 x bfloat> %223, i64 1, !dbg !52
+  %226 = fpext bfloat %224 to float, !dbg !53
+  %227 = fpext bfloat %225 to float, !dbg !53
+  %228 = getelementptr bfloat, ptr addrspace(1) %3, i64 %214, !dbg !54
+  %229 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !55
+  %230 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %228, i64 %229, i1 true) #6, !dbg !55
+  %231 = bitcast i32 %230 to <2 x bfloat>, !dbg !55
+  %232 = extractelement <2 x bfloat> %231, i64 0, !dbg !55
+  %233 = extractelement <2 x bfloat> %231, i64 1, !dbg !55
+  %234 = fpext bfloat %232 to float, !dbg !56
+  %235 = fpext bfloat %233 to float, !dbg !56
+  %236 = or disjoint i64 %214, %210, !dbg !57
+  %237 = getelementptr float, ptr addrspace(1) %4, i64 %236, !dbg !58
+  %238 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !59
+  %239 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %237, i64 %238, i1 true) #6, !dbg !59
+  %240 = extractvalue { i32, i32 } %239, 0, !dbg !59
+  %241 = extractvalue { i32, i32 } %239, 1, !dbg !59
+  %242 = bitcast i32 %240 to float, !dbg !59
+  %243 = bitcast i32 %241 to float, !dbg !59
+  %244 = getelementptr float, ptr addrspace(1) %5, i64 %236, !dbg !60
+  %245 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !61
+  %246 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %244, i64 %245, i1 true) #6, !dbg !61
+  %247 = extractvalue { i32, i32 } %246, 0, !dbg !61
+  %248 = extractvalue { i32, i32 } %246, 1, !dbg !61
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61
+  %249 = insertelement <1 x i32> poison, i32 %247, i64 0, !dbg !61
+  store <1 x i32> %249, ptr addrspace(3) %168, align 4, !dbg !61
+  %250 = insertelement <1 x i32> poison, i32 %248, i64 0, !dbg !61
+  store <1 x i32> %250, ptr addrspace(3) %170, align 4, !dbg !61
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61
+  %251 = load float, ptr addrspace(3) %176, align 4, !dbg !61
+  %252 = load float, ptr addrspace(3) %178, align 4, !dbg !61
+  %253 = or disjoint i32 %24, %217, !dbg !62
+  %254 = sext i32 %253 to i64, !dbg !63
+  %255 = getelementptr bfloat, ptr addrspace(1) %2, i64 %254, !dbg !63
+  %256 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !64
+  %257 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %255, i64 %256, i1 true) #6, !dbg !64
+  %258 = bitcast i32 %257 to <2 x bfloat>, !dbg !64
+  %259 = extractelement <2 x bfloat> %258, i64 0, !dbg !64
+  %260 = extractelement <2 x bfloat> %258, i64 1, !dbg !64
+  %261 = fpext bfloat %259 to float, !dbg !65
+  %262 = fpext bfloat %260 to float, !dbg !65
+  %263 = getelementptr bfloat, ptr addrspace(1) %6, i64 %214, !dbg !66
+  %264 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !67
+  %265 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %263, i64 %264, i1 true) #6, !dbg !67
+  %266 = bitcast i32 %265 to <2 x bfloat>, !dbg !67
+  %267 = extractelement <2 x bfloat> %266, i64 0, !dbg !67
+  %268 = extractelement <2 x bfloat> %266, i64 1, !dbg !67
+  %269 = fpext bfloat %267 to float, !dbg !68
+  %270 = fpext bfloat %268 to float, !dbg !68
+  %271 = or disjoint i64 %215, 1, !dbg !69
+  %272 = or disjoint i64 %215, 33, !dbg !69
+  %273 = trunc nuw nsw i64 %271 to i32, !dbg !70
+  %274 = or disjoint i32 %182, %273, !dbg !70
+  %275 = trunc nuw nsw i64 %272 to i32, !dbg !70
+  %276 = or disjoint i32 %182, %275, !dbg !70
+  %277 = sext i32 %274 to i64, !dbg !71
+  %278 = getelementptr bfloat, ptr addrspace(1) %2, i64 %277, !dbg !71
+  %279 = sext i32 %276 to i64, !dbg !71
+  %280 = getelementptr bfloat, ptr addrspace(1) %2, i64 %279, !dbg !71
+  %281 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72
+  %282 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %278, i64 %281, i1 %179) #6, !dbg !72
+  %283 = bitcast i16 %282 to bfloat, !dbg !72
+  %284 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !72
+  %285 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %280, i64 %284, i1 %179) #6, !dbg !72
+  %286 = bitcast i16 %285 to bfloat, !dbg !72
+  %287 = fpext bfloat %283 to float, !dbg !73
+  %288 = fpext bfloat %286 to float, !dbg !73
+  %289 = fmul float %143, %287, !dbg !41
+  %290 = fmul float %143, %288, !dbg !41
+  %291 = getelementptr bfloat, ptr addrspace(1) %3, i64 %271, !dbg !74
+  %292 = getelementptr bfloat, ptr addrspace(1) %3, i64 %272, !dbg !74
+  %293 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75
+  %294 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %291, i64 %293, i1 %179) #6, !dbg !75
+  %295 = bitcast i16 %294 to bfloat, !dbg !75
+  %296 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !75
+  %297 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %292, i64 %296, i1 %179) #6, !dbg !75
+  %298 = bitcast i16 %297 to bfloat, !dbg !75
+  %299 = fpext bfloat %295 to float, !dbg !76
+  %300 = fpext bfloat %298 to float, !dbg !76
+  %301 = fmul float %289, %299, !dbg !77
+  %302 = fmul float %290, %300, !dbg !77
+  %303 = fsub float 0.000000e+00, %301, !dbg !78
+  %304 = fsub float 0.000000e+00, %302, !dbg !78
+  %305 = trunc nuw nsw i64 %215 to i32, !dbg !79
+  %306 = or disjoint i32 %182, %305, !dbg !79
+  %307 = trunc nuw nsw i64 %216 to i32, !dbg !79
+  %308 = or disjoint i32 %182, %307, !dbg !79
+  %309 = sext i32 %306 to i64, !dbg !80
+  %310 = getelementptr bfloat, ptr addrspace(1) %2, i64 %309, !dbg !80
+  %311 = sext i32 %308 to i64, !dbg !80
+  %312 = getelementptr bfloat, ptr addrspace(1) %2, i64 %311, !dbg !80
+  %313 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81
+  %314 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %310, i64 %313, i1 %183) #6, !dbg !81
+  %315 = bitcast i16 %314 to bfloat, !dbg !81
+  %316 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !81
+  %317 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %312, i64 %316, i1 %183) #6, !dbg !81
+  %318 = bitcast i16 %317 to bfloat, !dbg !81
+  %319 = fpext bfloat %315 to float, !dbg !82
+  %320 = fpext bfloat %318 to float, !dbg !82
+  %321 = fmul float %143, %319, !dbg !83
+  %322 = fmul float %143, %320, !dbg !83
+  %323 = getelementptr bfloat, ptr addrspace(1) %3, i64 %215, !dbg !84
+  %324 = getelementptr bfloat, ptr addrspace(1) %3, i64 %216, !dbg !84
+  %325 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85
+  %326 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %323, i64 %325, i1 %183) #6, !dbg !85
+  %327 = bitcast i16 %326 to bfloat, !dbg !85
+  %328 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !85
+  %329 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %324, i64 %328, i1 %183) #6, !dbg !85
+  %330 = bitcast i16 %329 to bfloat, !dbg !85
+  %331 = fpext bfloat %327 to float, !dbg !86
+  %332 = fpext bfloat %330 to float, !dbg !86
+  %333 = fmul float %321, %331, !dbg !87
+  %334 = fmul float %322, %332, !dbg !87
+  %335 = select i1 %179, float %303, float %333, !dbg !88
+  %336 = select i1 %179, float %304, float %334, !dbg !88
+  %337 = fmul float %.0.i4, %226, !dbg !89
+  %338 = fmul float %.0.i4, %227, !dbg !89
+  %339 = fmul float %337, %234, !dbg !90
+  %340 = fmul float %338, %235, !dbg !90
+  %341 = fmul float %339, %242, !dbg !91
+  %342 = fmul float %340, %243, !dbg !91
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !91
+  store float %341, ptr addrspace(3) %168, align 4, !dbg !91
+  store float %342, ptr addrspace(3) %170, align 4, !dbg !91
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !91
+  %343 = load float, ptr addrspace(3) %176, align 4, !dbg !91
+  %344 = load float, ptr addrspace(3) %178, align 4, !dbg !91
+  %345 = fmul float %251, %335, !dbg !92
+  %346 = fmul float %252, %336, !dbg !92
+  %347 = fadd float %345, %343, !dbg !93
+  %348 = fadd float %346, %344, !dbg !93
+  %349 = or disjoint i32 %184, %305, !dbg !94
+  %350 = or disjoint i32 %184, %307, !dbg !94
+  %351 = sext i32 %349 to i64, !dbg !95
+  %352 = getelementptr bfloat, ptr addrspace(1) %2, i64 %351, !dbg !95
+  %353 = sext i32 %350 to i64, !dbg !95
+  %354 = getelementptr bfloat, ptr addrspace(1) %2, i64 %353, !dbg !95
+  %355 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96
+  %356 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %352, i64 %355, i1 %179) #6, !dbg !96
+  %357 = bitcast i16 %356 to bfloat, !dbg !96
+  %358 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !96
+  %359 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %354, i64 %358, i1 %179) #6, !dbg !96
+  %360 = bitcast i16 %359 to bfloat, !dbg !96
+  %361 = fpext bfloat %357 to float, !dbg !97
+  %362 = fpext bfloat %360 to float, !dbg !97
+  %363 = fmul float %156, %361, !dbg !45
+  %364 = fmul float %156, %362, !dbg !45
+  %365 = getelementptr bfloat, ptr addrspace(1) %6, i64 %271, !dbg !98
+  %366 = getelementptr bfloat, ptr addrspace(1) %6, i64 %272, !dbg !98
+  %367 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99
+  %368 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %365, i64 %367, i1 %179) #6, !dbg !99
+  %369 = bitcast i16 %368 to bfloat, !dbg !99
+  %370 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !99
+  %371 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %366, i64 %370, i1 %179) #6, !dbg !99
+  %372 = bitcast i16 %371 to bfloat, !dbg !99
+  %373 = fpext bfloat %369 to float, !dbg !100
+  %374 = fpext bfloat %372 to float, !dbg !100
+  %375 = fmul float %363, %373, !dbg !101
+  %376 = fmul float %364, %374, !dbg !101
+  %377 = fsub float 0.000000e+00, %375, !dbg !102
+  %378 = fsub float 0.000000e+00, %376, !dbg !102
+  %379 = or disjoint i32 %185, %305, !dbg !103
+  %380 = or disjoint i32 %185, %307, !dbg !103
+  %381 = sext i32 %379 to i64, !dbg !104
+  %382 = getelementptr bfloat, ptr addrspace(1) %2, i64 %381, !dbg !104
+  %383 = sext i32 %380 to i64, !dbg !104
+  %384 = getelementptr bfloat, ptr addrspace(1) %2, i64 %383, !dbg !104
+  %385 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105
+  %386 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %382, i64 %385, i1 %183) #6, !dbg !105
+  %387 = bitcast i16 %386 to bfloat, !dbg !105
+  %388 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !105
+  %389 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %384, i64 %388, i1 %183) #6, !dbg !105
+  %390 = bitcast i16 %389 to bfloat, !dbg !105
+  %391 = fpext bfloat %387 to float, !dbg !106
+  %392 = fpext bfloat %390 to float, !dbg !106
+  %393 = fmul float %156, %391, !dbg !107
+  %394 = fmul float %156, %392, !dbg !107
+  %395 = getelementptr bfloat, ptr addrspace(1) %6, i64 %215, !dbg !108
+  %396 = getelementptr bfloat, ptr addrspace(1) %6, i64 %216, !dbg !108
+  %397 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109
+  %398 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %395, i64 %397, i1 %183) #6, !dbg !109
+  %399 = bitcast i16 %398 to bfloat, !dbg !109
+  %400 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !109
+  %401 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %396, i64 %400, i1 %183) #6, !dbg !109
+  %402 = bitcast i16 %401 to bfloat, !dbg !109
+  %403 = fpext bfloat %399 to float, !dbg !110
+  %404 = fpext bfloat %402 to float, !dbg !110
+  %405 = fmul float %393, %403, !dbg !111
+  %406 = fmul float %394, %404, !dbg !111
+  %407 = select i1 %179, float %377, float %405, !dbg !88
+  %408 = select i1 %179, float %378, float %406, !dbg !88
+  %409 = fmul float %.0.i10, %261, !dbg !112
+  %410 = fmul float %.0.i10, %262, !dbg !112
+  %411 = fmul float %409, %269, !dbg !113
+  %412 = fmul float %410, %270, !dbg !113
+  %413 = fmul float %411, %242, !dbg !114
+  %414 = fmul float %412, %243, !dbg !114
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !114
+  store float %413, ptr addrspace(3) %168, align 4, !dbg !114
+  store float %414, ptr addrspace(3) %170, align 4, !dbg !114
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !114
+  %415 = load float, ptr addrspace(3) %176, align 4, !dbg !114
+  %416 = load float, ptr addrspace(3) %178, align 4, !dbg !114
+  %417 = fmul float %251, %407, !dbg !115
+  %418 = fmul float %252, %408, !dbg !115
+  %419 = fadd float %417, %415, !dbg !116
+  %420 = fadd float %418, %416, !dbg !116
+  %421 = or disjoint i64 %214, %211, !dbg !117
+  %422 = getelementptr bfloat, ptr addrspace(1) %0, i64 %421, !dbg !118
+  %423 = fptrunc float %347 to bfloat, !dbg !119
+  %424 = fptrunc float %348 to bfloat, !dbg !119
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119
+  store bfloat %423, ptr addrspace(3) %196, align 2, !dbg !119
+  store bfloat %424, ptr addrspace(3) %198, align 2, !dbg !119
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119
+  %425 = load bfloat, ptr addrspace(3) %207, align 2, !dbg !119
+  %426 = load bfloat, ptr addrspace(3) %208, align 2, !dbg !119
+  %427 = insertelement <2 x bfloat> poison, bfloat %425, i64 0, !dbg !119
+  %428 = insertelement <2 x bfloat> %427, bfloat %426, i64 1, !dbg !119
+  %429 = bitcast <2 x bfloat> %428 to i32, !dbg !119
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %429, ptr addrspace(1) %422, i1 true) #6, !dbg !119
+  %430 = getelementptr bfloat, ptr addrspace(1) %1, i64 %421, !dbg !120
+  %431 = fptrunc float %419 to bfloat, !dbg !121
+  %432 = fptrunc float %420 to bfloat, !dbg !121
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !121
+  store bfloat %431, ptr addrspace(3) %196, align 2, !dbg !121
+  store bfloat %432, ptr addrspace(3) %198, align 2, !dbg !121
+  tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !121
+  %433 = load bfloat, ptr addrspace(3) %207, align 2, !dbg !121
+  %434 = load bfloat, ptr addrspace(3) %208, align 2, !dbg !121
+  %435 = insertelement <2 x bfloat> poison, bfloat %433, i64 0, !dbg !121
+  %436 = insertelement <2 x bfloat> %435, bfloat %434, i64 1, !dbg !121
+  %437 = bitcast <2 x bfloat> %436 to i32, !dbg !121
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %437, ptr addrspace(1) %430, i1 true) #6, !dbg !121
+  br i1 %213, label %212, label %438, !dbg !47
+
+438:                                              ; preds = %212
+  ret void, !dbg !122
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.div.full(float, float) #3
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #3
+
+attributes #0 = { nounwind "nvvm.reqntid"="256" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #4 = { convergent nocallback nounwind }
+attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py", directory: "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!5 = distinct !DISubprogram(name: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", linkageName: "triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 23, column: 28, scope: !5)
+!9 = !DILocation(line: 23, column: 33, scope: !5)
+!10 = !DILocation(line: 24, column: 44, scope: !5)
+!11 = !DILocation(line: 24, column: 23, scope: !5)
+!12 = !DILocation(line: 26, column: 37, scope: !5)
+!13 = !DILocation(line: 29, column: 19, scope: !5)
+!14 = !DILocation(line: 33, column: 43, scope: !5)
+!15 = !DILocation(line: 39, column: 57, scope: !5)
+!16 = !DILocation(line: 39, column: 34, scope: !5)
+!17 = !DILocation(line: 39, column: 68, scope: !5)
+!18 = !DILocation(line: 39, column: 121, scope: !5)
+!19 = !DILocation(line: 40, column: 50, scope: !5)
+!20 = !DILocation(line: 40, column: 34, scope: !5)
+!21 = !DILocation(line: 40, column: 61, scope: !5)
+!22 = !DILocation(line: 40, column: 114, scope: !5)
+!23 = !DILocation(line: 42, column: 22, scope: !5)
+!24 = !DILocation(line: 47, column: 22, scope: !5)
+!25 = !DILocation(line: 34, column: 31, scope: !5)
+!26 = !DILocation(line: 44, column: 23, scope: !5)
+!27 = !DILocation(line: 49, column: 25, scope: !5)
+!28 = !DILocation(line: 263, column: 15, scope: !29, inlinedAt: !31)
+!29 = distinct !DILexicalBlockFile(scope: !5, file: !30, discriminator: 0)
+!30 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.12/dist-packages/triton/language")
+!31 = !DILocation(line: 293, column: 36, scope: !29, inlinedAt: !32)
+!32 = !DILocation(line: 51, column: 25, scope: !33)
+!33 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0)
+!34 = !DILocation(line: 263, column: 15, scope: !29, inlinedAt: !35)
+!35 = !DILocation(line: 293, column: 36, scope: !29, inlinedAt: !36)
+!36 = !DILocation(line: 52, column: 27, scope: !33)
+!37 = !DILocation(line: 63, column: 46, scope: !5)
+!38 = !DILocation(line: 75, column: 25, scope: !5)
+!39 = !DILocation(line: 77, column: 24, scope: !5)
+!40 = !DILocation(line: 78, column: 32, scope: !5)
+!41 = !DILocation(line: 79, column: 24, scope: !5)
+!42 = !DILocation(line: 123, column: 24, scope: !5)
+!43 = !DILocation(line: 124, column: 24, scope: !5)
+!44 = !DILocation(line: 125, column: 32, scope: !5)
+!45 = !DILocation(line: 126, column: 24, scope: !5)
+!46 = !DILocation(line: 161, column: 43, scope: !5)
+!47 = !DILocation(line: 53, column: 43, scope: !5)
+!48 = !DILocation(line: 54, column: 31, scope: !5)
+!49 = !DILocation(line: 72, column: 41, scope: !5)
+!50 = !DILocation(line: 61, column: 51, scope: !5)
+!51 = !DILocation(line: 61, column: 35, scope: !5)
+!52 = !DILocation(line: 61, column: 62, scope: !5)
+!53 = !DILocation(line: 61, column: 115, scope: !5)
+!54 = !DILocation(line: 62, column: 35, scope: !5)
+!55 = !DILocation(line: 62, column: 42, scope: !5)
+!56 = !DILocation(line: 62, column: 95, scope: !5)
+!57 = !DILocation(line: 63, column: 42, scope: !5)
+!58 = !DILocation(line: 63, column: 35, scope: !5)
+!59 = !DILocation(line: 63, column: 51, scope: !5)
+!60 = !DILocation(line: 64, column: 35, scope: !5)
+!61 = !DILocation(line: 64, column: 51, scope: !5)
+!62 = !DILocation(line: 65, column: 58, scope: !5)
+!63 = !DILocation(line: 65, column: 35, scope: !5)
+!64 = !DILocation(line: 65, column: 69, scope: !5)
+!65 = !DILocation(line: 65, column: 123, scope: !5)
+!66 = !DILocation(line: 66, column: 36, scope: !5)
+!67 = !DILocation(line: 66, column: 43, scope: !5)
+!68 = !DILocation(line: 66, column: 96, scope: !5)
+!69 = !DILocation(line: 72, column: 39, scope: !5)
+!70 = !DILocation(line: 72, column: 57, scope: !5)
+!71 = !DILocation(line: 72, column: 35, scope: !5)
+!72 = !DILocation(line: 72, column: 68, scope: !5)
+!73 = !DILocation(line: 72, column: 129, scope: !5)
+!74 = !DILocation(line: 80, column: 35, scope: !5)
+!75 = !DILocation(line: 80, column: 85, scope: !5)
+!76 = !DILocation(line: 80, column: 146, scope: !5)
+!77 = !DILocation(line: 82, column: 24, scope: !5)
+!78 = !DILocation(line: 84, column: 17, scope: !5)
+!79 = !DILocation(line: 90, column: 53, scope: !5)
+!80 = !DILocation(line: 90, column: 35, scope: !5)
+!81 = !DILocation(line: 90, column: 64, scope: !5)
+!82 = !DILocation(line: 90, column: 125, scope: !5)
+!83 = !DILocation(line: 97, column: 24, scope: !5)
+!84 = !DILocation(line: 98, column: 35, scope: !5)
+!85 = !DILocation(line: 98, column: 81, scope: !5)
+!86 = !DILocation(line: 98, column: 142, scope: !5)
+!87 = !DILocation(line: 100, column: 24, scope: !5)
+!88 = !DILocation(line: 0, scope: !5)
+!89 = !DILocation(line: 111, column: 24, scope: !5)
+!90 = !DILocation(line: 113, column: 24, scope: !5)
+!91 = !DILocation(line: 116, column: 24, scope: !5)
+!92 = !DILocation(line: 118, column: 24, scope: !5)
+!93 = !DILocation(line: 119, column: 24, scope: !5)
+!94 = !DILocation(line: 121, column: 60, scope: !5)
+!95 = !DILocation(line: 121, column: 35, scope: !5)
+!96 = !DILocation(line: 121, column: 71, scope: !5)
+!97 = !DILocation(line: 121, column: 132, scope: !5)
+!98 = !DILocation(line: 127, column: 35, scope: !5)
+!99 = !DILocation(line: 127, column: 85, scope: !5)
+!100 = !DILocation(line: 127, column: 146, scope: !5)
+!101 = !DILocation(line: 129, column: 24, scope: !5)
+!102 = !DILocation(line: 131, column: 17, scope: !5)
+!103 = !DILocation(line: 134, column: 60, scope: !5)
+!104 = !DILocation(line: 134, column: 35, scope: !5)
+!105 = !DILocation(line: 134, column: 71, scope: !5)
+!106 = !DILocation(line: 134, column: 132, scope: !5)
+!107 = !DILocation(line: 139, column: 24, scope: !5)
+!108 = !DILocation(line: 140, column: 35, scope: !5)
+!109 = !DILocation(line: 140, column: 81, scope: !5)
+!110 = !DILocation(line: 140, column: 142, scope: !5)
+!111 = !DILocation(line: 142, column: 24, scope: !5)
+!112 = !DILocation(line: 151, column: 25, scope: !5)
+!113 = !DILocation(line: 153, column: 26, scope: !5)
+!114 = !DILocation(line: 156, column: 26, scope: !5)
+!115 = !DILocation(line: 158, column: 26, scope: !5)
+!116 = !DILocation(line: 159, column: 26, scope: !5)
+!117 = !DILocation(line: 161, column: 39, scope: !5)
+!118 = !DILocation(line: 161, column: 32, scope: !5)
+!119 = !DILocation(line: 161, column: 55, scope: !5)
+!120 = !DILocation(line: 162, column: 32, scope: !5)
+!121 = !DILocation(line: 162, column: 56, scope: !5)
+!122 = !DILocation(line: 53, column: 4, scope: !5)
diff --git a/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..79eaf8265f795a788a1d5127bcb7420c106ebde6
--- /dev/null
+++ b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ptx
@@ -0,0 +1,1190 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 9.1
+.target sm_89
+.address_size 64
+
+	// .globl	triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0 // -- Begin function triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0
+.extern .shared .align 16 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90};
+                                        // @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0
+.visible .entry triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6,
+	.param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_7,
+	.param .u32 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_8,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_9,
+	.param .u64 .ptr .global .align 1 triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_10
+)
+.reqntid 256
+{
+	.reg .pred 	%p<6>;
+	.reg .b16 	%rs<42>;
+	.reg .b32 	%r<219>;
+	.reg .b64 	%rd<96>;
+	.loc	1 18 0                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0
+$L__func_begin0:
+	.loc	1 18 0                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:18:0
+
+// %bb.0:                               // %__nv_rsqrtf.exit
+	ld.param.b64 	%rd11, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_6];
+	ld.param.b64 	%rd10, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_5];
+	ld.param.b64 	%rd9, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_4];
+	ld.param.b64 	%rd8, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_3];
+	ld.param.b64 	%rd7, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_2];
+	ld.param.b64 	%rd6, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_1];
+	ld.param.b64 	%rd5, [triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0_param_0];
+$L__tmp0:
+	.loc	1 23 28                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:28
+	mov.u32 	%r23, %ctaid.x;
+	.loc	1 23 33                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:23:33
+	shl.b32 	%r24, %r23, 3;
+	.loc	1 24 44                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44
+	mov.u32 	%r25, %tid.x;
+	and.b32 	%r26, %r25, 224;
+	bfe.u32 	%r27, %r25, 5, 3;
+	.loc	1 24 23                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23
+	or.b32 	%r28, %r27, %r24;
+	.loc	1 26 37                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37
+	shl.b32 	%r29, %r25, 1;
+	and.b32 	%r30, %r29, 62;
+	.loc	1 29 19                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19
+	bfe.s32 	%r31, %r23, 28, 1;
+	shr.u32 	%r32, %r31, 27;
+	add.s32 	%r33, %r28, %r32;
+	shr.s32 	%r34, %r33, 5;
+	shl.b32 	%r35, %r28, 7;
+	shl.b32 	%r36, %r34, 15;
+	add.s32 	%r1, %r36, %r35;
+	add.s32 	%r2, %r1, 4096;
+	.loc	1 33 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:33:43
+	cvt.u64.u32 	%rd1, %r30;
+	.loc	1 39 57                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:57
+	or.b32 	%r37, %r2, %r30;
+	.loc	1 39 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34
+	mad.wide.s32 	%rd12, %r37, 2, %rd7;
+	.loc	1 39 68                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68
+	// begin inline asm
+	mov.u64 %rd13, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd13, 1.0;
+	// end inline asm
+	mov.b32 	%r19, 0;
+	mov.pred 	%p2, -1;
+	// begin inline asm
+	mov.u32 %r18, %r19;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r18 }, [ %rd12 + 0 ], %rd13;
+	// end inline asm
+	mov.b32 	{%rs1, %rs2}, %r18;
+	.loc	1 39 121                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:121
+	cvt.f32.bf16 	%r38, %rs1;
+	cvt.f32.bf16 	%r39, %rs2;
+	.loc	1 40 50                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:50
+	or.b32 	%r40, %r1, %r30;
+	.loc	1 40 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34
+	mad.wide.s32 	%rd14, %r40, 2, %rd7;
+	.loc	1 40 61                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61
+	// begin inline asm
+	mov.u64 %rd15, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd15, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r20, %r19;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r20 }, [ %rd14 + 0 ], %rd15;
+	// end inline asm
+	mov.b32 	{%rs3, %rs4}, %r20;
+	.loc	1 40 114                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114
+	cvt.f32.bf16 	%r41, %rs3;
+	cvt.f32.bf16 	%r42, %rs4;
+	.loc	1 39 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:34
+	cvt.s64.s32 	%rd20, %r2;
+	or.b64 	%rd21, %rd20, %rd1;
+	shl.b64 	%rd22, %rd21, 1;
+	add.s64 	%rd23, %rd7, %rd22;
+	add.s64 	%rd16, %rd23, 128;
+	.loc	1 39 68                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:68
+	// begin inline asm
+	mov.u64 %rd17, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r21, %r19;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r21 }, [ %rd16 + 0 ], %rd17;
+	// end inline asm
+	mov.b32 	{%rs5, %rs6}, %r21;
+	.loc	1 39 121                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:39:121
+	cvt.f32.bf16 	%r43, %rs5;
+	cvt.f32.bf16 	%r44, %rs6;
+	.loc	1 40 34                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:34
+	cvt.s64.s32 	%rd24, %r1;
+	or.b64 	%rd25, %rd24, %rd1;
+	shl.b64 	%rd26, %rd25, 1;
+	add.s64 	%rd27, %rd7, %rd26;
+	add.s64 	%rd18, %rd27, 128;
+	.loc	1 40 61                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:61
+	// begin inline asm
+	mov.u64 %rd19, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd19, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r22, %r19;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r22 }, [ %rd18 + 0 ], %rd19;
+	// end inline asm
+	mov.b32 	{%rs7, %rs8}, %r22;
+	.loc	1 40 114                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:40:114
+	cvt.f32.bf16 	%r45, %rs7;
+	cvt.f32.bf16 	%r46, %rs8;
+	.loc	1 42 22                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:42:22
+	mul.f32 	%r47, %r43, %r43;
+	mul.f32 	%r48, %r44, %r44;
+	.loc	1 44 23                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:44:23
+	fma.rn.f32 	%r49, %r38, %r38, %r47;
+	fma.rn.f32 	%r50, %r39, %r39, %r48;
+	.loc	1 47 22                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:47:22
+	mul.f32 	%r51, %r45, %r45;
+	mul.f32 	%r52, %r46, %r46;
+	.loc	1 49 25                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:49:25
+	fma.rn.f32 	%r53, %r41, %r41, %r51;
+	fma.rn.f32 	%r54, %r42, %r42, %r52;
+	.loc	1 24 44                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:44
+	and.b32 	%r55, %r25, 7;
+	.loc	1 24 23                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:24:23
+	or.b32 	%r56, %r24, %r55;
+	.loc	1 26 37                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:26:37
+	and.b32 	%r57, %r25, 248;
+	bfe.u32 	%r58, %r25, 3, 5;
+	.loc	1 29 19                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:29:19
+	add.s32 	%r59, %r56, %r32;
+$L__tmp1:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r60, %r49, %r50;
+$L__tmp2:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r61, %r60, 16, 31, -1;
+$L__tmp3:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r62, %r60, %r61;
+$L__tmp4:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r63, %r62, 8, 31, -1;
+$L__tmp5:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r64, %r62, %r63;
+$L__tmp6:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r65, %r64, 4, 31, -1;
+$L__tmp7:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r66, %r64, %r65;
+$L__tmp8:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r67, %r66, 2, 31, -1;
+$L__tmp9:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r68, %r66, %r67;
+$L__tmp10:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ]
+	shfl.sync.bfly.b32 	%r69, %r68, 1, 31, -1;
+$L__tmp11:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:51:25 ] ]
+	add.f32 	%r70, %r68, %r69;
+$L__tmp12:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r71, %r53, %r54;
+$L__tmp13:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r72, %r71, 16, 31, -1;
+$L__tmp14:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r73, %r71, %r72;
+$L__tmp15:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r74, %r73, 8, 31, -1;
+$L__tmp16:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r75, %r73, %r74;
+$L__tmp17:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r76, %r75, 4, 31, -1;
+$L__tmp18:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r77, %r75, %r76;
+$L__tmp19:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r78, %r77, 2, 31, -1;
+$L__tmp20:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r79, %r77, %r78;
+$L__tmp21:
+	.loc	2 293 36                        // standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ]
+	shfl.sync.bfly.b32 	%r80, %r79, 1, 31, -1;
+$L__tmp22:
+	.loc	2 263 15                        // standard.py:263:15 @[ standard.py:293:36 @[ cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:52:27 ] ]
+	add.f32 	%r81, %r79, %r80;
+$L__tmp23:
+	.loc	1 63 46                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:46
+	shl.b32 	%r82, %r34, 7;
+	mov.b32 	%r83, 0f43000000;
+	.loc	1 75 25                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:75:25
+	div.full.f32 	%r84, %r81, %r83;
+	.loc	1 77 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:77:24
+	add.f32 	%r85, %r84, 0f358637BD;
+	.loc	1 78 32                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:78:32
+	rsqrt.approx.ftz.f32 	%r3, %r85;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	shr.u32 	%r86, %r26, 3;
+	mov.b32 	%r87, global_smem;
+	add.s32 	%r88, %r87, %r86;
+	st.shared.b32 	[%r88], %r3;
+	bar.sync 	0;
+	shl.b32 	%r89, %r55, 2;
+	add.s32 	%r90, %r87, %r89;
+	ld.shared.b32 	%r4, [%r90];
+	.loc	1 123 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:123:24
+	div.full.f32 	%r91, %r70, %r83;
+	.loc	1 124 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:124:24
+	add.f32 	%r92, %r91, 0f358637BD;
+	.loc	1 125 32                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:125:32
+	rsqrt.approx.ftz.f32 	%r5, %r92;
+	.loc	1 126 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24
+	bar.sync 	0;
+	st.shared.b32 	[%r88], %r5;
+	bar.sync 	0;
+	ld.shared.b32 	%r6, [%r90];
+	bfe.u32 	%r7, %r57, 3, 1;
+	and.b32 	%r93, %r58, 30;
+	and.b32 	%r94, %r25, 15;
+	shl.b32 	%r95, %r94, 3;
+	shl.b32 	%r96, %r26, 2;
+	shr.u32 	%r97, %r26, 1;
+	shr.u32 	%r98, %r25, 2;
+	and.b32 	%r99, %r98, 4;
+	or.b32 	%r100, %r95, %r96;
+	xor.b32 	%r101, %r100, %r97;
+	or.b32 	%r102, %r101, %r99;
+	add.s32 	%r8, %r87, %r102;
+	xor.b32 	%r103, %r102, 4;
+	add.s32 	%r9, %r87, %r103;
+	shl.b32 	%r104, %r94, 7;
+	shl.b32 	%r105, %r55, 4;
+	shr.u32 	%r106, %r57, 1;
+	xor.b32 	%r107, %r105, %r106;
+	or.b32 	%r108, %r107, %r104;
+	add.s32 	%r10, %r87, %r108;
+	xor.b32 	%r109, %r108, 4;
+	add.s32 	%r11, %r87, %r109;
+	shl.b32 	%r110, %r56, 7;
+	shl.b32 	%r111, %r59, 10;
+	and.b32 	%r112, %r111, -32768;
+	add.s32 	%r12, %r112, %r110;
+	add.s32 	%r13, %r12, 4097;
+	add.s32 	%r14, %r12, 4096;
+	shl.b32 	%r113, %r55, 7;
+	shr.u32 	%r114, %r25, 1;
+	and.b32 	%r115, %r114, 12;
+	and.b32 	%r116, %r98, 48;
+	shr.u32 	%r117, %r25, 4;
+	and.b32 	%r118, %r117, 2;
+	or.b32 	%r119, %r113, %r118;
+	or.b32 	%r120, %r105, %r115;
+	xor.b32 	%r121, %r120, %r116;
+	or.b32 	%r122, %r121, %r119;
+	add.s32 	%r15, %r87, %r122;
+	xor.b32 	%r123, %r122, 64;
+	add.s32 	%r16, %r87, %r123;
+	shl.b32 	%r124, %r25, 2;
+	and.b32 	%r125, %r124, 1008;
+	shl.b32 	%r126, %r25, 3;
+	and.b32 	%r127, %r126, 8;
+	and.b32 	%r128, %r25, 2;
+	xor.b32 	%r129, %r125, %r97;
+	add.s32 	%r130, %r87, %r127;
+	add.s32 	%r131, %r130, %r128;
+	add.s32 	%r17, %r131, %r129;
+	.loc	1 53 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43
+	cvt.u64.u32 	%rd2, %r93;
+	cvt.s64.s32 	%rd3, %r82;
+	cvt.s64.s32 	%rd4, %r35;
+	mov.b64 	%rd95, 0;
+	mov.pred 	%p5, %p2;
+$L__BB0_1:                              // =>This Inner Loop Header: Depth=1
+	.loc	1 0 43                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0:43
+	mov.pred 	%p1, %p5;
+	setp.ne.b32 	%p4, %r7, 0;
+	setp.eq.b32 	%p3, %r7, 0;
+	.loc	1 54 31                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:54:31
+	or.b64 	%rd74, %rd95, %rd1;
+	.loc	1 72 41                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:41
+	or.b64 	%rd75, %rd95, %rd2;
+	.loc	1 61 51                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:51
+	cvt.u32.u64 	%r142, %rd74;
+	or.b32 	%r143, %r1, %r142;
+	.loc	1 61 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:35
+	mad.wide.s32 	%rd29, %r143, 2, %rd7;
+	.loc	1 61 62                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:62
+	// begin inline asm
+	mov.u64 %rd28, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd28, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r132, %r19;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r132 }, [ %rd29 + 0 ], %rd28;
+	// end inline asm
+	mov.b32 	{%rs26, %rs27}, %r132;
+	.loc	1 61 115                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:61:115
+	cvt.f32.bf16 	%r144, %rs26;
+	cvt.f32.bf16 	%r145, %rs27;
+	.loc	1 62 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:35
+	shl.b64 	%rd76, %rd74, 1;
+	add.s64 	%rd31, %rd8, %rd76;
+	.loc	1 62 42                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:42
+	// begin inline asm
+	mov.u64 %rd30, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd30, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r133, %r19;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r133 }, [ %rd31 + 0 ], %rd30;
+	// end inline asm
+	mov.b32 	{%rs28, %rs29}, %r133;
+	.loc	1 62 95                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:62:95
+	cvt.f32.bf16 	%r146, %rs28;
+	cvt.f32.bf16 	%r147, %rs29;
+	.loc	1 63 42                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:42
+	or.b64 	%rd77, %rd74, %rd3;
+	.loc	1 63 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:35
+	shl.b64 	%rd78, %rd77, 2;
+	add.s64 	%rd33, %rd9, %rd78;
+	.loc	1 63 51                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:63:51
+	// begin inline asm
+	mov.u64 %rd32, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd32, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r134, %r19;
+	mov.u32 %r135, %r19;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r134, %r135 }, [ %rd33 + 0 ], %rd32;
+	// end inline asm
+	.loc	1 64 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:35
+	add.s64 	%rd35, %rd10, %rd78;
+	.loc	1 64 51                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:64:51
+	// begin inline asm
+	mov.u64 %rd34, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd34, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r136, %r19;
+	mov.u32 %r137, %r19;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r136, %r137 }, [ %rd35 + 0 ], %rd34;
+	// end inline asm
+	bar.sync 	0;
+	st.shared.b32 	[%r8], %r136;
+	st.shared.b32 	[%r9+1024], %r137;
+	bar.sync 	0;
+	ld.shared.b32 	%r148, [%r10];
+	ld.shared.b32 	%r149, [%r11];
+	.loc	1 65 58                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:58
+	or.b32 	%r150, %r2, %r142;
+	.loc	1 65 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:35
+	mad.wide.s32 	%rd37, %r150, 2, %rd7;
+	.loc	1 65 69                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:69
+	// begin inline asm
+	mov.u64 %rd36, 0x0;
+	createpolicy.fractional.L2::evict_first.b64 %rd36, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r138, %r19;
+	@%p2 ld.global.L1::evict_first.L2::cache_hint.b32 { %r138 }, [ %rd37 + 0 ], %rd36;
+	// end inline asm
+	mov.b32 	{%rs30, %rs31}, %r138;
+	.loc	1 65 123                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:65:123
+	cvt.f32.bf16 	%r151, %rs30;
+	cvt.f32.bf16 	%r152, %rs31;
+	.loc	1 66 36                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:36
+	add.s64 	%rd39, %rd11, %rd76;
+	.loc	1 66 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:43
+	// begin inline asm
+	mov.u64 %rd38, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd38, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u32 %r139, %r19;
+	@%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r139 }, [ %rd39 + 0 ], %rd38;
+	// end inline asm
+	mov.b32 	{%rs32, %rs33}, %r139;
+	.loc	1 66 96                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:66:96
+	cvt.f32.bf16 	%r153, %rs32;
+	cvt.f32.bf16 	%r154, %rs33;
+	.loc	1 72 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35
+	cvt.s64.s32 	%rd79, %r12;
+	.loc	1 72 57                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:57
+	cvt.u32.u64 	%r155, %rd75;
+	.loc	1 72 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:35
+	cvt.s64.s32 	%rd80, %rd75;
+	add.s64 	%rd81, %rd79, %rd80;
+	shl.b64 	%rd82, %rd81, 1;
+	add.s64 	%rd83, %rd7, %rd82;
+	add.s64 	%rd41, %rd83, 2;
+	add.s64 	%rd43, %rd83, 66;
+	.loc	1 72 68                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:68
+	// begin inline asm
+	mov.u64 %rd40, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd40, 1.0;
+	// end inline asm
+	mov.b16 	%rs10, 0;
+	// begin inline asm
+	mov.u16 %rs9, %rs10;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd41 + 0 ], %rd40;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd42, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd42, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs11, %rs10;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd43 + 0 ], %rd42;
+	// end inline asm
+	.loc	1 72 129                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:72:129
+	cvt.f32.bf16 	%r156, %rs9;
+	cvt.f32.bf16 	%r157, %rs11;
+	.loc	1 79 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:79:24
+	mul.f32 	%r158, %r4, %r156;
+	mul.f32 	%r159, %r4, %r157;
+	.loc	1 80 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:35
+	shl.b64 	%rd84, %rd75, 1;
+	add.s64 	%rd53, %rd8, %rd84;
+	add.s64 	%rd45, %rd53, 2;
+	add.s64 	%rd47, %rd53, 66;
+	.loc	1 80 85                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:85
+	// begin inline asm
+	mov.u64 %rd44, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd44, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs12, %rs10;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd45 + 0 ], %rd44;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd46, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd46, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs13, %rs10;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd47 + 0 ], %rd46;
+	// end inline asm
+	.loc	1 80 146                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:80:146
+	cvt.f32.bf16 	%r160, %rs12;
+	cvt.f32.bf16 	%r161, %rs13;
+	.loc	1 84 17                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:84:17
+	neg.f32 	%r162, %r158;
+	fma.rn.f32 	%r163, %r162, %r160, 0f00000000;
+	neg.f32 	%r164, %r159;
+	fma.rn.f32 	%r165, %r164, %r161, 0f00000000;
+	.loc	1 90 53                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:53
+	or.b32 	%r166, %r12, %r155;
+	.loc	1 90 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:35
+	mad.wide.s32 	%rd49, %r166, 2, %rd7;
+	add.s64 	%rd51, %rd83, 64;
+	.loc	1 90 64                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:64
+	// begin inline asm
+	mov.u64 %rd48, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd48, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs14, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs14 }, [ %rd49 + 0 ], %rd48;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd50, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd50, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs15, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd51 + 0 ], %rd50;
+	// end inline asm
+	.loc	1 90 125                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:90:125
+	cvt.f32.bf16 	%r167, %rs14;
+	cvt.f32.bf16 	%r168, %rs15;
+	.loc	1 97 24                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:97:24
+	mul.f32 	%r169, %r4, %r167;
+	mul.f32 	%r170, %r4, %r168;
+	.loc	1 98 35                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:35
+	add.s64 	%rd55, %rd53, 64;
+	.loc	1 98 81                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:81
+	// begin inline asm
+	mov.u64 %rd52, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd52, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs16, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs16 }, [ %rd53 + 0 ], %rd52;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd54, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd54, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs17, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd55 + 0 ], %rd54;
+	// end inline asm
+	.loc	1 98 142                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:98:142
+	cvt.f32.bf16 	%r171, %rs16;
+	cvt.f32.bf16 	%r172, %rs17;
+	.loc	1 100 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:100:24
+	mul.f32 	%r173, %r169, %r171;
+	mul.f32 	%r174, %r170, %r172;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r175, %r163, %r173, %p3;
+	selp.f32 	%r176, %r165, %r174, %p3;
+	.loc	1 111 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:111:24
+	mul.f32 	%r177, %r3, %r144;
+	mul.f32 	%r178, %r3, %r145;
+	.loc	1 113 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:113:24
+	mul.f32 	%r179, %r177, %r146;
+	mul.f32 	%r180, %r178, %r147;
+	.loc	1 116 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:116:24
+	mul.f32 	%r181, %r179, %r134;
+	mul.f32 	%r182, %r180, %r135;
+	bar.sync 	0;
+	st.shared.b32 	[%r8], %r181;
+	st.shared.b32 	[%r9+1024], %r182;
+	bar.sync 	0;
+	ld.shared.b32 	%r183, [%r10];
+	ld.shared.b32 	%r184, [%r11];
+	.loc	1 119 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:119:24
+	fma.rn.f32 	%r185, %r148, %r175, %r183;
+	fma.rn.f32 	%r186, %r149, %r176, %r184;
+	.loc	1 121 60                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:60
+	or.b32 	%r187, %r13, %r155;
+	.loc	1 121 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:35
+	mad.wide.s32 	%rd57, %r187, 2, %rd7;
+	cvt.s64.s32 	%rd85, %r13;
+	add.s64 	%rd86, %rd85, %rd80;
+	shl.b64 	%rd87, %rd86, 1;
+	add.s64 	%rd88, %rd7, %rd87;
+	add.s64 	%rd59, %rd88, 64;
+	.loc	1 121 71                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:71
+	// begin inline asm
+	mov.u64 %rd56, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd56, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs18, %rs10;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs18 }, [ %rd57 + 0 ], %rd56;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd58, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd58, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs19, %rs10;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd59 + 0 ], %rd58;
+	// end inline asm
+	.loc	1 121 132                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:121:132
+	cvt.f32.bf16 	%r188, %rs18;
+	cvt.f32.bf16 	%r189, %rs19;
+	.loc	1 126 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:126:24
+	mul.f32 	%r190, %r6, %r188;
+	mul.f32 	%r191, %r6, %r189;
+	.loc	1 127 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:35
+	add.s64 	%rd69, %rd11, %rd84;
+	add.s64 	%rd61, %rd69, 2;
+	add.s64 	%rd63, %rd69, 66;
+	.loc	1 127 85                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:85
+	// begin inline asm
+	mov.u64 %rd60, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd60, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs20, %rs10;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs20 }, [ %rd61 + 0 ], %rd60;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd62, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd62, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs21, %rs10;
+	@%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs21 }, [ %rd63 + 0 ], %rd62;
+	// end inline asm
+	.loc	1 127 146                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:127:146
+	cvt.f32.bf16 	%r192, %rs20;
+	cvt.f32.bf16 	%r193, %rs21;
+	.loc	1 131 17                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:131:17
+	neg.f32 	%r194, %r190;
+	fma.rn.f32 	%r195, %r194, %r192, 0f00000000;
+	neg.f32 	%r196, %r191;
+	fma.rn.f32 	%r197, %r196, %r193, 0f00000000;
+	.loc	1 134 60                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:60
+	or.b32 	%r198, %r14, %r155;
+	.loc	1 134 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:35
+	mad.wide.s32 	%rd65, %r198, 2, %rd7;
+	cvt.s64.s32 	%rd89, %r14;
+	add.s64 	%rd90, %rd89, %rd80;
+	shl.b64 	%rd91, %rd90, 1;
+	add.s64 	%rd92, %rd7, %rd91;
+	add.s64 	%rd67, %rd92, 64;
+	.loc	1 134 71                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:71
+	// begin inline asm
+	mov.u64 %rd64, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd64, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs22, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs22 }, [ %rd65 + 0 ], %rd64;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd66, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd66, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs23, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs23 }, [ %rd67 + 0 ], %rd66;
+	// end inline asm
+	.loc	1 134 132                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:134:132
+	cvt.f32.bf16 	%r199, %rs22;
+	cvt.f32.bf16 	%r200, %rs23;
+	.loc	1 139 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:139:24
+	mul.f32 	%r201, %r6, %r199;
+	mul.f32 	%r202, %r6, %r200;
+	.loc	1 140 35                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:35
+	add.s64 	%rd71, %rd69, 64;
+	.loc	1 140 81                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:81
+	// begin inline asm
+	mov.u64 %rd68, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd68, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs24, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs24 }, [ %rd69 + 0 ], %rd68;
+	// end inline asm
+	// begin inline asm
+	mov.u64 %rd70, 0x0;
+	createpolicy.fractional.L2::evict_last.b64 %rd70, 1.0;
+	// end inline asm
+	// begin inline asm
+	mov.u16 %rs25, %rs10;
+	@%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs25 }, [ %rd71 + 0 ], %rd70;
+	// end inline asm
+	.loc	1 140 142                       // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:140:142
+	cvt.f32.bf16 	%r203, %rs24;
+	cvt.f32.bf16 	%r204, %rs25;
+	.loc	1 142 24                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:142:24
+	mul.f32 	%r205, %r201, %r203;
+	mul.f32 	%r206, %r202, %r204;
+	.loc	1 0 0                           // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:0
+	selp.f32 	%r207, %r195, %r205, %p3;
+	selp.f32 	%r208, %r197, %r206, %p3;
+	.loc	1 151 25                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:151:25
+	mul.f32 	%r209, %r5, %r151;
+	mul.f32 	%r210, %r5, %r152;
+	.loc	1 153 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:153:26
+	mul.f32 	%r211, %r209, %r153;
+	mul.f32 	%r212, %r210, %r154;
+	.loc	1 156 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:156:26
+	mul.f32 	%r213, %r211, %r134;
+	mul.f32 	%r214, %r212, %r135;
+	bar.sync 	0;
+	st.shared.b32 	[%r8], %r213;
+	st.shared.b32 	[%r9+1024], %r214;
+	bar.sync 	0;
+	ld.shared.b32 	%r215, [%r10];
+	ld.shared.b32 	%r216, [%r11];
+	.loc	1 159 26                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:159:26
+	fma.rn.f32 	%r217, %r148, %r207, %r215;
+	fma.rn.f32 	%r218, %r149, %r208, %r216;
+	.loc	1 161 39                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:39
+	or.b64 	%rd93, %rd74, %rd4;
+	.loc	1 161 32                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:32
+	shl.b64 	%rd94, %rd93, 1;
+	add.s64 	%rd72, %rd5, %rd94;
+	.loc	1 161 55                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:161:55
+	cvt.rn.bf16.f32 	%rs34, %r185;
+	cvt.rn.bf16.f32 	%rs35, %r186;
+	bar.sync 	0;
+	st.shared.b16 	[%r15], %rs34;
+	st.shared.b16 	[%r16], %rs35;
+	bar.sync 	0;
+	ld.shared.b16 	%rs36, [%r17];
+	ld.shared.b16 	%rs37, [%r17+4];
+	mov.b32 	%r140, {%rs36, %rs37};
+	// begin inline asm
+	@%p2 st.global.b32 [ %rd72 + 0 ], { %r140 };
+	// end inline asm
+	.loc	1 162 32                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:32
+	add.s64 	%rd73, %rd6, %rd94;
+	.loc	1 162 56                        // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:162:56
+	cvt.rn.bf16.f32 	%rs38, %r217;
+	cvt.rn.bf16.f32 	%rs39, %r218;
+	bar.sync 	0;
+	st.shared.b16 	[%r15], %rs38;
+	st.shared.b16 	[%r16], %rs39;
+	bar.sync 	0;
+	ld.shared.b16 	%rs40, [%r17];
+	ld.shared.b16 	%rs41, [%r17+4];
+	mov.b32 	%r141, {%rs40, %rs41};
+	// begin inline asm
+	@%p2 st.global.b32 [ %rd73 + 0 ], { %r141 };
+	// end inline asm
+	mov.b64 	%rd95, 64;
+	mov.pred 	%p5, 0;
+	.loc	1 53 43                         // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:43
+	@%p1 bra 	$L__BB0_1;
+// %bb.2:
+	.loc	1 53 4                          // cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py:53:4
+	ret;
+$L__tmp24:
+$L__func_end0:
+                                        // -- End function
+}
+	.file	1 "/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py"
+	.file	2 "/usr/local/lib/python3.12/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1                                   // Abbreviation Code
+.b8 17                                  // DW_TAG_compile_unit
+.b8 1                                   // DW_CHILDREN_yes
+.b8 37                                  // DW_AT_producer
+.b8 8                                   // DW_FORM_string
+.b8 19                                  // DW_AT_language
+.b8 5                                   // DW_FORM_data2
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 16                                  // DW_AT_stmt_list
+.b8 6                                   // DW_FORM_data4
+.b8 27                                  // DW_AT_comp_dir
+.b8 8                                   // DW_FORM_string
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 2                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 0                                   // DW_CHILDREN_no
+.b8 3                                   // DW_AT_name
+.b8 8                                   // DW_FORM_string
+.b8 32                                  // DW_AT_inline
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 3                                   // Abbreviation Code
+.b8 46                                  // DW_TAG_subprogram
+.b8 1                                   // DW_CHILDREN_yes
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 4                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 1                                   // DW_CHILDREN_yes
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 11                                  // DW_FORM_data1
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 5                                   // Abbreviation Code
+.b8 29                                  // DW_TAG_inlined_subroutine
+.b8 0                                   // DW_CHILDREN_no
+.b8 49                                  // DW_AT_abstract_origin
+.b8 19                                  // DW_FORM_ref4
+.b8 17                                  // DW_AT_low_pc
+.b8 1                                   // DW_FORM_addr
+.b8 18                                  // DW_AT_high_pc
+.b8 1                                   // DW_FORM_addr
+.b8 88                                  // DW_AT_call_file
+.b8 11                                  // DW_FORM_data1
+.b8 89                                  // DW_AT_call_line
+.b8 5                                   // DW_FORM_data2
+.b8 87                                  // DW_AT_call_column
+.b8 11                                  // DW_FORM_data1
+.b8 0                                   // EOM(1)
+.b8 0                                   // EOM(2)
+.b8 0                                   // EOM(3)
+	}
+	.section	.debug_info
+	{
+.b32 456                                // Length of Unit
+.b8 2                                   // DWARF version number
+.b8 0
+.b32 .debug_abbrev                      // Offset Into Abbrev. Section
+.b8 8                                   // Address Size (in bytes)
+.b8 1                                   // Abbrev [1] 0xb:0x1c1 DW_TAG_compile_unit
+.b8 116                                 // DW_AT_producer
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2                                   // DW_AT_language
+.b8 0
+.b8 99                                  // DW_AT_name
+.b8 98
+.b8 118
+.b8 113
+.b8 104
+.b8 106
+.b8 116
+.b8 121
+.b8 103
+.b8 55
+.b8 102
+.b8 118
+.b8 120
+.b8 122
+.b8 119
+.b8 116
+.b8 98
+.b8 116
+.b8 116
+.b8 52
+.b8 118
+.b8 114
+.b8 100
+.b8 107
+.b8 98
+.b8 110
+.b8 98
+.b8 54
+.b8 110
+.b8 51
+.b8 50
+.b8 102
+.b8 110
+.b8 114
+.b8 105
+.b8 106
+.b8 106
+.b8 112
+.b8 108
+.b8 51
+.b8 118
+.b8 118
+.b8 52
+.b8 99
+.b8 102
+.b8 113
+.b8 100
+.b8 52
+.b8 109
+.b8 122
+.b8 110
+.b8 114
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line                        // DW_AT_stmt_list
+.b8 47                                  // DW_AT_comp_dir
+.b8 97
+.b8 112
+.b8 112
+.b8 47
+.b8 116
+.b8 101
+.b8 110
+.b8 115
+.b8 111
+.b8 114
+.b8 114
+.b8 116
+.b8 95
+.b8 108
+.b8 108
+.b8 109
+.b8 47
+.b8 118
+.b8 105
+.b8 115
+.b8 117
+.b8 97
+.b8 108
+.b8 95
+.b8 103
+.b8 101
+.b8 110
+.b8 47
+.b8 99
+.b8 111
+.b8 109
+.b8 112
+.b8 105
+.b8 108
+.b8 101
+.b8 100
+.b8 95
+.b8 99
+.b8 97
+.b8 99
+.b8 104
+.b8 101
+.b8 47
+.b8 102
+.b8 108
+.b8 117
+.b8 120
+.b8 50
+.b8 95
+.b8 107
+.b8 108
+.b8 101
+.b8 105
+.b8 110
+.b8 95
+.b8 57
+.b8 98
+.b8 95
+.b8 78
+.b8 86
+.b8 73
+.b8 68
+.b8 73
+.b8 65
+.b8 95
+.b8 71
+.b8 101
+.b8 70
+.b8 111
+.b8 114
+.b8 99
+.b8 101
+.b8 95
+.b8 82
+.b8 84
+.b8 88
+.b8 95
+.b8 52
+.b8 48
+.b8 57
+.b8 48
+.b8 95
+.b8 115
+.b8 109
+.b8 56
+.b8 57
+.b8 95
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 50
+.b8 46
+.b8 49
+.b8 48
+.b8 46
+.b8 48
+.b8 97
+.b8 48
+.b8 95
+.b8 98
+.b8 52
+.b8 101
+.b8 52
+.b8 101
+.b8 101
+.b8 56
+.b8 49
+.b8 100
+.b8 51
+.b8 46
+.b8 110
+.b8 118
+.b8 50
+.b8 53
+.b8 46
+.b8 49
+.b8 50
+.b8 95
+.b8 99
+.b8 117
+.b8 100
+.b8 97
+.b8 49
+.b8 51
+.b8 95
+.b8 49
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 47
+.b8 98
+.b8 118
+.b8 0
+.b8 2                                   // Abbrev [2] 0xe4:0x6d DW_TAG_subprogram
+.b8 116                                 // DW_AT_name
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 114
+.b8 101
+.b8 100
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 95
+.b8 102
+.b8 117
+.b8 115
+.b8 101
+.b8 100
+.b8 95
+.b8 114
+.b8 109
+.b8 115
+.b8 95
+.b8 110
+.b8 111
+.b8 114
+.b8 109
+.b8 95
+.b8 95
+.b8 116
+.b8 111
+.b8 95
+.b8 99
+.b8 111
+.b8 112
+.b8 121
+.b8 95
+.b8 97
+.b8 100
+.b8 100
+.b8 95
+.b8 109
+.b8 117
+.b8 108
+.b8 95
+.b8 110
+.b8 101
+.b8 103
+.b8 95
+.b8 115
+.b8 112
+.b8 108
+.b8 105
+.b8 116
+.b8 95
+.b8 115
+.b8 112
+.b8 108
+.b8 105
+.b8 116
+.b8 95
+.b8 119
+.b8 105
+.b8 116
+.b8 104
+.b8 95
+.b8 115
+.b8 105
+.b8 122
+.b8 101
+.b8 115
+.b8 95
+.b8 115
+.b8 116
+.b8 97
+.b8 99
+.b8 107
+.b8 95
+.b8 117
+.b8 110
+.b8 98
+.b8 105
+.b8 110
+.b8 100
+.b8 95
+.b8 117
+.b8 110
+.b8 115
+.b8 113
+.b8 117
+.b8 101
+.b8 101
+.b8 122
+.b8 101
+.b8 95
+.b8 118
+.b8 105
+.b8 101
+.b8 119
+.b8 95
+.b8 48
+.b8 0
+.b8 1                                   // DW_AT_inline
+.b8 3                                   // Abbrev [3] 0x151:0x7a DW_TAG_subprogram
+.b64 $L__func_begin0                    // DW_AT_low_pc
+.b64 $L__func_end0                      // DW_AT_high_pc
+.b32 228                                // DW_AT_abstract_origin
+.b8 4                                   // Abbrev [4] 0x166:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp12                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 51                                  // DW_AT_call_line
+.b8 25                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x17e:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp1                           // DW_AT_low_pc
+.b64 $L__tmp12                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 4                                   // Abbrev [4] 0x198:0x32 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp12                          // DW_AT_low_pc
+.b64 $L__tmp23                          // DW_AT_high_pc
+.b8 1                                   // DW_AT_call_file
+.b8 52                                  // DW_AT_call_line
+.b8 27                                  // DW_AT_call_column
+.b8 5                                   // Abbrev [5] 0x1b0:0x19 DW_TAG_inlined_subroutine
+.b32 228                                // DW_AT_abstract_origin
+.b64 $L__tmp12                          // DW_AT_low_pc
+.b64 $L__tmp23                          // DW_AT_high_pc
+.b8 2                                   // DW_AT_call_file
+.b8 37                                  // DW_AT_call_line
+.b8 1
+.b8 36                                  // DW_AT_call_column
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+.b8 0                                   // End Of Children Mark
+	}
+	.section	.debug_macinfo	{	}
diff --git a/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source
new file mode 100644
index 0000000000000000000000000000000000000000..50320303cc25b97f14e5ff354baaa2182359b780
--- /dev/null
+++ b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.source
@@ -0,0 +1,972 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc213 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":287:0)
+#loc215 = loc(unknown)
+#loc218 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":262:0)
+#loc222 = loc("in_out_ptr0"(#loc))
+#loc223 = loc("in_out_ptr1"(#loc))
+#loc224 = loc("in_ptr0"(#loc))
+#loc225 = loc("in_ptr1"(#loc))
+#loc226 = loc("in_ptr2"(#loc))
+#loc227 = loc("in_ptr3"(#loc))
+#loc228 = loc("in_ptr4"(#loc))
+#loc229 = loc("xnumel"(#loc))
+#loc230 = loc("r0_numel"(#loc))
+#loc432 = loc("input"(#loc213))
+#loc433 = loc("a"(#loc218))
+#loc434 = loc("b"(#loc218))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %xnumel_0 = arith.constant 73728 : i32 loc(#loc231)
+    %r0_numel_1 = arith.constant 128 : i32 loc(#loc232)
+    %xoffset = tt.get_program_id x : i32 loc(#loc233)
+    %xoffset_2 = arith.constant 8 : i32 loc(#loc234)
+    %xoffset_3 = arith.constant 8 : i32 loc(#loc234)
+    %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc234)
+    %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc235)
+    %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc236)
+    %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<8x1xi32> loc(#loc237)
+    %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<8x1xi32> loc(#loc237)
+    %xmask = arith.constant true loc(#loc238)
+    %xmask_8 = arith.constant dense<true> : tensor<8x64xi1> loc(#loc238)
+    %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc239)
+    %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc240)
+    %x0 = arith.constant 32 : i32 loc(#loc241)
+    %x0_10 = arith.constant 32 : i32 loc(#loc241)
+    %x0_11 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc241)
+    %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<8x1xi32> loc(#loc241)
+    %x1 = arith.constant 32 : i32 loc(#loc242)
+    %x1_13 = arith.constant 32 : i32 loc(#loc242)
+    %x1_14 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc242)
+    %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<8x1xi32> loc(#loc242)
+    %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc243)
+    %_tmp4_16 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc243)
+    %_tmp10 = arith.constant 0.000000e+00 : f32 loc(#loc244)
+    %_tmp10_17 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc244)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc15)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc15)
+    %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15)
+    %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15)
+    %2 = arith.bitcast %c64_i32 : i32 to i32 loc(#loc15)
+    %3 = ub.poison : i32 loc(#loc15)
+    %_tmp10_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_16, %_tmp10_24 = %_tmp10_17) -> (tensor<8x64xf32>, tensor<8x64xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc246)
+      %r0_index_25 = arith.addi %r0_index, %r0_base_9 : tensor<1x64xi32> loc(#loc246)
+      %r0_mask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc247)
+      %r0_mask_26 = arith.cmpi slt, %r0_index_25, %r0_mask : tensor<1x64xi32> loc(#loc247)
+      %tmp0 = arith.constant 4096 : i32 loc(#loc248)
+      %tmp0_27 = arith.constant 4096 : i32 loc(#loc248)
+      %tmp0_28 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc248)
+      %tmp0_29 = arith.addi %tmp0_28, %r0_index_25 : tensor<1x64xi32> loc(#loc248)
+      %tmp0_30 = arith.constant 128 : i32 loc(#loc249)
+      %tmp0_31 = arith.constant 128 : i32 loc(#loc249)
+      %tmp0_32 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc249)
+      %tmp0_33 = arith.muli %tmp0_32, %x0_12 : tensor<8x1xi32> loc(#loc249)
+      %tmp0_34 = tt.broadcast %tmp0_29 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc250)
+      %tmp0_35 = tt.broadcast %tmp0_33 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc250)
+      %tmp0_36 = arith.addi %tmp0_34, %tmp0_35 : tensor<8x64xi32> loc(#loc250)
+      %tmp0_37 = arith.constant 36864 : i32 loc(#loc251)
+      %tmp0_38 = arith.constant 36864 : i32 loc(#loc251)
+      %tmp0_39 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc251)
+      %tmp0_40 = arith.muli %tmp0_39, %x1_15 : tensor<8x1xi32> loc(#loc251)
+      %tmp0_41 = tt.broadcast %tmp0_40 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc252)
+      %tmp0_42 = arith.addi %tmp0_36, %tmp0_41 : tensor<8x64xi32> loc(#loc252)
+      %tmp0_43 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x64x!tt.ptr<bf16>> loc(#loc253)
+      %tmp0_44 = tt.addptr %tmp0_43, %tmp0_42 : tensor<8x64x!tt.ptr<bf16>>, tensor<8x64xi32> loc(#loc253)
+      %tmp0_45 = arith.constant 0.000000e+00 : f32 loc(#loc254)
+      %tmp0_46 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc254)
+      %tmp0_47 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc254)
+      %tmp0_48 = arith.truncf %tmp0_47 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc254)
+      %tmp0_49 = tt.load %tmp0_44, %tmp0_46, %tmp0_48 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>> loc(#loc254)
+      %tmp0_50 = arith.extf %tmp0_49 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc255)
+      %tmp6 = arith.constant 128 : i32 loc(#loc256)
+      %tmp6_51 = arith.constant 128 : i32 loc(#loc256)
+      %tmp6_52 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc256)
+      %tmp6_53 = arith.muli %tmp6_52, %x0_12 : tensor<8x1xi32> loc(#loc256)
+      %tmp6_54 = tt.broadcast %r0_index_25 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc257)
+      %tmp6_55 = tt.broadcast %tmp6_53 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc257)
+      %tmp6_56 = arith.addi %tmp6_54, %tmp6_55 : tensor<8x64xi32> loc(#loc257)
+      %tmp6_57 = arith.constant 36864 : i32 loc(#loc258)
+      %tmp6_58 = arith.constant 36864 : i32 loc(#loc258)
+      %tmp6_59 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc258)
+      %tmp6_60 = arith.muli %tmp6_59, %x1_15 : tensor<8x1xi32> loc(#loc258)
+      %tmp6_61 = tt.broadcast %tmp6_60 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc259)
+      %tmp6_62 = arith.addi %tmp6_56, %tmp6_61 : tensor<8x64xi32> loc(#loc259)
+      %tmp6_63 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x64x!tt.ptr<bf16>> loc(#loc260)
+      %tmp6_64 = tt.addptr %tmp6_63, %tmp6_62 : tensor<8x64x!tt.ptr<bf16>>, tensor<8x64xi32> loc(#loc260)
+      %tmp6_65 = arith.constant 0.000000e+00 : f32 loc(#loc261)
+      %tmp6_66 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc261)
+      %tmp6_67 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc261)
+      %tmp6_68 = arith.truncf %tmp6_67 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc261)
+      %tmp6_69 = tt.load %tmp6_64, %tmp6_66, %tmp6_68 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>> loc(#loc261)
+      %tmp6_70 = arith.extf %tmp6_69 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc262)
+      %tmp2 = arith.mulf %tmp0_50, %tmp0_50 : tensor<8x64xf32> loc(#loc263)
+      %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<8x64xf32> loc(#loc264)
+      %_tmp4_71 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc265)
+      %_tmp4_72 = arith.select %_tmp4_71, %tmp5, %_tmp4_23 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc265)
+      %tmp8 = arith.mulf %tmp6_70, %tmp6_70 : tensor<8x64xf32> loc(#loc266)
+      %tmp11 = arith.addf %_tmp10_24, %tmp8 : tensor<8x64xf32> loc(#loc267)
+      %_tmp10_73 = tt.broadcast %r0_mask_26 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc268)
+      %_tmp10_74 = arith.select %_tmp10_73, %tmp11, %_tmp10_24 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc268)
+      scf.yield %_tmp4_72, %_tmp10_74 : tensor<8x64xf32>, tensor<8x64xf32> loc(#loc39)
+    } loc(#loc435)
+    %tmp4 = tt.call @"triton.language.standard.sum__fp32S8_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#0) : (tensor<8x64xf32>) -> tensor<8xf32> loc(#loc269)
+    %tmp4_19 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc270)
+    %tmp10 = tt.call @"triton.language.standard.sum__fp32S8_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp10_18#1) : (tensor<8x64xf32>) -> tensor<8xf32> loc(#loc271)
+    %tmp10_20 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc272)
+    %c0_i32_21 = arith.constant 0 : i32 loc(#loc44)
+    %c64_i32_22 = arith.constant 64 : i32 loc(#loc44)
+    %4 = arith.bitcast %c0_i32_21 : i32 to i32 loc(#loc44)
+    %5 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc44)
+    %6 = arith.bitcast %c64_i32_22 : i32 to i32 loc(#loc44)
+    %7 = ub.poison : i32 loc(#loc44)
+    scf.for %r0_offset = %4 to %5 step %6  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc273)
+      %r0_index_23 = arith.addi %r0_index, %r0_base_9 : tensor<1x64xi32> loc(#loc273)
+      %r0_mask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc274)
+      %r0_mask_24 = arith.cmpi slt, %r0_index_23, %r0_mask : tensor<1x64xi32> loc(#loc274)
+      %r0_3 = arith.constant 2 : i32 loc(#loc275)
+      %r0_3_25 = arith.constant 2 : i32 loc(#loc275)
+      %r0_3_26 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc275)
+      %r0_3_27 = arith.remsi %r0_index_23, %r0_3_26 : tensor<1x64xi32> loc(#loc275)
+      %r0_4 = arith.constant 2 : i32 loc(#loc276)
+      %r0_4_28 = arith.constant 2 : i32 loc(#loc276)
+      %r0_4_29 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc276)
+      %r0_4_30 = arith.divsi %r0_index_23, %r0_4_29 : tensor<1x64xi32> loc(#loc276)
+      %tmp50 = arith.constant 128 : i32 loc(#loc277)
+      %tmp50_31 = arith.constant 128 : i32 loc(#loc277)
+      %tmp50_32 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc277)
+      %tmp50_33 = arith.muli %tmp50_32, %x0_12 : tensor<8x1xi32> loc(#loc277)
+      %tmp50_34 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc278)
+      %tmp50_35 = tt.broadcast %tmp50_33 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc278)
+      %tmp50_36 = arith.addi %tmp50_34, %tmp50_35 : tensor<8x64xi32> loc(#loc278)
+      %tmp50_37 = arith.constant 36864 : i32 loc(#loc279)
+      %tmp50_38 = arith.constant 36864 : i32 loc(#loc279)
+      %tmp50_39 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc279)
+      %tmp50_40 = arith.muli %tmp50_39, %x1_15 : tensor<8x1xi32> loc(#loc279)
+      %tmp50_41 = tt.broadcast %tmp50_40 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc280)
+      %tmp50_42 = arith.addi %tmp50_36, %tmp50_41 : tensor<8x64xi32> loc(#loc280)
+      %tmp50_43 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x64x!tt.ptr<bf16>> loc(#loc281)
+      %tmp50_44 = tt.addptr %tmp50_43, %tmp50_42 : tensor<8x64x!tt.ptr<bf16>>, tensor<8x64xi32> loc(#loc281)
+      %tmp50_45 = arith.constant 0.000000e+00 : f32 loc(#loc282)
+      %tmp50_46 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc282)
+      %tmp50_47 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc282)
+      %tmp50_48 = arith.truncf %tmp50_47 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc282)
+      %tmp50_49 = tt.load %tmp50_44, %tmp50_46, %tmp50_48 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>> loc(#loc282)
+      %tmp50_50 = arith.extf %tmp50_49 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc283)
+      %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>> loc(#loc284)
+      %tmp58_51 = tt.addptr %tmp58, %r0_index_23 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc284)
+      %tmp58_52 = arith.constant 0.000000e+00 : f32 loc(#loc285)
+      %tmp58_53 = arith.constant dense<0.000000e+00> : tensor<1x64xf32> loc(#loc285)
+      %tmp58_54 = arith.truncf %tmp58_53 : tensor<1x64xf32> to tensor<1x64xbf16> loc(#loc285)
+      %tmp58_55 = tt.load %tmp58_51, %r0_mask_24, %tmp58_54 evictionPolicy = evict_last : tensor<1x64x!tt.ptr<bf16>> loc(#loc285)
+      %tmp58_56 = arith.extf %tmp58_55 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc286)
+      %tmp63 = arith.constant 128 : i32 loc(#loc287)
+      %tmp63_57 = arith.constant 128 : i32 loc(#loc287)
+      %tmp63_58 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc287)
+      %tmp63_59 = arith.muli %tmp63_58, %x1_15 : tensor<8x1xi32> loc(#loc287)
+      %tmp63_60 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc288)
+      %tmp63_61 = tt.broadcast %tmp63_59 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc288)
+      %tmp63_62 = arith.addi %tmp63_60, %tmp63_61 : tensor<8x64xi32> loc(#loc288)
+      %tmp63_63 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<8x64x!tt.ptr<f32>> loc(#loc289)
+      %tmp63_64 = tt.addptr %tmp63_63, %tmp63_62 : tensor<8x64x!tt.ptr<f32>>, tensor<8x64xi32> loc(#loc289)
+      %tmp63_65 = arith.constant 0.000000e+00 : f32 loc(#loc290)
+      %tmp63_66 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc290)
+      %tmp63_67 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc290)
+      %tmp63_68 = tt.load %tmp63_64, %tmp63_66, %tmp63_67 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<f32>> loc(#loc290)
+      %tmp66 = arith.constant 128 : i32 loc(#loc291)
+      %tmp66_69 = arith.constant 128 : i32 loc(#loc291)
+      %tmp66_70 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc291)
+      %tmp66_71 = arith.muli %tmp66_70, %x1_15 : tensor<8x1xi32> loc(#loc291)
+      %tmp66_72 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc292)
+      %tmp66_73 = tt.broadcast %tmp66_71 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc292)
+      %tmp66_74 = arith.addi %tmp66_72, %tmp66_73 : tensor<8x64xi32> loc(#loc292)
+      %tmp66_75 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<8x64x!tt.ptr<f32>> loc(#loc293)
+      %tmp66_76 = tt.addptr %tmp66_75, %tmp66_74 : tensor<8x64x!tt.ptr<f32>>, tensor<8x64xi32> loc(#loc293)
+      %tmp66_77 = arith.constant 0.000000e+00 : f32 loc(#loc294)
+      %tmp66_78 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc294)
+      %tmp66_79 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc294)
+      %tmp66_80 = tt.load %tmp66_76, %tmp66_78, %tmp66_79 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<f32>> loc(#loc294)
+      %tmp96 = arith.constant 4096 : i32 loc(#loc295)
+      %tmp96_81 = arith.constant 4096 : i32 loc(#loc295)
+      %tmp96_82 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc295)
+      %tmp96_83 = arith.addi %tmp96_82, %r0_index_23 : tensor<1x64xi32> loc(#loc295)
+      %tmp96_84 = arith.constant 128 : i32 loc(#loc296)
+      %tmp96_85 = arith.constant 128 : i32 loc(#loc296)
+      %tmp96_86 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc296)
+      %tmp96_87 = arith.muli %tmp96_86, %x0_12 : tensor<8x1xi32> loc(#loc296)
+      %tmp96_88 = tt.broadcast %tmp96_83 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc297)
+      %tmp96_89 = tt.broadcast %tmp96_87 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc297)
+      %tmp96_90 = arith.addi %tmp96_88, %tmp96_89 : tensor<8x64xi32> loc(#loc297)
+      %tmp96_91 = arith.constant 36864 : i32 loc(#loc298)
+      %tmp96_92 = arith.constant 36864 : i32 loc(#loc298)
+      %tmp96_93 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc298)
+      %tmp96_94 = arith.muli %tmp96_93, %x1_15 : tensor<8x1xi32> loc(#loc298)
+      %tmp96_95 = tt.broadcast %tmp96_94 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc299)
+      %tmp96_96 = arith.addi %tmp96_90, %tmp96_95 : tensor<8x64xi32> loc(#loc299)
+      %tmp96_97 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x64x!tt.ptr<bf16>> loc(#loc300)
+      %tmp96_98 = tt.addptr %tmp96_97, %tmp96_96 : tensor<8x64x!tt.ptr<bf16>>, tensor<8x64xi32> loc(#loc300)
+      %tmp96_99 = arith.constant 0.000000e+00 : f32 loc(#loc301)
+      %tmp96_100 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc301)
+      %tmp96_101 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc301)
+      %tmp96_102 = arith.truncf %tmp96_101 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc301)
+      %tmp96_103 = tt.load %tmp96_98, %tmp96_100, %tmp96_102 evictionPolicy = evict_first : tensor<8x64x!tt.ptr<bf16>> loc(#loc301)
+      %tmp96_104 = arith.extf %tmp96_103 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc302)
+      %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>> loc(#loc303)
+      %tmp102_105 = tt.addptr %tmp102, %r0_index_23 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc303)
+      %tmp102_106 = arith.constant 0.000000e+00 : f32 loc(#loc304)
+      %tmp102_107 = arith.constant dense<0.000000e+00> : tensor<1x64xf32> loc(#loc304)
+      %tmp102_108 = arith.truncf %tmp102_107 : tensor<1x64xf32> to tensor<1x64xbf16> loc(#loc304)
+      %tmp102_109 = tt.load %tmp102_105, %r0_mask_24, %tmp102_108 evictionPolicy = evict_last : tensor<1x64x!tt.ptr<bf16>> loc(#loc304)
+      %tmp102_110 = arith.extf %tmp102_109 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc305)
+      %tmp13 = arith.constant 0 : i64 loc(#loc306)
+      %tmp13_111 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc306)
+      %tmp14 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc307)
+      %tmp14_112 = arith.constant dense<0> : tensor<1x64xi64> loc(#loc307)
+      %tmp14_113 = arith.cmpi sge, %tmp14, %tmp14_112 : tensor<1x64xi64> loc(#loc307)
+      %tmp15 = arith.constant 1 : i64 loc(#loc308)
+      %tmp15_114 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc308)
+      %tmp16 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc309)
+      %tmp16_115 = arith.constant dense<1> : tensor<1x64xi64> loc(#loc309)
+      %tmp16_116 = arith.cmpi slt, %tmp16, %tmp16_115 : tensor<1x64xi64> loc(#loc309)
+      %tmp17 = arith.constant 2 : i32 loc(#loc310)
+      %tmp17_117 = arith.constant 2 : i32 loc(#loc310)
+      %tmp17_118 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc310)
+      %tmp17_119 = arith.muli %tmp17_118, %r0_4_30 : tensor<1x64xi32> loc(#loc310)
+      %tmp17_120 = arith.constant 1 : i32 loc(#loc311)
+      %tmp17_121 = arith.constant 1 : i32 loc(#loc311)
+      %tmp17_122 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc311)
+      %tmp17_123 = arith.addi %tmp17_122, %tmp17_119 : tensor<1x64xi32> loc(#loc311)
+      %tmp17_124 = arith.constant 128 : i32 loc(#loc312)
+      %tmp17_125 = arith.constant 128 : i32 loc(#loc312)
+      %tmp17_126 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc312)
+      %tmp17_127 = arith.muli %tmp17_126, %x0_12 : tensor<8x1xi32> loc(#loc312)
+      %tmp17_128 = tt.broadcast %tmp17_123 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc313)
+      %tmp17_129 = tt.broadcast %tmp17_127 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc313)
+      %tmp17_130 = arith.addi %tmp17_128, %tmp17_129 : tensor<8x64xi32> loc(#loc313)
+      %tmp17_131 = arith.constant 36864 : i32 loc(#loc314)
+      %tmp17_132 = arith.constant 36864 : i32 loc(#loc314)
+      %tmp17_133 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc314)
+      %tmp17_134 = arith.muli %tmp17_133, %x1_15 : tensor<8x1xi32> loc(#loc314)
+      %tmp17_135 = tt.broadcast %tmp17_134 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc315)
+      %tmp17_136 = arith.addi %tmp17_130, %tmp17_135 : tensor<8x64xi32> loc(#loc315)
+      %tmp17_137 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x64x!tt.ptr<bf16>> loc(#loc316)
+      %tmp17_138 = tt.addptr %tmp17_137, %tmp17_136 : tensor<8x64x!tt.ptr<bf16>>, tensor<8x64xi32> loc(#loc316)
+      %tmp17_139 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc317)
+      %tmp17_140 = arith.constant 0.000000e+00 : f32 loc(#loc318)
+      %tmp17_141 = tt.broadcast %tmp17_139 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc318)
+      %tmp17_142 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc318)
+      %tmp17_143 = arith.truncf %tmp17_142 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc318)
+      %tmp17_144 = tt.load %tmp17_138, %tmp17_141, %tmp17_143 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>> loc(#loc318)
+      %tmp17_145 = arith.extf %tmp17_144 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc319)
+      %tmp19 = arith.constant 1.280000e+02 : f32 loc(#loc320)
+      %tmp20 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc321)
+      %tmp20_146 = arith.divf %tmp10_20, %tmp20 : tensor<8x1xf32> loc(#loc321)
+      %tmp21 = arith.constant 9.99999997E-7 : f32 loc(#loc322)
+      %tmp22 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc323)
+      %tmp22_147 = arith.addf %tmp20_146, %tmp22 : tensor<8x1xf32> loc(#loc323)
+      %tmp23 = tt.extern_elementwise %tmp22_147 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc324)
+      %tmp24 = tt.broadcast %tmp23 : tensor<8x1xf32> -> tensor<8x64xf32> loc(#loc325)
+      %tmp24_148 = arith.mulf %tmp17_145, %tmp24 : tensor<8x64xf32> loc(#loc325)
+      %tmp25 = arith.constant 2 : i32 loc(#loc326)
+      %tmp25_149 = arith.constant 2 : i32 loc(#loc326)
+      %tmp25_150 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc326)
+      %tmp25_151 = arith.muli %tmp25_150, %r0_4_30 : tensor<1x64xi32> loc(#loc326)
+      %tmp25_152 = arith.constant 1 : i32 loc(#loc327)
+      %tmp25_153 = arith.constant 1 : i32 loc(#loc327)
+      %tmp25_154 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc327)
+      %tmp25_155 = arith.addi %tmp25_154, %tmp25_151 : tensor<1x64xi32> loc(#loc327)
+      %tmp25_156 = tt.broadcast %tmp25_155 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc328)
+      %tmp25_157 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<8x64x!tt.ptr<bf16>> loc(#loc329)
+      %tmp25_158 = tt.addptr %tmp25_157, %tmp25_156 : tensor<8x64x!tt.ptr<bf16>>, tensor<8x64xi32> loc(#loc329)
+      %tmp25_159 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc330)
+      %tmp25_160 = arith.constant 0.000000e+00 : f32 loc(#loc331)
+      %tmp25_161 = tt.broadcast %tmp25_159 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc331)
+      %tmp25_162 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc331)
+      %tmp25_163 = arith.truncf %tmp25_162 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc331)
+      %tmp25_164 = tt.load %tmp25_158, %tmp25_161, %tmp25_163 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>> loc(#loc331)
+      %tmp25_165 = arith.extf %tmp25_164 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc332)
+      %tmp27 = arith.mulf %tmp24_148, %tmp25_165 : tensor<8x64xf32> loc(#loc333)
+      %tmp29 = arith.constant 0.000000e+00 : f32 loc(#loc334)
+      %tmp29_166 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc334)
+      %tmp29_167 = arith.subf %tmp29_166, %tmp27 : tensor<8x64xf32> loc(#loc334)
+      %tmp30 = arith.constant 0.000000e+00 : f32 loc(#loc335)
+      %tmp30_168 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc335)
+      %tmp31 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc336)
+      %tmp31_169 = arith.select %tmp31, %tmp29_167, %tmp30_168 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc336)
+      %tmp32 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc337)
+      %tmp32_170 = arith.constant dense<1> : tensor<1x64xi64> loc(#loc337)
+      %tmp32_171 = arith.cmpi sge, %tmp32, %tmp32_170 : tensor<1x64xi64> loc(#loc337)
+      %tmp33 = arith.constant 2 : i64 loc(#loc338)
+      %tmp33_172 = arith.constant dense<2> : tensor<1x1xi64> loc(#loc338)
+      %tmp34 = arith.extsi %r0_3_27 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc339)
+      %tmp34_173 = arith.constant dense<2> : tensor<1x64xi64> loc(#loc339)
+      %tmp34_174 = arith.cmpi slt, %tmp34, %tmp34_173 : tensor<1x64xi64> loc(#loc339)
+      %tmp35 = arith.constant 2 : i32 loc(#loc340)
+      %tmp35_175 = arith.constant 2 : i32 loc(#loc340)
+      %tmp35_176 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc340)
+      %tmp35_177 = arith.muli %tmp35_176, %r0_4_30 : tensor<1x64xi32> loc(#loc340)
+      %tmp35_178 = arith.constant 128 : i32 loc(#loc341)
+      %tmp35_179 = arith.constant 128 : i32 loc(#loc341)
+      %tmp35_180 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc341)
+      %tmp35_181 = arith.muli %tmp35_180, %x0_12 : tensor<8x1xi32> loc(#loc341)
+      %tmp35_182 = tt.broadcast %tmp35_177 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc342)
+      %tmp35_183 = tt.broadcast %tmp35_181 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc342)
+      %tmp35_184 = arith.addi %tmp35_182, %tmp35_183 : tensor<8x64xi32> loc(#loc342)
+      %tmp35_185 = arith.constant 36864 : i32 loc(#loc343)
+      %tmp35_186 = arith.constant 36864 : i32 loc(#loc343)
+      %tmp35_187 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc343)
+      %tmp35_188 = arith.muli %tmp35_187, %x1_15 : tensor<8x1xi32> loc(#loc343)
+      %tmp35_189 = tt.broadcast %tmp35_188 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc344)
+      %tmp35_190 = arith.addi %tmp35_184, %tmp35_189 : tensor<8x64xi32> loc(#loc344)
+      %tmp35_191 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x64x!tt.ptr<bf16>> loc(#loc345)
+      %tmp35_192 = tt.addptr %tmp35_191, %tmp35_190 : tensor<8x64x!tt.ptr<bf16>>, tensor<8x64xi32> loc(#loc345)
+      %tmp35_193 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc346)
+      %tmp35_194 = arith.constant 0.000000e+00 : f32 loc(#loc347)
+      %tmp35_195 = tt.broadcast %tmp35_193 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc347)
+      %tmp35_196 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc347)
+      %tmp35_197 = arith.truncf %tmp35_196 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc347)
+      %tmp35_198 = tt.load %tmp35_192, %tmp35_195, %tmp35_197 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>> loc(#loc347)
+      %tmp35_199 = arith.extf %tmp35_198 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc348)
+      %tmp37 = arith.constant 1.280000e+02 : f32 loc(#loc349)
+      %tmp38 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc350)
+      %tmp38_200 = arith.divf %tmp10_20, %tmp38 : tensor<8x1xf32> loc(#loc350)
+      %tmp39 = arith.constant 9.99999997E-7 : f32 loc(#loc351)
+      %tmp40 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc352)
+      %tmp40_201 = arith.addf %tmp38_200, %tmp40 : tensor<8x1xf32> loc(#loc352)
+      %tmp41 = tt.extern_elementwise %tmp40_201 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc353)
+      %tmp42 = tt.broadcast %tmp41 : tensor<8x1xf32> -> tensor<8x64xf32> loc(#loc354)
+      %tmp42_202 = arith.mulf %tmp35_199, %tmp42 : tensor<8x64xf32> loc(#loc354)
+      %tmp43 = arith.constant 2 : i32 loc(#loc355)
+      %tmp43_203 = arith.constant 2 : i32 loc(#loc355)
+      %tmp43_204 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc355)
+      %tmp43_205 = arith.muli %tmp43_204, %r0_4_30 : tensor<1x64xi32> loc(#loc355)
+      %tmp43_206 = tt.broadcast %tmp43_205 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc356)
+      %tmp43_207 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<8x64x!tt.ptr<bf16>> loc(#loc357)
+      %tmp43_208 = tt.addptr %tmp43_207, %tmp43_206 : tensor<8x64x!tt.ptr<bf16>>, tensor<8x64xi32> loc(#loc357)
+      %tmp43_209 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc358)
+      %tmp43_210 = arith.constant 0.000000e+00 : f32 loc(#loc359)
+      %tmp43_211 = tt.broadcast %tmp43_209 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc359)
+      %tmp43_212 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc359)
+      %tmp43_213 = arith.truncf %tmp43_212 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc359)
+      %tmp43_214 = tt.load %tmp43_208, %tmp43_211, %tmp43_213 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>> loc(#loc359)
+      %tmp43_215 = arith.extf %tmp43_214 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc360)
+      %tmp45 = arith.mulf %tmp42_202, %tmp43_215 : tensor<8x64xf32> loc(#loc361)
+      %tmp47 = arith.constant 0.000000e+00 : f32 loc(#loc362)
+      %tmp47_216 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc362)
+      %tmp48 = tt.broadcast %tmp32_171 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc363)
+      %tmp48_217 = arith.select %tmp48, %tmp45, %tmp47_216 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc363)
+      %tmp49 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc364)
+      %tmp49_218 = arith.select %tmp49, %tmp31_169, %tmp48_217 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc364)
+      %tmp52 = arith.constant 1.280000e+02 : f32 loc(#loc365)
+      %tmp53 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc366)
+      %tmp53_219 = arith.divf %tmp10_20, %tmp53 : tensor<8x1xf32> loc(#loc366)
+      %tmp54 = arith.constant 9.99999997E-7 : f32 loc(#loc367)
+      %tmp55 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc368)
+      %tmp55_220 = arith.addf %tmp53_219, %tmp55 : tensor<8x1xf32> loc(#loc368)
+      %tmp56 = tt.extern_elementwise %tmp55_220 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc369)
+      %tmp57 = tt.broadcast %tmp56 : tensor<8x1xf32> -> tensor<8x64xf32> loc(#loc370)
+      %tmp57_221 = arith.mulf %tmp50_50, %tmp57 : tensor<8x64xf32> loc(#loc370)
+      %tmp60 = tt.broadcast %tmp58_56 : tensor<1x64xf32> -> tensor<8x64xf32> loc(#loc371)
+      %tmp60_222 = arith.mulf %tmp57_221, %tmp60 : tensor<8x64xf32> loc(#loc371)
+      %tmp64 = arith.mulf %tmp60_222, %tmp63_68 : tensor<8x64xf32> loc(#loc372)
+      %tmp67 = arith.mulf %tmp49_218, %tmp66_80 : tensor<8x64xf32> loc(#loc373)
+      %tmp68 = arith.addf %tmp64, %tmp67 : tensor<8x64xf32> loc(#loc374)
+      %tmp70 = arith.constant 2 : i32 loc(#loc375)
+      %tmp70_223 = arith.constant 2 : i32 loc(#loc375)
+      %tmp70_224 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc375)
+      %tmp70_225 = arith.muli %tmp70_224, %r0_4_30 : tensor<1x64xi32> loc(#loc375)
+      %tmp70_226 = arith.constant 4097 : i32 loc(#loc376)
+      %tmp70_227 = arith.constant 4097 : i32 loc(#loc376)
+      %tmp70_228 = arith.constant dense<4097> : tensor<1x64xi32> loc(#loc376)
+      %tmp70_229 = arith.addi %tmp70_228, %tmp70_225 : tensor<1x64xi32> loc(#loc376)
+      %tmp70_230 = arith.constant 128 : i32 loc(#loc377)
+      %tmp70_231 = arith.constant 128 : i32 loc(#loc377)
+      %tmp70_232 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc377)
+      %tmp70_233 = arith.muli %tmp70_232, %x0_12 : tensor<8x1xi32> loc(#loc377)
+      %tmp70_234 = tt.broadcast %tmp70_229 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc378)
+      %tmp70_235 = tt.broadcast %tmp70_233 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc378)
+      %tmp70_236 = arith.addi %tmp70_234, %tmp70_235 : tensor<8x64xi32> loc(#loc378)
+      %tmp70_237 = arith.constant 36864 : i32 loc(#loc379)
+      %tmp70_238 = arith.constant 36864 : i32 loc(#loc379)
+      %tmp70_239 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc379)
+      %tmp70_240 = arith.muli %tmp70_239, %x1_15 : tensor<8x1xi32> loc(#loc379)
+      %tmp70_241 = tt.broadcast %tmp70_240 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc380)
+      %tmp70_242 = arith.addi %tmp70_236, %tmp70_241 : tensor<8x64xi32> loc(#loc380)
+      %tmp70_243 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x64x!tt.ptr<bf16>> loc(#loc381)
+      %tmp70_244 = tt.addptr %tmp70_243, %tmp70_242 : tensor<8x64x!tt.ptr<bf16>>, tensor<8x64xi32> loc(#loc381)
+      %tmp70_245 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc382)
+      %tmp70_246 = arith.constant 0.000000e+00 : f32 loc(#loc383)
+      %tmp70_247 = tt.broadcast %tmp70_245 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc383)
+      %tmp70_248 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc383)
+      %tmp70_249 = arith.truncf %tmp70_248 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc383)
+      %tmp70_250 = tt.load %tmp70_244, %tmp70_247, %tmp70_249 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>> loc(#loc383)
+      %tmp70_251 = arith.extf %tmp70_250 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc384)
+      %tmp72 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc385)
+      %tmp72_252 = arith.divf %tmp4_19, %tmp72 : tensor<8x1xf32> loc(#loc385)
+      %tmp73 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc386)
+      %tmp73_253 = arith.addf %tmp72_252, %tmp73 : tensor<8x1xf32> loc(#loc386)
+      %tmp74 = tt.extern_elementwise %tmp73_253 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc387)
+      %tmp75 = tt.broadcast %tmp74 : tensor<8x1xf32> -> tensor<8x64xf32> loc(#loc388)
+      %tmp75_254 = arith.mulf %tmp70_251, %tmp75 : tensor<8x64xf32> loc(#loc388)
+      %tmp76 = arith.constant 2 : i32 loc(#loc389)
+      %tmp76_255 = arith.constant 2 : i32 loc(#loc389)
+      %tmp76_256 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc389)
+      %tmp76_257 = arith.muli %tmp76_256, %r0_4_30 : tensor<1x64xi32> loc(#loc389)
+      %tmp76_258 = arith.constant 1 : i32 loc(#loc390)
+      %tmp76_259 = arith.constant 1 : i32 loc(#loc390)
+      %tmp76_260 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc390)
+      %tmp76_261 = arith.addi %tmp76_260, %tmp76_257 : tensor<1x64xi32> loc(#loc390)
+      %tmp76_262 = tt.broadcast %tmp76_261 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc391)
+      %tmp76_263 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<8x64x!tt.ptr<bf16>> loc(#loc392)
+      %tmp76_264 = tt.addptr %tmp76_263, %tmp76_262 : tensor<8x64x!tt.ptr<bf16>>, tensor<8x64xi32> loc(#loc392)
+      %tmp76_265 = arith.andi %r0_mask_24, %tmp16_116 : tensor<1x64xi1> loc(#loc393)
+      %tmp76_266 = arith.constant 0.000000e+00 : f32 loc(#loc394)
+      %tmp76_267 = tt.broadcast %tmp76_265 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc394)
+      %tmp76_268 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc394)
+      %tmp76_269 = arith.truncf %tmp76_268 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc394)
+      %tmp76_270 = tt.load %tmp76_264, %tmp76_267, %tmp76_269 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>> loc(#loc394)
+      %tmp76_271 = arith.extf %tmp76_270 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc395)
+      %tmp78 = arith.mulf %tmp75_254, %tmp76_271 : tensor<8x64xf32> loc(#loc396)
+      %tmp80 = arith.constant 0.000000e+00 : f32 loc(#loc397)
+      %tmp80_272 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc397)
+      %tmp80_273 = arith.subf %tmp80_272, %tmp78 : tensor<8x64xf32> loc(#loc397)
+      %tmp81 = arith.constant 0.000000e+00 : f32 loc(#loc398)
+      %tmp81_274 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc398)
+      %tmp82 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc399)
+      %tmp82_275 = arith.select %tmp82, %tmp80_273, %tmp81_274 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc399)
+      %tmp83 = arith.constant 2 : i32 loc(#loc400)
+      %tmp83_276 = arith.constant 2 : i32 loc(#loc400)
+      %tmp83_277 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc400)
+      %tmp83_278 = arith.muli %tmp83_277, %r0_4_30 : tensor<1x64xi32> loc(#loc400)
+      %tmp83_279 = arith.constant 4096 : i32 loc(#loc401)
+      %tmp83_280 = arith.constant 4096 : i32 loc(#loc401)
+      %tmp83_281 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc401)
+      %tmp83_282 = arith.addi %tmp83_281, %tmp83_278 : tensor<1x64xi32> loc(#loc401)
+      %tmp83_283 = arith.constant 128 : i32 loc(#loc402)
+      %tmp83_284 = arith.constant 128 : i32 loc(#loc402)
+      %tmp83_285 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc402)
+      %tmp83_286 = arith.muli %tmp83_285, %x0_12 : tensor<8x1xi32> loc(#loc402)
+      %tmp83_287 = tt.broadcast %tmp83_282 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc403)
+      %tmp83_288 = tt.broadcast %tmp83_286 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc403)
+      %tmp83_289 = arith.addi %tmp83_287, %tmp83_288 : tensor<8x64xi32> loc(#loc403)
+      %tmp83_290 = arith.constant 36864 : i32 loc(#loc404)
+      %tmp83_291 = arith.constant 36864 : i32 loc(#loc404)
+      %tmp83_292 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc404)
+      %tmp83_293 = arith.muli %tmp83_292, %x1_15 : tensor<8x1xi32> loc(#loc404)
+      %tmp83_294 = tt.broadcast %tmp83_293 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc405)
+      %tmp83_295 = arith.addi %tmp83_289, %tmp83_294 : tensor<8x64xi32> loc(#loc405)
+      %tmp83_296 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x64x!tt.ptr<bf16>> loc(#loc406)
+      %tmp83_297 = tt.addptr %tmp83_296, %tmp83_295 : tensor<8x64x!tt.ptr<bf16>>, tensor<8x64xi32> loc(#loc406)
+      %tmp83_298 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc407)
+      %tmp83_299 = arith.constant 0.000000e+00 : f32 loc(#loc408)
+      %tmp83_300 = tt.broadcast %tmp83_298 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc408)
+      %tmp83_301 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc408)
+      %tmp83_302 = arith.truncf %tmp83_301 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc408)
+      %tmp83_303 = tt.load %tmp83_297, %tmp83_300, %tmp83_302 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>> loc(#loc408)
+      %tmp83_304 = arith.extf %tmp83_303 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc409)
+      %tmp85 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc410)
+      %tmp85_305 = arith.divf %tmp4_19, %tmp85 : tensor<8x1xf32> loc(#loc410)
+      %tmp86 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc411)
+      %tmp86_306 = arith.addf %tmp85_305, %tmp86 : tensor<8x1xf32> loc(#loc411)
+      %tmp87 = tt.extern_elementwise %tmp86_306 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc412)
+      %tmp88 = tt.broadcast %tmp87 : tensor<8x1xf32> -> tensor<8x64xf32> loc(#loc413)
+      %tmp88_307 = arith.mulf %tmp83_304, %tmp88 : tensor<8x64xf32> loc(#loc413)
+      %tmp89 = arith.constant 2 : i32 loc(#loc414)
+      %tmp89_308 = arith.constant 2 : i32 loc(#loc414)
+      %tmp89_309 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc414)
+      %tmp89_310 = arith.muli %tmp89_309, %r0_4_30 : tensor<1x64xi32> loc(#loc414)
+      %tmp89_311 = tt.broadcast %tmp89_310 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc415)
+      %tmp89_312 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<8x64x!tt.ptr<bf16>> loc(#loc416)
+      %tmp89_313 = tt.addptr %tmp89_312, %tmp89_311 : tensor<8x64x!tt.ptr<bf16>>, tensor<8x64xi32> loc(#loc416)
+      %tmp89_314 = arith.andi %r0_mask_24, %tmp32_171 : tensor<1x64xi1> loc(#loc417)
+      %tmp89_315 = arith.constant 0.000000e+00 : f32 loc(#loc418)
+      %tmp89_316 = tt.broadcast %tmp89_314 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc418)
+      %tmp89_317 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc418)
+      %tmp89_318 = arith.truncf %tmp89_317 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc418)
+      %tmp89_319 = tt.load %tmp89_313, %tmp89_316, %tmp89_318 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>> loc(#loc418)
+      %tmp89_320 = arith.extf %tmp89_319 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc419)
+      %tmp91 = arith.mulf %tmp88_307, %tmp89_320 : tensor<8x64xf32> loc(#loc420)
+      %tmp93 = arith.constant 0.000000e+00 : f32 loc(#loc421)
+      %tmp93_321 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc421)
+      %tmp94 = tt.broadcast %tmp32_171 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc422)
+      %tmp94_322 = arith.select %tmp94, %tmp91, %tmp93_321 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc422)
+      %tmp95 = tt.broadcast %tmp16_116 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc423)
+      %tmp95_323 = arith.select %tmp95, %tmp82_275, %tmp94_322 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc423)
+      %tmp98 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc424)
+      %tmp98_324 = arith.divf %tmp4_19, %tmp98 : tensor<8x1xf32> loc(#loc424)
+      %tmp99 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc425)
+      %tmp99_325 = arith.addf %tmp98_324, %tmp99 : tensor<8x1xf32> loc(#loc425)
+      %tmp100 = tt.extern_elementwise %tmp99_325 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc426)
+      %tmp101 = tt.broadcast %tmp100 : tensor<8x1xf32> -> tensor<8x64xf32> loc(#loc427)
+      %tmp101_326 = arith.mulf %tmp96_104, %tmp101 : tensor<8x64xf32> loc(#loc427)
+      %tmp104 = tt.broadcast %tmp102_110 : tensor<1x64xf32> -> tensor<8x64xf32> loc(#loc428)
+      %tmp104_327 = arith.mulf %tmp101_326, %tmp104 : tensor<8x64xf32> loc(#loc428)
+      %tmp107 = arith.mulf %tmp104_327, %tmp63_68 : tensor<8x64xf32> loc(#loc429)
+      %tmp109 = arith.mulf %tmp95_323, %tmp66_80 : tensor<8x64xf32> loc(#loc430)
+      %tmp110 = arith.addf %tmp107, %tmp109 : tensor<8x64xf32> loc(#loc431)
+      %c128_i32 = arith.constant 128 : i32 loc(#loc204)
+      %c128_i32_328 = arith.constant 128 : i32 loc(#loc204)
+      %cst = arith.constant dense<128> : tensor<8x1xi32> loc(#loc204)
+      %8 = arith.muli %cst, %xindex_7 : tensor<8x1xi32> loc(#loc204)
+      %9 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc205)
+      %10 = tt.broadcast %8 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc205)
+      %11 = arith.addi %9, %10 : tensor<8x64xi32> loc(#loc205)
+      %12 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<8x64x!tt.ptr<bf16>> loc(#loc206)
+      %13 = tt.addptr %12, %11 : tensor<8x64x!tt.ptr<bf16>>, tensor<8x64xi32> loc(#loc206)
+      %14 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc207)
+      %15 = arith.truncf %tmp68 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc207)
+      tt.store %13, %15, %14 : tensor<8x64x!tt.ptr<bf16>> loc(#loc207)
+      %c128_i32_329 = arith.constant 128 : i32 loc(#loc208)
+      %c128_i32_330 = arith.constant 128 : i32 loc(#loc208)
+      %cst_331 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc208)
+      %16 = arith.muli %cst_331, %xindex_7 : tensor<8x1xi32> loc(#loc208)
+      %17 = tt.broadcast %r0_index_23 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc209)
+      %18 = tt.broadcast %16 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc209)
+      %19 = arith.addi %17, %18 : tensor<8x64xi32> loc(#loc209)
+      %20 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<8x64x!tt.ptr<bf16>> loc(#loc210)
+      %21 = tt.addptr %20, %19 : tensor<8x64x!tt.ptr<bf16>>, tensor<8x64xi32> loc(#loc210)
+      %22 = tt.broadcast %r0_mask_24 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc211)
+      %23 = arith.truncf %tmp110 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc211)
+      tt.store %21, %23, %22 : tensor<8x64x!tt.ptr<bf16>> loc(#loc211)
+    } loc(#loc44)
+    tt.return loc(#loc212)
+  } loc(#loc)
+  tt.func private @"triton.language.standard.sum__fp32S8_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x64xf32> loc("input"(#loc213))) -> tensor<8xf32> attributes {noinline = false} {
+    %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc214)
+      tt.reduce.return %2 : f32 loc(#loc214)
+    }) : (tensor<8x64xf32>) -> tensor<8xf32> loc(#loc214)
+    tt.return %0 : tensor<8xf32> loc(#loc216)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : tensor<8xf32> loc(#loc217)
+    tt.return %1 : tensor<8xf32> loc(#loc217)
+  } loc(#loc213)
+  tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc218)), %b: f32 loc("b"(#loc218))) -> f32 attributes {noinline = false} {
+    %0 = arith.addf %a, %b : f32 loc(#loc219)
+    tt.return %0 : f32 loc(#loc220)
+  ^bb1:  // no predecessors
+    %1 = ub.poison : f32 loc(#loc221)
+    tt.return %1 : f32 loc(#loc221)
+  } loc(#loc218)
+} loc(#loc)
+#loc1 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":19:13)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":20:15)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":25:46)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":30:43)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":32:44)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:45)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:56)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc34 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:46)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:42)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:53)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:64)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":68:35)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":69:25)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":70:35)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:52)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:63)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":74:16)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":76:16)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:57)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:55)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:63)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:95)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":85:42)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":88:35)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":89:24)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:37)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:48)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:59)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":92:16)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":93:25)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":94:16)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":95:24)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":96:32)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:53)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:59)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:91)
+#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":102:42)
+#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":106:16)
+#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":107:25)
+#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":108:16)
+#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":109:24)
+#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":110:32)
+#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:44)
+#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc149 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:55)
+#loc150 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc151 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:66)
+#loc152 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc153 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc154 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:81)
+#loc155 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc156 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc157 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc158 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc159 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc160 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc161 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:57)
+#loc162 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:55)
+#loc163 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:63)
+#loc164 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc165 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:95)
+#loc166 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc167 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc168 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc169 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc170 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":132:42)
+#loc171 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc172 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:44)
+#loc173 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc174 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:55)
+#loc175 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc176 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:66)
+#loc177 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc178 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc179 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:81)
+#loc180 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc181 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc182 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":136:24)
+#loc183 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":137:24)
+#loc184 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":138:32)
+#loc185 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc186 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:53)
+#loc187 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:59)
+#loc188 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc189 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:91)
+#loc190 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc191 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc192 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc193 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":144:42)
+#loc194 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc195 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc196 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":148:24)
+#loc197 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":149:24)
+#loc198 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":150:33)
+#loc199 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc200 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc201 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc202 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc203 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc204 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc205 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc206 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc207 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc208 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:43)
+#loc209 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:39)
+#loc210 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc211 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc212 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc214 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc216 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:11)
+#loc217 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:4)
+#loc219 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc220 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:11)
+#loc221 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:4)
+#loc231 = loc("xnumel"(#loc1))
+#loc232 = loc("r0_numel"(#loc2))
+#loc233 = loc("xoffset"(#loc3))
+#loc234 = loc("xoffset"(#loc4))
+#loc235 = loc("xindex"(#loc5))
+#loc236 = loc("xindex"(#loc6))
+#loc237 = loc("xindex"(#loc7))
+#loc238 = loc("xmask"(#loc8))
+#loc239 = loc("r0_base"(#loc9))
+#loc240 = loc("r0_base"(#loc10))
+#loc241 = loc("x0"(#loc11))
+#loc242 = loc("x1"(#loc12))
+#loc243 = loc("_tmp4"(#loc13))
+#loc244 = loc("_tmp10"(#loc14))
+#loc245 = loc("_tmp4"(#loc15))
+#loc246 = loc("r0_index"(#loc16))
+#loc247 = loc("r0_mask"(#loc17))
+#loc248 = loc("tmp0"(#loc18))
+#loc249 = loc("tmp0"(#loc19))
+#loc250 = loc("tmp0"(#loc20))
+#loc251 = loc("tmp0"(#loc21))
+#loc252 = loc("tmp0"(#loc22))
+#loc253 = loc("tmp0"(#loc23))
+#loc254 = loc("tmp0"(#loc24))
+#loc255 = loc("tmp0"(#loc25))
+#loc256 = loc("tmp6"(#loc26))
+#loc257 = loc("tmp6"(#loc27))
+#loc258 = loc("tmp6"(#loc28))
+#loc259 = loc("tmp6"(#loc29))
+#loc260 = loc("tmp6"(#loc30))
+#loc261 = loc("tmp6"(#loc31))
+#loc262 = loc("tmp6"(#loc32))
+#loc263 = loc("tmp2"(#loc33))
+#loc264 = loc("tmp5"(#loc34))
+#loc265 = loc("_tmp4"(#loc35))
+#loc266 = loc("tmp8"(#loc36))
+#loc267 = loc("tmp11"(#loc37))
+#loc268 = loc("_tmp10"(#loc38))
+#loc269 = loc("tmp4"(#loc40))
+#loc270 = loc("tmp4"(#loc41))
+#loc271 = loc("tmp10"(#loc42))
+#loc272 = loc("tmp10"(#loc43))
+#loc273 = loc("r0_index"(#loc45))
+#loc274 = loc("r0_mask"(#loc46))
+#loc275 = loc("r0_3"(#loc47))
+#loc276 = loc("r0_4"(#loc48))
+#loc277 = loc("tmp50"(#loc49))
+#loc278 = loc("tmp50"(#loc50))
+#loc279 = loc("tmp50"(#loc51))
+#loc280 = loc("tmp50"(#loc52))
+#loc281 = loc("tmp50"(#loc53))
+#loc282 = loc("tmp50"(#loc54))
+#loc283 = loc("tmp50"(#loc55))
+#loc284 = loc("tmp58"(#loc56))
+#loc285 = loc("tmp58"(#loc57))
+#loc286 = loc("tmp58"(#loc58))
+#loc287 = loc("tmp63"(#loc59))
+#loc288 = loc("tmp63"(#loc60))
+#loc289 = loc("tmp63"(#loc61))
+#loc290 = loc("tmp63"(#loc62))
+#loc291 = loc("tmp66"(#loc63))
+#loc292 = loc("tmp66"(#loc64))
+#loc293 = loc("tmp66"(#loc65))
+#loc294 = loc("tmp66"(#loc66))
+#loc295 = loc("tmp96"(#loc67))
+#loc296 = loc("tmp96"(#loc68))
+#loc297 = loc("tmp96"(#loc69))
+#loc298 = loc("tmp96"(#loc70))
+#loc299 = loc("tmp96"(#loc71))
+#loc300 = loc("tmp96"(#loc72))
+#loc301 = loc("tmp96"(#loc73))
+#loc302 = loc("tmp96"(#loc74))
+#loc303 = loc("tmp102"(#loc75))
+#loc304 = loc("tmp102"(#loc76))
+#loc305 = loc("tmp102"(#loc77))
+#loc306 = loc("tmp13"(#loc78))
+#loc307 = loc("tmp14"(#loc79))
+#loc308 = loc("tmp15"(#loc80))
+#loc309 = loc("tmp16"(#loc81))
+#loc310 = loc("tmp17"(#loc82))
+#loc311 = loc("tmp17"(#loc83))
+#loc312 = loc("tmp17"(#loc84))
+#loc313 = loc("tmp17"(#loc85))
+#loc314 = loc("tmp17"(#loc86))
+#loc315 = loc("tmp17"(#loc87))
+#loc316 = loc("tmp17"(#loc88))
+#loc317 = loc("tmp17"(#loc89))
+#loc318 = loc("tmp17"(#loc90))
+#loc319 = loc("tmp17"(#loc91))
+#loc320 = loc("tmp19"(#loc92))
+#loc321 = loc("tmp20"(#loc93))
+#loc322 = loc("tmp21"(#loc94))
+#loc323 = loc("tmp22"(#loc95))
+#loc324 = loc("tmp23"(#loc96))
+#loc325 = loc("tmp24"(#loc97))
+#loc326 = loc("tmp25"(#loc98))
+#loc327 = loc("tmp25"(#loc99))
+#loc328 = loc("tmp25"(#loc100))
+#loc329 = loc("tmp25"(#loc101))
+#loc330 = loc("tmp25"(#loc102))
+#loc331 = loc("tmp25"(#loc103))
+#loc332 = loc("tmp25"(#loc104))
+#loc333 = loc("tmp27"(#loc105))
+#loc334 = loc("tmp29"(#loc106))
+#loc335 = loc("tmp30"(#loc107))
+#loc336 = loc("tmp31"(#loc108))
+#loc337 = loc("tmp32"(#loc109))
+#loc338 = loc("tmp33"(#loc110))
+#loc339 = loc("tmp34"(#loc111))
+#loc340 = loc("tmp35"(#loc112))
+#loc341 = loc("tmp35"(#loc113))
+#loc342 = loc("tmp35"(#loc114))
+#loc343 = loc("tmp35"(#loc115))
+#loc344 = loc("tmp35"(#loc116))
+#loc345 = loc("tmp35"(#loc117))
+#loc346 = loc("tmp35"(#loc118))
+#loc347 = loc("tmp35"(#loc119))
+#loc348 = loc("tmp35"(#loc120))
+#loc349 = loc("tmp37"(#loc121))
+#loc350 = loc("tmp38"(#loc122))
+#loc351 = loc("tmp39"(#loc123))
+#loc352 = loc("tmp40"(#loc124))
+#loc353 = loc("tmp41"(#loc125))
+#loc354 = loc("tmp42"(#loc126))
+#loc355 = loc("tmp43"(#loc127))
+#loc356 = loc("tmp43"(#loc128))
+#loc357 = loc("tmp43"(#loc129))
+#loc358 = loc("tmp43"(#loc130))
+#loc359 = loc("tmp43"(#loc131))
+#loc360 = loc("tmp43"(#loc132))
+#loc361 = loc("tmp45"(#loc133))
+#loc362 = loc("tmp47"(#loc134))
+#loc363 = loc("tmp48"(#loc135))
+#loc364 = loc("tmp49"(#loc136))
+#loc365 = loc("tmp52"(#loc137))
+#loc366 = loc("tmp53"(#loc138))
+#loc367 = loc("tmp54"(#loc139))
+#loc368 = loc("tmp55"(#loc140))
+#loc369 = loc("tmp56"(#loc141))
+#loc370 = loc("tmp57"(#loc142))
+#loc371 = loc("tmp60"(#loc143))
+#loc372 = loc("tmp64"(#loc144))
+#loc373 = loc("tmp67"(#loc145))
+#loc374 = loc("tmp68"(#loc146))
+#loc375 = loc("tmp70"(#loc147))
+#loc376 = loc("tmp70"(#loc148))
+#loc377 = loc("tmp70"(#loc149))
+#loc378 = loc("tmp70"(#loc150))
+#loc379 = loc("tmp70"(#loc151))
+#loc380 = loc("tmp70"(#loc152))
+#loc381 = loc("tmp70"(#loc153))
+#loc382 = loc("tmp70"(#loc154))
+#loc383 = loc("tmp70"(#loc155))
+#loc384 = loc("tmp70"(#loc156))
+#loc385 = loc("tmp72"(#loc157))
+#loc386 = loc("tmp73"(#loc158))
+#loc387 = loc("tmp74"(#loc159))
+#loc388 = loc("tmp75"(#loc160))
+#loc389 = loc("tmp76"(#loc161))
+#loc390 = loc("tmp76"(#loc162))
+#loc391 = loc("tmp76"(#loc163))
+#loc392 = loc("tmp76"(#loc164))
+#loc393 = loc("tmp76"(#loc165))
+#loc394 = loc("tmp76"(#loc166))
+#loc395 = loc("tmp76"(#loc167))
+#loc396 = loc("tmp78"(#loc168))
+#loc397 = loc("tmp80"(#loc169))
+#loc398 = loc("tmp81"(#loc170))
+#loc399 = loc("tmp82"(#loc171))
+#loc400 = loc("tmp83"(#loc172))
+#loc401 = loc("tmp83"(#loc173))
+#loc402 = loc("tmp83"(#loc174))
+#loc403 = loc("tmp83"(#loc175))
+#loc404 = loc("tmp83"(#loc176))
+#loc405 = loc("tmp83"(#loc177))
+#loc406 = loc("tmp83"(#loc178))
+#loc407 = loc("tmp83"(#loc179))
+#loc408 = loc("tmp83"(#loc180))
+#loc409 = loc("tmp83"(#loc181))
+#loc410 = loc("tmp85"(#loc182))
+#loc411 = loc("tmp86"(#loc183))
+#loc412 = loc("tmp87"(#loc184))
+#loc413 = loc("tmp88"(#loc185))
+#loc414 = loc("tmp89"(#loc186))
+#loc415 = loc("tmp89"(#loc187))
+#loc416 = loc("tmp89"(#loc188))
+#loc417 = loc("tmp89"(#loc189))
+#loc418 = loc("tmp89"(#loc190))
+#loc419 = loc("tmp89"(#loc191))
+#loc420 = loc("tmp91"(#loc192))
+#loc421 = loc("tmp93"(#loc193))
+#loc422 = loc("tmp94"(#loc194))
+#loc423 = loc("tmp95"(#loc195))
+#loc424 = loc("tmp98"(#loc196))
+#loc425 = loc("tmp99"(#loc197))
+#loc426 = loc("tmp100"(#loc198))
+#loc427 = loc("tmp101"(#loc199))
+#loc428 = loc("tmp104"(#loc200))
+#loc429 = loc("tmp107"(#loc201))
+#loc430 = loc("tmp109"(#loc202))
+#loc431 = loc("tmp110"(#loc203))
+#loc435 = loc("_tmp10"(#loc245))
diff --git a/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..66ef1eff47bfa03d4850a8b2681875bd307846e6
--- /dev/null
+++ b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttgir
@@ -0,0 +1,547 @@
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 8], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}>
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc1 = loc(unknown)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc36 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc147 = loc("in_out_ptr0"(#loc))
+#loc148 = loc("in_out_ptr1"(#loc))
+#loc149 = loc("in_ptr0"(#loc))
+#loc150 = loc("in_ptr1"(#loc))
+#loc151 = loc("in_ptr2"(#loc))
+#loc152 = loc("in_ptr3"(#loc))
+#loc153 = loc("in_ptr4"(#loc))
+#loc154 = loc("xnumel"(#loc))
+#loc155 = loc("r0_numel"(#loc))
+#loc185 = loc("tmp4"(#loc33))
+#loc187 = loc("tmp10"(#loc36))
+#loc292 = loc(callsite(#loc1 at #loc185))
+#loc294 = loc(callsite(#loc1 at #loc187))
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:89", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<4097> : tensor<1x64xi32, #blocked> loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x64xbf16, #blocked1> loc(#loc1)
+    %cst_1 = arith.constant dense<1> : tensor<1x64xi32, #blocked> loc(#loc1)
+    %cst_2 = arith.constant dense<1> : tensor<1x64xi64, #blocked> loc(#loc1)
+    %cst_3 = arith.constant dense<2> : tensor<1x64xi32, #blocked> loc(#loc1)
+    %cst_4 = arith.constant dense<36864> : tensor<8x1xi32, #blocked> loc(#loc1)
+    %cst_5 = arith.constant dense<36864> : tensor<8x1xi32, #blocked1> loc(#loc1)
+    %cst_6 = arith.constant dense<128> : tensor<8x1xi32, #blocked> loc(#loc1)
+    %cst_7 = arith.constant dense<128> : tensor<8x1xi32, #blocked1> loc(#loc1)
+    %cst_8 = arith.constant dense<4096> : tensor<1x64xi32, #blocked> loc(#loc1)
+    %cst_9 = arith.constant dense<4096> : tensor<1x64xi32, #blocked1> loc(#loc1)
+    %cst_10 = arith.constant dense<128> : tensor<1x64xi32, #blocked> loc(#loc1)
+    %cst_11 = arith.constant dense<128> : tensor<1x64xi32, #blocked1> loc(#loc1)
+    %cst_12 = arith.constant dense<32> : tensor<8x1xi32, #blocked> loc(#loc1)
+    %cst_13 = arith.constant dense<32> : tensor<8x1xi32, #blocked1> loc(#loc1)
+    %c8_i32 = arith.constant 8 : i32 loc(#loc1)
+    %cst_14 = arith.constant dense<0.000000e+00> : tensor<8x64xbf16, #blocked1> loc(#loc1)
+    %cst_15 = arith.constant dense<0.000000e+00> : tensor<8x64xbf16, #blocked> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %cst_16 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32, #blocked1> loc(#loc1)
+    %cst_17 = arith.constant dense<1.280000e+02> : tensor<8x1xf32, #blocked1> loc(#loc1)
+    %cst_18 = arith.constant dense<0.000000e+00> : tensor<8x64xf32, #blocked> loc(#loc1)
+    %cst_19 = arith.constant dense<0.000000e+00> : tensor<8x64xf32, #blocked1> loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc156)
+    %xoffset_20 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc157)
+    %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc158)
+    %xindex_21 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc158)
+    %xindex_22 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xi32, #blocked1> loc(#loc158)
+    %xindex_23 = tt.expand_dims %xindex_21 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc158)
+    %xindex_24 = tt.splat %xoffset_20 : i32 -> tensor<8x1xi32, #blocked1> loc(#loc159)
+    %xindex_25 = tt.splat %xoffset_20 : i32 -> tensor<8x1xi32, #blocked> loc(#loc159)
+    %xindex_26 = arith.addi %xindex_24, %xindex_22 : tensor<8x1xi32, #blocked1> loc(#loc159)
+    %xindex_27 = arith.addi %xindex_25, %xindex_23 : tensor<8x1xi32, #blocked> loc(#loc159)
+    %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc160)
+    %r0_base_28 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc160)
+    %r0_base_29 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc160)
+    %r0_base_30 = tt.expand_dims %r0_base_28 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> loc(#loc160)
+    %x0 = arith.remsi %xindex_26, %cst_13 : tensor<8x1xi32, #blocked1> loc(#loc161)
+    %x0_31 = arith.remsi %xindex_27, %cst_12 : tensor<8x1xi32, #blocked> loc(#loc161)
+    %x1 = arith.divsi %xindex_26, %cst_13 : tensor<8x1xi32, #blocked1> loc(#loc162)
+    %x1_32 = arith.divsi %xindex_27, %cst_12 : tensor<8x1xi32, #blocked> loc(#loc162)
+    %tmp0 = arith.muli %x0, %cst_7 : tensor<8x1xi32, #blocked1> loc(#loc163)
+    %tmp0_33 = tt.broadcast %tmp0 : tensor<8x1xi32, #blocked1> -> tensor<8x64xi32, #blocked1> loc(#loc164)
+    %tmp0_34 = arith.muli %x1, %cst_5 : tensor<8x1xi32, #blocked1> loc(#loc165)
+    %tmp0_35 = tt.broadcast %tmp0_34 : tensor<8x1xi32, #blocked1> -> tensor<8x64xi32, #blocked1> loc(#loc166)
+    %tmp0_36 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x64x!tt.ptr<bf16>, #blocked1> loc(#loc167)
+    %_tmp10:2 = scf.for %_tmp10_51 = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%arg10 = %cst_19, %arg11 = %cst_19) -> (tensor<8x64xf32, #blocked1>, tensor<8x64xf32, #blocked1>)  : i32 {
+      %r0_index = tt.splat %_tmp10_51 : i32 -> tensor<1x64xi32, #blocked1> loc(#loc169)
+      %r0_index_52 = arith.addi %r0_index, %r0_base_29 : tensor<1x64xi32, #blocked1> loc(#loc169)
+      %r0_mask = arith.cmpi slt, %r0_index_52, %cst_11 : tensor<1x64xi32, #blocked1> loc(#loc170)
+      %tmp0_53 = arith.addi %r0_index_52, %cst_9 : tensor<1x64xi32, #blocked1> loc(#loc171)
+      %tmp0_54 = tt.broadcast %tmp0_53 : tensor<1x64xi32, #blocked1> -> tensor<8x64xi32, #blocked1> loc(#loc164)
+      %tmp0_55 = arith.addi %tmp0_54, %tmp0_33 : tensor<8x64xi32, #blocked1> loc(#loc164)
+      %tmp0_56 = arith.addi %tmp0_55, %tmp0_35 : tensor<8x64xi32, #blocked1> loc(#loc166)
+      %tmp0_57 = tt.addptr %tmp0_36, %tmp0_56 : tensor<8x64x!tt.ptr<bf16>, #blocked1>, tensor<8x64xi32, #blocked1> loc(#loc167)
+      %tmp0_58 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked1> -> tensor<8x64xi1, #blocked1> loc(#loc172)
+      %tmp0_59 = tt.load %tmp0_57, %tmp0_58, %cst_14 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>, #blocked1> loc(#loc172)
+      %tmp0_60 = arith.extf %tmp0_59 : tensor<8x64xbf16, #blocked1> to tensor<8x64xf32, #blocked1> loc(#loc173)
+      %tmp6 = tt.broadcast %r0_index_52 : tensor<1x64xi32, #blocked1> -> tensor<8x64xi32, #blocked1> loc(#loc174)
+      %tmp6_61 = arith.addi %tmp6, %tmp0_33 : tensor<8x64xi32, #blocked1> loc(#loc174)
+      %tmp6_62 = arith.addi %tmp6_61, %tmp0_35 : tensor<8x64xi32, #blocked1> loc(#loc175)
+      %tmp6_63 = tt.addptr %tmp0_36, %tmp6_62 : tensor<8x64x!tt.ptr<bf16>, #blocked1>, tensor<8x64xi32, #blocked1> loc(#loc176)
+      %tmp6_64 = tt.load %tmp6_63, %tmp0_58, %cst_14 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>, #blocked1> loc(#loc177)
+      %tmp6_65 = arith.extf %tmp6_64 : tensor<8x64xbf16, #blocked1> to tensor<8x64xf32, #blocked1> loc(#loc178)
+      %tmp2 = arith.mulf %tmp0_60, %tmp0_60 : tensor<8x64xf32, #blocked1> loc(#loc179)
+      %tmp5 = arith.addf %arg10, %tmp2 : tensor<8x64xf32, #blocked1> loc(#loc180)
+      %_tmp4 = arith.select %tmp0_58, %tmp5, %arg10 : tensor<8x64xi1, #blocked1>, tensor<8x64xf32, #blocked1> loc(#loc181)
+      %tmp8 = arith.mulf %tmp6_65, %tmp6_65 : tensor<8x64xf32, #blocked1> loc(#loc182)
+      %tmp11 = arith.addf %arg11, %tmp8 : tensor<8x64xf32, #blocked1> loc(#loc183)
+      %_tmp10_66 = arith.select %tmp0_58, %tmp11, %arg11 : tensor<8x64xi1, #blocked1>, tensor<8x64xf32, #blocked1> loc(#loc184)
+      scf.yield %_tmp4, %_tmp10_66 : tensor<8x64xf32, #blocked1>, tensor<8x64xf32, #blocked1> loc(#loc31)
+    } loc(#loc290)
+    %tmp4 = "tt.reduce"(%_tmp10#0) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_51: f32 loc(callsite(#loc1 at #loc185)), %tmp4_52: f32 loc(callsite(#loc1 at #loc185))):
+      %tmp4_53 = arith.addf %tmp4_51, %tmp4_52 : f32 loc(#loc297)
+      tt.reduce.return %tmp4_53 : f32 loc(#loc291)
+    }) : (tensor<8x64xf32, #blocked1>) -> tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc291)
+    %tmp4_37 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xf32, #blocked1> loc(#loc186)
+    %tmp10 = "tt.reduce"(%_tmp10#1) <{axis = 1 : i32}> ({
+    ^bb0(%tmp10_51: f32 loc(callsite(#loc1 at #loc187)), %tmp10_52: f32 loc(callsite(#loc1 at #loc187))):
+      %tmp10_53 = arith.addf %tmp10_51, %tmp10_52 : f32 loc(#loc298)
+      tt.reduce.return %tmp10_53 : f32 loc(#loc293)
+    }) : (tensor<8x64xf32, #blocked1>) -> tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc293)
+    %tmp10_38 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xf32, #blocked1> loc(#loc188)
+    %tmp50 = arith.muli %x0_31, %cst_6 : tensor<8x1xi32, #blocked> loc(#loc189)
+    %tmp50_39 = tt.broadcast %tmp50 : tensor<8x1xi32, #blocked> -> tensor<8x64xi32, #blocked> loc(#loc190)
+    %tmp50_40 = arith.muli %x1_32, %cst_4 : tensor<8x1xi32, #blocked> loc(#loc191)
+    %tmp50_41 = tt.broadcast %tmp50_40 : tensor<8x1xi32, #blocked> -> tensor<8x64xi32, #blocked> loc(#loc192)
+    %tmp50_42 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x64x!tt.ptr<bf16>, #blocked> loc(#loc193)
+    %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>, #blocked> loc(#loc194)
+    %tmp58_43 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>, #blocked1> loc(#loc194)
+    %tmp63 = arith.muli %x1, %cst_7 : tensor<8x1xi32, #blocked1> loc(#loc195)
+    %tmp63_44 = tt.broadcast %tmp63 : tensor<8x1xi32, #blocked1> -> tensor<8x64xi32, #blocked1> loc(#loc196)
+    %tmp63_45 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<8x64x!tt.ptr<f32>, #blocked1> loc(#loc197)
+    %tmp66 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<8x64x!tt.ptr<f32>, #blocked1> loc(#loc198)
+    %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>, #blocked> loc(#loc199)
+    %tmp102_46 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>, #blocked1> loc(#loc199)
+    %tmp20 = arith.divf %tmp10_38, %cst_17 : tensor<8x1xf32, #blocked1> loc(#loc200)
+    %tmp22 = arith.addf %tmp20, %cst_16 : tensor<8x1xf32, #blocked1> loc(#loc201)
+    %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32, #blocked1>) -> tensor<8x1xf32, #blocked1> loc(#loc202)
+    %tmp24 = ttg.convert_layout %tmp23 : tensor<8x1xf32, #blocked1> -> tensor<8x1xf32, #blocked> loc(#loc203)
+    %tmp24_47 = tt.broadcast %tmp24 : tensor<8x1xf32, #blocked> -> tensor<8x64xf32, #blocked> loc(#loc203)
+    %tmp24_48 = tt.broadcast %tmp23 : tensor<8x1xf32, #blocked1> -> tensor<8x64xf32, #blocked1> loc(#loc203)
+    %tmp72 = arith.divf %tmp4_37, %cst_17 : tensor<8x1xf32, #blocked1> loc(#loc204)
+    %tmp73 = arith.addf %tmp72, %cst_16 : tensor<8x1xf32, #blocked1> loc(#loc205)
+    %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32, #blocked1>) -> tensor<8x1xf32, #blocked1> loc(#loc206)
+    %tmp75 = ttg.convert_layout %tmp74 : tensor<8x1xf32, #blocked1> -> tensor<8x1xf32, #blocked> loc(#loc207)
+    %tmp75_49 = tt.broadcast %tmp75 : tensor<8x1xf32, #blocked> -> tensor<8x64xf32, #blocked> loc(#loc207)
+    %tmp75_50 = tt.broadcast %tmp74 : tensor<8x1xf32, #blocked1> -> tensor<8x64xf32, #blocked1> loc(#loc207)
+    %0 = arith.muli %xindex_26, %cst_7 : tensor<8x1xi32, #blocked1> loc(#loc57)
+    %1 = tt.broadcast %0 : tensor<8x1xi32, #blocked1> -> tensor<8x64xi32, #blocked1> loc(#loc58)
+    %2 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<8x64x!tt.ptr<bf16>, #blocked1> loc(#loc59)
+    %3 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<8x64x!tt.ptr<bf16>, #blocked1> loc(#loc60)
+    scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32, #blocked1> loc(#loc208)
+      %r0_index_51 = tt.splat %r0_offset : i32 -> tensor<1x64xi32, #blocked> loc(#loc208)
+      %r0_index_52 = arith.addi %r0_index, %r0_base_29 : tensor<1x64xi32, #blocked1> loc(#loc208)
+      %r0_index_53 = arith.addi %r0_index_51, %r0_base_30 : tensor<1x64xi32, #blocked> loc(#loc208)
+      %r0_mask = arith.cmpi slt, %r0_index_52, %cst_11 : tensor<1x64xi32, #blocked1> loc(#loc209)
+      %r0_mask_54 = arith.cmpi slt, %r0_index_53, %cst_10 : tensor<1x64xi32, #blocked> loc(#loc209)
+      %r0_3 = arith.remsi %r0_index_53, %cst_3 : tensor<1x64xi32, #blocked> loc(#loc210)
+      %r0_4 = arith.divsi %r0_index_53, %cst_3 : tensor<1x64xi32, #blocked> loc(#loc211)
+      %tmp50_55 = tt.broadcast %r0_index_52 : tensor<1x64xi32, #blocked1> -> tensor<8x64xi32, #blocked1> loc(#loc190)
+      %tmp50_56 = arith.addi %tmp50_55, %tmp0_33 : tensor<8x64xi32, #blocked1> loc(#loc190)
+      %tmp50_57 = arith.addi %tmp50_56, %tmp0_35 : tensor<8x64xi32, #blocked1> loc(#loc192)
+      %tmp50_58 = tt.addptr %tmp0_36, %tmp50_57 : tensor<8x64x!tt.ptr<bf16>, #blocked1>, tensor<8x64xi32, #blocked1> loc(#loc193)
+      %tmp50_59 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked1> -> tensor<8x64xi1, #blocked1> loc(#loc212)
+      %tmp50_60 = tt.load %tmp50_58, %tmp50_59, %cst_14 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>, #blocked1> loc(#loc212)
+      %tmp50_61 = arith.extf %tmp50_60 : tensor<8x64xbf16, #blocked1> to tensor<8x64xf32, #blocked1> loc(#loc213)
+      %tmp58_62 = tt.addptr %tmp58_43, %r0_index_52 : tensor<1x64x!tt.ptr<bf16>, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc194)
+      %tmp58_63 = tt.load %tmp58_62, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x64x!tt.ptr<bf16>, #blocked1> loc(#loc214)
+      %tmp58_64 = arith.extf %tmp58_63 : tensor<1x64xbf16, #blocked1> to tensor<1x64xf32, #blocked1> loc(#loc215)
+      %tmp63_65 = arith.addi %tmp50_55, %tmp63_44 : tensor<8x64xi32, #blocked1> loc(#loc196)
+      %tmp63_66 = tt.addptr %tmp63_45, %tmp63_65 : tensor<8x64x!tt.ptr<f32>, #blocked1>, tensor<8x64xi32, #blocked1> loc(#loc197)
+      %tmp63_67 = tt.load %tmp63_66, %tmp50_59, %cst_19 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<f32>, #blocked1> loc(#loc216)
+      %tmp66_68 = tt.addptr %tmp66, %tmp63_65 : tensor<8x64x!tt.ptr<f32>, #blocked1>, tensor<8x64xi32, #blocked1> loc(#loc198)
+      %tmp66_69 = tt.load %tmp66_68, %tmp50_59, %cst_19 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<f32>, #blocked1> loc(#loc217)
+      %tmp66_70 = ttg.convert_layout %tmp66_69 : tensor<8x64xf32, #blocked1> -> tensor<8x64xf32, #blocked> loc(#loc217)
+      %tmp96 = arith.addi %r0_index_52, %cst_9 : tensor<1x64xi32, #blocked1> loc(#loc218)
+      %tmp96_71 = tt.broadcast %tmp96 : tensor<1x64xi32, #blocked1> -> tensor<8x64xi32, #blocked1> loc(#loc219)
+      %tmp96_72 = arith.addi %tmp96_71, %tmp0_33 : tensor<8x64xi32, #blocked1> loc(#loc219)
+      %tmp96_73 = arith.addi %tmp96_72, %tmp0_35 : tensor<8x64xi32, #blocked1> loc(#loc220)
+      %tmp96_74 = tt.addptr %tmp0_36, %tmp96_73 : tensor<8x64x!tt.ptr<bf16>, #blocked1>, tensor<8x64xi32, #blocked1> loc(#loc221)
+      %tmp96_75 = tt.load %tmp96_74, %tmp50_59, %cst_14 evictionPolicy = evict_first : tensor<8x64x!tt.ptr<bf16>, #blocked1> loc(#loc222)
+      %tmp96_76 = arith.extf %tmp96_75 : tensor<8x64xbf16, #blocked1> to tensor<8x64xf32, #blocked1> loc(#loc223)
+      %tmp102_77 = tt.addptr %tmp102_46, %r0_index_52 : tensor<1x64x!tt.ptr<bf16>, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc199)
+      %tmp102_78 = tt.load %tmp102_77, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x64x!tt.ptr<bf16>, #blocked1> loc(#loc224)
+      %tmp102_79 = arith.extf %tmp102_78 : tensor<1x64xbf16, #blocked1> to tensor<1x64xf32, #blocked1> loc(#loc225)
+      %tmp16 = arith.extsi %r0_3 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked> loc(#loc226)
+      %tmp16_80 = arith.cmpi slt, %tmp16, %cst_2 : tensor<1x64xi64, #blocked> loc(#loc226)
+      %tmp17 = arith.muli %r0_4, %cst_3 : tensor<1x64xi32, #blocked> loc(#loc227)
+      %tmp17_81 = arith.addi %tmp17, %cst_1 : tensor<1x64xi32, #blocked> loc(#loc228)
+      %tmp17_82 = tt.broadcast %tmp17_81 : tensor<1x64xi32, #blocked> -> tensor<8x64xi32, #blocked> loc(#loc229)
+      %tmp17_83 = arith.addi %tmp17_82, %tmp50_39 : tensor<8x64xi32, #blocked> loc(#loc229)
+      %tmp17_84 = arith.addi %tmp17_83, %tmp50_41 : tensor<8x64xi32, #blocked> loc(#loc230)
+      %tmp17_85 = tt.addptr %tmp50_42, %tmp17_84 : tensor<8x64x!tt.ptr<bf16>, #blocked>, tensor<8x64xi32, #blocked> loc(#loc231)
+      %tmp17_86 = arith.andi %r0_mask_54, %tmp16_80 : tensor<1x64xi1, #blocked> loc(#loc232)
+      %tmp17_87 = tt.broadcast %tmp17_86 : tensor<1x64xi1, #blocked> -> tensor<8x64xi1, #blocked> loc(#loc233)
+      %tmp17_88 = tt.load %tmp17_85, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>, #blocked> loc(#loc233)
+      %tmp17_89 = arith.extf %tmp17_88 : tensor<8x64xbf16, #blocked> to tensor<8x64xf32, #blocked> loc(#loc234)
+      %tmp24_90 = arith.mulf %tmp17_89, %tmp24_47 : tensor<8x64xf32, #blocked> loc(#loc203)
+      %tmp25 = tt.addptr %tmp58, %tmp17_81 : tensor<1x64x!tt.ptr<bf16>, #blocked>, tensor<1x64xi32, #blocked> loc(#loc235)
+      %tmp25_91 = tt.broadcast %tmp25 : tensor<1x64x!tt.ptr<bf16>, #blocked> -> tensor<8x64x!tt.ptr<bf16>, #blocked> loc(#loc235)
+      %tmp25_92 = tt.load %tmp25_91, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>, #blocked> loc(#loc236)
+      %tmp25_93 = arith.extf %tmp25_92 : tensor<8x64xbf16, #blocked> to tensor<8x64xf32, #blocked> loc(#loc237)
+      %tmp27 = arith.mulf %tmp24_90, %tmp25_93 : tensor<8x64xf32, #blocked> loc(#loc238)
+      %tmp29 = arith.subf %cst_18, %tmp27 : tensor<8x64xf32, #blocked> loc(#loc239)
+      %tmp31 = tt.broadcast %tmp16_80 : tensor<1x64xi1, #blocked> -> tensor<8x64xi1, #blocked> loc(#loc240)
+      %tmp32 = arith.cmpi sge, %tmp16, %cst_2 : tensor<1x64xi64, #blocked> loc(#loc241)
+      %tmp35 = tt.broadcast %tmp17 : tensor<1x64xi32, #blocked> -> tensor<8x64xi32, #blocked> loc(#loc242)
+      %tmp35_94 = arith.addi %tmp35, %tmp50_39 : tensor<8x64xi32, #blocked> loc(#loc242)
+      %tmp35_95 = arith.addi %tmp35_94, %tmp50_41 : tensor<8x64xi32, #blocked> loc(#loc243)
+      %tmp35_96 = tt.addptr %tmp50_42, %tmp35_95 : tensor<8x64x!tt.ptr<bf16>, #blocked>, tensor<8x64xi32, #blocked> loc(#loc244)
+      %tmp35_97 = arith.andi %r0_mask_54, %tmp32 : tensor<1x64xi1, #blocked> loc(#loc245)
+      %tmp35_98 = tt.broadcast %tmp35_97 : tensor<1x64xi1, #blocked> -> tensor<8x64xi1, #blocked> loc(#loc246)
+      %tmp35_99 = tt.load %tmp35_96, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>, #blocked> loc(#loc246)
+      %tmp35_100 = arith.extf %tmp35_99 : tensor<8x64xbf16, #blocked> to tensor<8x64xf32, #blocked> loc(#loc247)
+      %tmp42 = arith.mulf %tmp35_100, %tmp24_47 : tensor<8x64xf32, #blocked> loc(#loc248)
+      %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x64x!tt.ptr<bf16>, #blocked>, tensor<1x64xi32, #blocked> loc(#loc249)
+      %tmp43_101 = tt.broadcast %tmp43 : tensor<1x64x!tt.ptr<bf16>, #blocked> -> tensor<8x64x!tt.ptr<bf16>, #blocked> loc(#loc249)
+      %tmp43_102 = tt.load %tmp43_101, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>, #blocked> loc(#loc250)
+      %tmp43_103 = arith.extf %tmp43_102 : tensor<8x64xbf16, #blocked> to tensor<8x64xf32, #blocked> loc(#loc251)
+      %tmp45 = arith.mulf %tmp42, %tmp43_103 : tensor<8x64xf32, #blocked> loc(#loc252)
+      %tmp48 = tt.broadcast %tmp32 : tensor<1x64xi1, #blocked> -> tensor<8x64xi1, #blocked> loc(#loc253)
+      %tmp48_104 = arith.select %tmp48, %tmp45, %cst_18 : tensor<8x64xi1, #blocked>, tensor<8x64xf32, #blocked> loc(#loc253)
+      %tmp49 = arith.select %tmp31, %tmp29, %tmp48_104 : tensor<8x64xi1, #blocked>, tensor<8x64xf32, #blocked> loc(#loc295)
+      %tmp57 = arith.mulf %tmp50_61, %tmp24_48 : tensor<8x64xf32, #blocked1> loc(#loc255)
+      %tmp60 = tt.broadcast %tmp58_64 : tensor<1x64xf32, #blocked1> -> tensor<8x64xf32, #blocked1> loc(#loc256)
+      %tmp60_105 = arith.mulf %tmp57, %tmp60 : tensor<8x64xf32, #blocked1> loc(#loc256)
+      %tmp64 = arith.mulf %tmp60_105, %tmp63_67 : tensor<8x64xf32, #blocked1> loc(#loc257)
+      %tmp64_106 = ttg.convert_layout %tmp64 : tensor<8x64xf32, #blocked1> -> tensor<8x64xf32, #blocked> loc(#loc257)
+      %tmp67 = arith.mulf %tmp49, %tmp66_70 : tensor<8x64xf32, #blocked> loc(#loc258)
+      %tmp68 = arith.addf %tmp64_106, %tmp67 : tensor<8x64xf32, #blocked> loc(#loc259)
+      %tmp70 = arith.addi %tmp17, %cst : tensor<1x64xi32, #blocked> loc(#loc260)
+      %tmp70_107 = tt.broadcast %tmp70 : tensor<1x64xi32, #blocked> -> tensor<8x64xi32, #blocked> loc(#loc261)
+      %tmp70_108 = arith.addi %tmp70_107, %tmp50_39 : tensor<8x64xi32, #blocked> loc(#loc261)
+      %tmp70_109 = arith.addi %tmp70_108, %tmp50_41 : tensor<8x64xi32, #blocked> loc(#loc262)
+      %tmp70_110 = tt.addptr %tmp50_42, %tmp70_109 : tensor<8x64x!tt.ptr<bf16>, #blocked>, tensor<8x64xi32, #blocked> loc(#loc263)
+      %tmp70_111 = tt.load %tmp70_110, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>, #blocked> loc(#loc264)
+      %tmp70_112 = arith.extf %tmp70_111 : tensor<8x64xbf16, #blocked> to tensor<8x64xf32, #blocked> loc(#loc265)
+      %tmp75_113 = arith.mulf %tmp70_112, %tmp75_49 : tensor<8x64xf32, #blocked> loc(#loc207)
+      %tmp76 = tt.addptr %tmp102, %tmp17_81 : tensor<1x64x!tt.ptr<bf16>, #blocked>, tensor<1x64xi32, #blocked> loc(#loc266)
+      %tmp76_114 = tt.broadcast %tmp76 : tensor<1x64x!tt.ptr<bf16>, #blocked> -> tensor<8x64x!tt.ptr<bf16>, #blocked> loc(#loc266)
+      %tmp76_115 = tt.load %tmp76_114, %tmp17_87, %cst_15 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>, #blocked> loc(#loc267)
+      %tmp76_116 = arith.extf %tmp76_115 : tensor<8x64xbf16, #blocked> to tensor<8x64xf32, #blocked> loc(#loc268)
+      %tmp78 = arith.mulf %tmp75_113, %tmp76_116 : tensor<8x64xf32, #blocked> loc(#loc269)
+      %tmp80 = arith.subf %cst_18, %tmp78 : tensor<8x64xf32, #blocked> loc(#loc270)
+      %tmp83 = arith.addi %tmp17, %cst_8 : tensor<1x64xi32, #blocked> loc(#loc271)
+      %tmp83_117 = tt.broadcast %tmp83 : tensor<1x64xi32, #blocked> -> tensor<8x64xi32, #blocked> loc(#loc272)
+      %tmp83_118 = arith.addi %tmp83_117, %tmp50_39 : tensor<8x64xi32, #blocked> loc(#loc272)
+      %tmp83_119 = arith.addi %tmp83_118, %tmp50_41 : tensor<8x64xi32, #blocked> loc(#loc273)
+      %tmp83_120 = tt.addptr %tmp50_42, %tmp83_119 : tensor<8x64x!tt.ptr<bf16>, #blocked>, tensor<8x64xi32, #blocked> loc(#loc274)
+      %tmp83_121 = tt.load %tmp83_120, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>, #blocked> loc(#loc275)
+      %tmp83_122 = arith.extf %tmp83_121 : tensor<8x64xbf16, #blocked> to tensor<8x64xf32, #blocked> loc(#loc276)
+      %tmp88 = arith.mulf %tmp83_122, %tmp75_49 : tensor<8x64xf32, #blocked> loc(#loc277)
+      %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x64x!tt.ptr<bf16>, #blocked>, tensor<1x64xi32, #blocked> loc(#loc278)
+      %tmp89_123 = tt.broadcast %tmp89 : tensor<1x64x!tt.ptr<bf16>, #blocked> -> tensor<8x64x!tt.ptr<bf16>, #blocked> loc(#loc278)
+      %tmp89_124 = tt.load %tmp89_123, %tmp35_98, %cst_15 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>, #blocked> loc(#loc279)
+      %tmp89_125 = arith.extf %tmp89_124 : tensor<8x64xbf16, #blocked> to tensor<8x64xf32, #blocked> loc(#loc280)
+      %tmp91 = arith.mulf %tmp88, %tmp89_125 : tensor<8x64xf32, #blocked> loc(#loc281)
+      %tmp94 = arith.select %tmp48, %tmp91, %cst_18 : tensor<8x64xi1, #blocked>, tensor<8x64xf32, #blocked> loc(#loc282)
+      %tmp95 = arith.select %tmp31, %tmp80, %tmp94 : tensor<8x64xi1, #blocked>, tensor<8x64xf32, #blocked> loc(#loc296)
+      %tmp101 = arith.mulf %tmp96_76, %tmp75_50 : tensor<8x64xf32, #blocked1> loc(#loc285)
+      %tmp104 = tt.broadcast %tmp102_79 : tensor<1x64xf32, #blocked1> -> tensor<8x64xf32, #blocked1> loc(#loc286)
+      %tmp104_126 = arith.mulf %tmp101, %tmp104 : tensor<8x64xf32, #blocked1> loc(#loc286)
+      %tmp107 = arith.mulf %tmp104_126, %tmp63_67 : tensor<8x64xf32, #blocked1> loc(#loc287)
+      %tmp107_127 = ttg.convert_layout %tmp107 : tensor<8x64xf32, #blocked1> -> tensor<8x64xf32, #blocked> loc(#loc287)
+      %tmp109 = arith.mulf %tmp95, %tmp66_70 : tensor<8x64xf32, #blocked> loc(#loc288)
+      %tmp110 = arith.addf %tmp107_127, %tmp109 : tensor<8x64xf32, #blocked> loc(#loc289)
+      %4 = arith.addi %tmp50_55, %1 : tensor<8x64xi32, #blocked1> loc(#loc58)
+      %5 = tt.addptr %2, %4 : tensor<8x64x!tt.ptr<bf16>, #blocked1>, tensor<8x64xi32, #blocked1> loc(#loc59)
+      %6 = arith.truncf %tmp68 : tensor<8x64xf32, #blocked> to tensor<8x64xbf16, #blocked> loc(#loc144)
+      %7 = ttg.convert_layout %6 : tensor<8x64xbf16, #blocked> -> tensor<8x64xbf16, #blocked1> loc(#loc144)
+      tt.store %5, %7, %tmp50_59 : tensor<8x64x!tt.ptr<bf16>, #blocked1> loc(#loc144)
+      %8 = tt.addptr %3, %4 : tensor<8x64x!tt.ptr<bf16>, #blocked1>, tensor<8x64xi32, #blocked1> loc(#loc60)
+      %9 = arith.truncf %tmp110 : tensor<8x64xf32, #blocked> to tensor<8x64xbf16, #blocked> loc(#loc145)
+      %10 = ttg.convert_layout %9 : tensor<8x64xbf16, #blocked> -> tensor<8x64xbf16, #blocked1> loc(#loc145)
+      tt.store %8, %10, %tmp50_59 : tensor<8x64x!tt.ptr<bf16>, #blocked1> loc(#loc145)
+    } loc(#loc61)
+    tt.return loc(#loc146)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8)
+#loc32 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc156 = loc("xoffset"(#loc2))
+#loc157 = loc("xoffset"(#loc3))
+#loc158 = loc("xindex"(#loc4))
+#loc159 = loc("xindex"(#loc5))
+#loc160 = loc("r0_base"(#loc6))
+#loc161 = loc("x0"(#loc7))
+#loc162 = loc("x1"(#loc8))
+#loc163 = loc("tmp0"(#loc9))
+#loc164 = loc("tmp0"(#loc10))
+#loc165 = loc("tmp0"(#loc11))
+#loc166 = loc("tmp0"(#loc12))
+#loc167 = loc("tmp0"(#loc13))
+#loc168 = loc("_tmp4"(#loc14))
+#loc169 = loc("r0_index"(#loc15))
+#loc170 = loc("r0_mask"(#loc16))
+#loc171 = loc("tmp0"(#loc17))
+#loc172 = loc("tmp0"(#loc18))
+#loc173 = loc("tmp0"(#loc19))
+#loc174 = loc("tmp6"(#loc20))
+#loc175 = loc("tmp6"(#loc21))
+#loc176 = loc("tmp6"(#loc22))
+#loc177 = loc("tmp6"(#loc23))
+#loc178 = loc("tmp6"(#loc24))
+#loc179 = loc("tmp2"(#loc25))
+#loc180 = loc("tmp5"(#loc26))
+#loc181 = loc("_tmp4"(#loc27))
+#loc182 = loc("tmp8"(#loc28))
+#loc183 = loc("tmp11"(#loc29))
+#loc184 = loc("_tmp10"(#loc30))
+#loc186 = loc("tmp4"(#loc35))
+#loc188 = loc("tmp10"(#loc37))
+#loc189 = loc("tmp50"(#loc38))
+#loc190 = loc("tmp50"(#loc39))
+#loc191 = loc("tmp50"(#loc40))
+#loc192 = loc("tmp50"(#loc41))
+#loc193 = loc("tmp50"(#loc42))
+#loc194 = loc("tmp58"(#loc43))
+#loc195 = loc("tmp63"(#loc44))
+#loc196 = loc("tmp63"(#loc45))
+#loc197 = loc("tmp63"(#loc46))
+#loc198 = loc("tmp66"(#loc47))
+#loc199 = loc("tmp102"(#loc48))
+#loc200 = loc("tmp20"(#loc49))
+#loc201 = loc("tmp22"(#loc50))
+#loc202 = loc("tmp23"(#loc51))
+#loc203 = loc("tmp24"(#loc52))
+#loc204 = loc("tmp72"(#loc53))
+#loc205 = loc("tmp73"(#loc54))
+#loc206 = loc("tmp74"(#loc55))
+#loc207 = loc("tmp75"(#loc56))
+#loc208 = loc("r0_index"(#loc62))
+#loc209 = loc("r0_mask"(#loc63))
+#loc210 = loc("r0_3"(#loc64))
+#loc211 = loc("r0_4"(#loc65))
+#loc212 = loc("tmp50"(#loc66))
+#loc213 = loc("tmp50"(#loc67))
+#loc214 = loc("tmp58"(#loc68))
+#loc215 = loc("tmp58"(#loc69))
+#loc216 = loc("tmp63"(#loc70))
+#loc217 = loc("tmp66"(#loc71))
+#loc218 = loc("tmp96"(#loc72))
+#loc219 = loc("tmp96"(#loc73))
+#loc220 = loc("tmp96"(#loc74))
+#loc221 = loc("tmp96"(#loc75))
+#loc222 = loc("tmp96"(#loc76))
+#loc223 = loc("tmp96"(#loc77))
+#loc224 = loc("tmp102"(#loc78))
+#loc225 = loc("tmp102"(#loc79))
+#loc226 = loc("tmp16"(#loc80))
+#loc227 = loc("tmp17"(#loc81))
+#loc228 = loc("tmp17"(#loc82))
+#loc229 = loc("tmp17"(#loc83))
+#loc230 = loc("tmp17"(#loc84))
+#loc231 = loc("tmp17"(#loc85))
+#loc232 = loc("tmp17"(#loc86))
+#loc233 = loc("tmp17"(#loc87))
+#loc234 = loc("tmp17"(#loc88))
+#loc235 = loc("tmp25"(#loc89))
+#loc236 = loc("tmp25"(#loc90))
+#loc237 = loc("tmp25"(#loc91))
+#loc238 = loc("tmp27"(#loc92))
+#loc239 = loc("tmp29"(#loc93))
+#loc240 = loc("tmp31"(#loc94))
+#loc241 = loc("tmp32"(#loc95))
+#loc242 = loc("tmp35"(#loc96))
+#loc243 = loc("tmp35"(#loc97))
+#loc244 = loc("tmp35"(#loc98))
+#loc245 = loc("tmp35"(#loc99))
+#loc246 = loc("tmp35"(#loc100))
+#loc247 = loc("tmp35"(#loc101))
+#loc248 = loc("tmp42"(#loc102))
+#loc249 = loc("tmp43"(#loc103))
+#loc250 = loc("tmp43"(#loc104))
+#loc251 = loc("tmp43"(#loc105))
+#loc252 = loc("tmp45"(#loc106))
+#loc253 = loc("tmp48"(#loc107))
+#loc254 = loc("tmp49"(#loc108))
+#loc255 = loc("tmp57"(#loc109))
+#loc256 = loc("tmp60"(#loc110))
+#loc257 = loc("tmp64"(#loc111))
+#loc258 = loc("tmp67"(#loc112))
+#loc259 = loc("tmp68"(#loc113))
+#loc260 = loc("tmp70"(#loc114))
+#loc261 = loc("tmp70"(#loc115))
+#loc262 = loc("tmp70"(#loc116))
+#loc263 = loc("tmp70"(#loc117))
+#loc264 = loc("tmp70"(#loc118))
+#loc265 = loc("tmp70"(#loc119))
+#loc266 = loc("tmp76"(#loc120))
+#loc267 = loc("tmp76"(#loc121))
+#loc268 = loc("tmp76"(#loc122))
+#loc269 = loc("tmp78"(#loc123))
+#loc270 = loc("tmp80"(#loc124))
+#loc271 = loc("tmp83"(#loc125))
+#loc272 = loc("tmp83"(#loc126))
+#loc273 = loc("tmp83"(#loc127))
+#loc274 = loc("tmp83"(#loc128))
+#loc275 = loc("tmp83"(#loc129))
+#loc276 = loc("tmp83"(#loc130))
+#loc277 = loc("tmp88"(#loc131))
+#loc278 = loc("tmp89"(#loc132))
+#loc279 = loc("tmp89"(#loc133))
+#loc280 = loc("tmp89"(#loc134))
+#loc281 = loc("tmp91"(#loc135))
+#loc282 = loc("tmp94"(#loc136))
+#loc283 = loc("tmp95"(#loc137))
+#loc284 = loc("tmp82"(#loc138))
+#loc285 = loc("tmp101"(#loc139))
+#loc286 = loc("tmp104"(#loc140))
+#loc287 = loc("tmp107"(#loc141))
+#loc288 = loc("tmp109"(#loc142))
+#loc289 = loc("tmp110"(#loc143))
+#loc290 = loc("_tmp10"(#loc168))
+#loc291 = loc(callsite(#loc32 at #loc185))
+#loc293 = loc(callsite(#loc32 at #loc187))
+#loc295 = loc(fused[#loc254, #loc240])
+#loc296 = loc(fused[#loc283, #loc284])
+#loc297 = loc(callsite(#loc34 at #loc291))
+#loc298 = loc(callsite(#loc34 at #loc293))
diff --git a/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..724739f9c25dc5e8b33633edf8ff03ad9b391bf3
--- /dev/null
+++ b/triton/YTJMOQ5EK2K5SU77M2GF34KOJUJGHRSHTNE3D7KKVDZMRCR7C73Q/triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0.ttir
@@ -0,0 +1,520 @@
+#loc = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":18:0)
+#loc1 = loc(unknown)
+#loc35 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:25)
+#loc38 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:27)
+#loc149 = loc("in_out_ptr0"(#loc))
+#loc150 = loc("in_out_ptr1"(#loc))
+#loc151 = loc("in_ptr0"(#loc))
+#loc152 = loc("in_ptr1"(#loc))
+#loc153 = loc("in_ptr2"(#loc))
+#loc154 = loc("in_ptr3"(#loc))
+#loc155 = loc("in_ptr4"(#loc))
+#loc156 = loc("xnumel"(#loc))
+#loc157 = loc("r0_numel"(#loc))
+#loc189 = loc("tmp4"(#loc35))
+#loc191 = loc("tmp10"(#loc38))
+#loc296 = loc(callsite(#loc1 at #loc189))
+#loc298 = loc(callsite(#loc1 at #loc191))
+module {
+  tt.func public @triton_red_fused__fused_rms_norm__to_copy_add_mul_neg_split_split_with_sizes_stack_unbind_unsqueeze_view_0(%in_out_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_out_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_out_ptr1"(#loc)), %in_ptr0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %in_ptr4: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("in_ptr4"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<1x64xbf16> loc(#loc1)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<8x64xbf16> loc(#loc1)
+    %c64_i32 = arith.constant 64 : i32 loc(#loc1)
+    %c128_i32 = arith.constant 128 : i32 loc(#loc1)
+    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
+    %cst_1 = arith.constant dense<4097> : tensor<1x64xi32> loc(#loc1)
+    %cst_2 = arith.constant dense<9.99999997E-7> : tensor<8x1xf32> loc(#loc1)
+    %cst_3 = arith.constant dense<1.280000e+02> : tensor<8x1xf32> loc(#loc1)
+    %cst_4 = arith.constant dense<1> : tensor<1x64xi32> loc(#loc1)
+    %cst_5 = arith.constant dense<1> : tensor<1x64xi64> loc(#loc1)
+    %cst_6 = arith.constant dense<2> : tensor<1x64xi32> loc(#loc1)
+    %cst_7 = arith.constant dense<36864> : tensor<8x1xi32> loc(#loc1)
+    %cst_8 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc1)
+    %cst_9 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc1)
+    %cst_10 = arith.constant dense<128> : tensor<1x64xi32> loc(#loc1)
+    %cst_11 = arith.constant dense<0.000000e+00> : tensor<8x64xf32> loc(#loc1)
+    %cst_12 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc1)
+    %c8_i32 = arith.constant 8 : i32 loc(#loc1)
+    %xoffset = tt.get_program_id x : i32 loc(#loc158)
+    %xoffset_13 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc159)
+    %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc160)
+    %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc161)
+    %xindex_15 = tt.splat %xoffset_13 : i32 -> tensor<8x1xi32> loc(#loc162)
+    %xindex_16 = arith.addi %xindex_15, %xindex_14 : tensor<8x1xi32> loc(#loc162)
+    %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc163)
+    %r0_base_17 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc164)
+    %x0 = arith.remsi %xindex_16, %cst_12 : tensor<8x1xi32> loc(#loc165)
+    %x1 = arith.divsi %xindex_16, %cst_12 : tensor<8x1xi32> loc(#loc166)
+    %_tmp10:2 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%_tmp4 = %cst_11, %_tmp10_20 = %cst_11) -> (tensor<8x64xf32>, tensor<8x64xf32>)  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc168)
+      %r0_index_21 = arith.addi %r0_index, %r0_base_17 : tensor<1x64xi32> loc(#loc168)
+      %r0_mask = arith.cmpi slt, %r0_index_21, %cst_10 : tensor<1x64xi32> loc(#loc169)
+      %tmp0 = arith.addi %r0_index_21, %cst_9 : tensor<1x64xi32> loc(#loc170)
+      %tmp0_22 = arith.muli %x0, %cst_8 : tensor<8x1xi32> loc(#loc171)
+      %tmp0_23 = tt.broadcast %tmp0 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc172)
+      %tmp0_24 = tt.broadcast %tmp0_22 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc172)
+      %tmp0_25 = arith.addi %tmp0_23, %tmp0_24 : tensor<8x64xi32> loc(#loc172)
+      %tmp0_26 = arith.muli %x1, %cst_7 : tensor<8x1xi32> loc(#loc173)
+      %tmp0_27 = tt.broadcast %tmp0_26 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc174)
+      %tmp0_28 = arith.addi %tmp0_25, %tmp0_27 : tensor<8x64xi32> loc(#loc174)
+      %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x64x!tt.ptr<bf16>> loc(#loc175)
+      %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<8x64x!tt.ptr<bf16>>, tensor<8x64xi32> loc(#loc175)
+      %tmp0_31 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc176)
+      %tmp0_32 = tt.load %tmp0_30, %tmp0_31, %cst_0 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>> loc(#loc176)
+      %tmp0_33 = arith.extf %tmp0_32 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc177)
+      %tmp6 = tt.broadcast %r0_index_21 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc178)
+      %tmp6_34 = arith.addi %tmp6, %tmp0_24 : tensor<8x64xi32> loc(#loc178)
+      %tmp6_35 = arith.addi %tmp6_34, %tmp0_27 : tensor<8x64xi32> loc(#loc179)
+      %tmp6_36 = tt.addptr %tmp0_29, %tmp6_35 : tensor<8x64x!tt.ptr<bf16>>, tensor<8x64xi32> loc(#loc180)
+      %tmp6_37 = tt.load %tmp6_36, %tmp0_31, %cst_0 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>> loc(#loc181)
+      %tmp6_38 = arith.extf %tmp6_37 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc182)
+      %tmp2 = arith.mulf %tmp0_33, %tmp0_33 : tensor<8x64xf32> loc(#loc183)
+      %tmp5 = arith.addf %_tmp4, %tmp2 : tensor<8x64xf32> loc(#loc184)
+      %_tmp4_39 = arith.select %tmp0_31, %tmp5, %_tmp4 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc185)
+      %tmp8 = arith.mulf %tmp6_38, %tmp6_38 : tensor<8x64xf32> loc(#loc186)
+      %tmp11 = arith.addf %_tmp10_20, %tmp8 : tensor<8x64xf32> loc(#loc187)
+      %_tmp10_40 = arith.select %tmp0_31, %tmp11, %_tmp10_20 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc188)
+      scf.yield %_tmp4_39, %_tmp10_40 : tensor<8x64xf32>, tensor<8x64xf32> loc(#loc33)
+    } loc(#loc294)
+    %tmp4 = "tt.reduce"(%_tmp10#0) <{axis = 1 : i32}> ({
+    ^bb0(%tmp4_20: f32 loc(callsite(#loc1 at #loc189)), %tmp4_21: f32 loc(callsite(#loc1 at #loc189))):
+      %tmp4_22 = arith.addf %tmp4_20, %tmp4_21 : f32 loc(#loc299)
+      tt.reduce.return %tmp4_22 : f32 loc(#loc295)
+    }) : (tensor<8x64xf32>) -> tensor<8xf32> loc(#loc295)
+    %tmp4_18 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc190)
+    %tmp10 = "tt.reduce"(%_tmp10#1) <{axis = 1 : i32}> ({
+    ^bb0(%tmp10_20: f32 loc(callsite(#loc1 at #loc191)), %tmp10_21: f32 loc(callsite(#loc1 at #loc191))):
+      %tmp10_22 = arith.addf %tmp10_20, %tmp10_21 : f32 loc(#loc300)
+      tt.reduce.return %tmp10_22 : f32 loc(#loc297)
+    }) : (tensor<8x64xf32>) -> tensor<8xf32> loc(#loc297)
+    %tmp10_19 = tt.expand_dims %tmp10 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc192)
+    scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32  : i32 {
+      %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc193)
+      %r0_index_20 = arith.addi %r0_index, %r0_base_17 : tensor<1x64xi32> loc(#loc193)
+      %r0_mask = arith.cmpi slt, %r0_index_20, %cst_10 : tensor<1x64xi32> loc(#loc194)
+      %r0_3 = arith.remsi %r0_index_20, %cst_6 : tensor<1x64xi32> loc(#loc195)
+      %r0_4 = arith.divsi %r0_index_20, %cst_6 : tensor<1x64xi32> loc(#loc196)
+      %tmp50 = arith.muli %x0, %cst_8 : tensor<8x1xi32> loc(#loc197)
+      %tmp50_21 = tt.broadcast %r0_index_20 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc198)
+      %tmp50_22 = tt.broadcast %tmp50 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc198)
+      %tmp50_23 = arith.addi %tmp50_21, %tmp50_22 : tensor<8x64xi32> loc(#loc198)
+      %tmp50_24 = arith.muli %x1, %cst_7 : tensor<8x1xi32> loc(#loc199)
+      %tmp50_25 = tt.broadcast %tmp50_24 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc200)
+      %tmp50_26 = arith.addi %tmp50_23, %tmp50_25 : tensor<8x64xi32> loc(#loc200)
+      %tmp50_27 = tt.splat %in_ptr0 : !tt.ptr<bf16> -> tensor<8x64x!tt.ptr<bf16>> loc(#loc201)
+      %tmp50_28 = tt.addptr %tmp50_27, %tmp50_26 : tensor<8x64x!tt.ptr<bf16>>, tensor<8x64xi32> loc(#loc201)
+      %tmp50_29 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc202)
+      %tmp50_30 = tt.load %tmp50_28, %tmp50_29, %cst_0 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>> loc(#loc202)
+      %tmp50_31 = arith.extf %tmp50_30 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc203)
+      %tmp58 = tt.splat %in_ptr1 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>> loc(#loc204)
+      %tmp58_32 = tt.addptr %tmp58, %r0_index_20 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc204)
+      %tmp58_33 = tt.load %tmp58_32, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x64x!tt.ptr<bf16>> loc(#loc205)
+      %tmp58_34 = arith.extf %tmp58_33 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc206)
+      %tmp63 = arith.muli %x1, %cst_8 : tensor<8x1xi32> loc(#loc207)
+      %tmp63_35 = tt.broadcast %tmp63 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc208)
+      %tmp63_36 = arith.addi %tmp50_21, %tmp63_35 : tensor<8x64xi32> loc(#loc208)
+      %tmp63_37 = tt.splat %in_ptr2 : !tt.ptr<f32> -> tensor<8x64x!tt.ptr<f32>> loc(#loc209)
+      %tmp63_38 = tt.addptr %tmp63_37, %tmp63_36 : tensor<8x64x!tt.ptr<f32>>, tensor<8x64xi32> loc(#loc209)
+      %tmp63_39 = tt.load %tmp63_38, %tmp50_29, %cst_11 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<f32>> loc(#loc210)
+      %tmp66 = tt.splat %in_ptr3 : !tt.ptr<f32> -> tensor<8x64x!tt.ptr<f32>> loc(#loc211)
+      %tmp66_40 = tt.addptr %tmp66, %tmp63_36 : tensor<8x64x!tt.ptr<f32>>, tensor<8x64xi32> loc(#loc211)
+      %tmp66_41 = tt.load %tmp66_40, %tmp50_29, %cst_11 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<f32>> loc(#loc212)
+      %tmp96 = arith.addi %r0_index_20, %cst_9 : tensor<1x64xi32> loc(#loc213)
+      %tmp96_42 = tt.broadcast %tmp96 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc214)
+      %tmp96_43 = arith.addi %tmp96_42, %tmp50_22 : tensor<8x64xi32> loc(#loc214)
+      %tmp96_44 = arith.addi %tmp96_43, %tmp50_25 : tensor<8x64xi32> loc(#loc215)
+      %tmp96_45 = tt.addptr %tmp50_27, %tmp96_44 : tensor<8x64x!tt.ptr<bf16>>, tensor<8x64xi32> loc(#loc216)
+      %tmp96_46 = tt.load %tmp96_45, %tmp50_29, %cst_0 evictionPolicy = evict_first : tensor<8x64x!tt.ptr<bf16>> loc(#loc217)
+      %tmp96_47 = arith.extf %tmp96_46 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc218)
+      %tmp102 = tt.splat %in_ptr4 : !tt.ptr<bf16> -> tensor<1x64x!tt.ptr<bf16>> loc(#loc219)
+      %tmp102_48 = tt.addptr %tmp102, %r0_index_20 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc219)
+      %tmp102_49 = tt.load %tmp102_48, %r0_mask, %cst evictionPolicy = evict_last : tensor<1x64x!tt.ptr<bf16>> loc(#loc220)
+      %tmp102_50 = arith.extf %tmp102_49 : tensor<1x64xbf16> to tensor<1x64xf32> loc(#loc221)
+      %tmp16 = arith.extsi %r0_3 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc222)
+      %tmp16_51 = arith.cmpi slt, %tmp16, %cst_5 : tensor<1x64xi64> loc(#loc222)
+      %tmp17 = arith.muli %r0_4, %cst_6 : tensor<1x64xi32> loc(#loc223)
+      %tmp17_52 = arith.addi %tmp17, %cst_4 : tensor<1x64xi32> loc(#loc224)
+      %tmp17_53 = tt.broadcast %tmp17_52 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc225)
+      %tmp17_54 = arith.addi %tmp17_53, %tmp50_22 : tensor<8x64xi32> loc(#loc225)
+      %tmp17_55 = arith.addi %tmp17_54, %tmp50_25 : tensor<8x64xi32> loc(#loc226)
+      %tmp17_56 = tt.addptr %tmp50_27, %tmp17_55 : tensor<8x64x!tt.ptr<bf16>>, tensor<8x64xi32> loc(#loc227)
+      %tmp17_57 = arith.andi %r0_mask, %tmp16_51 : tensor<1x64xi1> loc(#loc228)
+      %tmp17_58 = tt.broadcast %tmp17_57 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc229)
+      %tmp17_59 = tt.load %tmp17_56, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>> loc(#loc229)
+      %tmp17_60 = arith.extf %tmp17_59 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc230)
+      %tmp20 = arith.divf %tmp10_19, %cst_3 : tensor<8x1xf32> loc(#loc231)
+      %tmp22 = arith.addf %tmp20, %cst_2 : tensor<8x1xf32> loc(#loc232)
+      %tmp23 = tt.extern_elementwise %tmp22 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc233)
+      %tmp24 = tt.broadcast %tmp23 : tensor<8x1xf32> -> tensor<8x64xf32> loc(#loc234)
+      %tmp24_61 = arith.mulf %tmp17_60, %tmp24 : tensor<8x64xf32> loc(#loc234)
+      %tmp25 = tt.addptr %tmp58, %tmp17_52 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc235)
+      %tmp25_62 = tt.broadcast %tmp25 : tensor<1x64x!tt.ptr<bf16>> -> tensor<8x64x!tt.ptr<bf16>> loc(#loc235)
+      %tmp25_63 = tt.load %tmp25_62, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>> loc(#loc236)
+      %tmp25_64 = arith.extf %tmp25_63 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc237)
+      %tmp27 = arith.mulf %tmp24_61, %tmp25_64 : tensor<8x64xf32> loc(#loc238)
+      %tmp29 = arith.subf %cst_11, %tmp27 : tensor<8x64xf32> loc(#loc239)
+      %tmp31 = tt.broadcast %tmp16_51 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc240)
+      %tmp31_65 = arith.select %tmp31, %tmp29, %cst_11 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc240)
+      %tmp32 = arith.cmpi sge, %tmp16, %cst_5 : tensor<1x64xi64> loc(#loc241)
+      %tmp35 = tt.broadcast %tmp17 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc242)
+      %tmp35_66 = arith.addi %tmp35, %tmp50_22 : tensor<8x64xi32> loc(#loc242)
+      %tmp35_67 = arith.addi %tmp35_66, %tmp50_25 : tensor<8x64xi32> loc(#loc243)
+      %tmp35_68 = tt.addptr %tmp50_27, %tmp35_67 : tensor<8x64x!tt.ptr<bf16>>, tensor<8x64xi32> loc(#loc244)
+      %tmp35_69 = arith.andi %r0_mask, %tmp32 : tensor<1x64xi1> loc(#loc245)
+      %tmp35_70 = tt.broadcast %tmp35_69 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc246)
+      %tmp35_71 = tt.load %tmp35_68, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>> loc(#loc246)
+      %tmp35_72 = arith.extf %tmp35_71 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc247)
+      %tmp42 = arith.mulf %tmp35_72, %tmp24 : tensor<8x64xf32> loc(#loc248)
+      %tmp43 = tt.addptr %tmp58, %tmp17 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc249)
+      %tmp43_73 = tt.broadcast %tmp43 : tensor<1x64x!tt.ptr<bf16>> -> tensor<8x64x!tt.ptr<bf16>> loc(#loc249)
+      %tmp43_74 = tt.load %tmp43_73, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>> loc(#loc250)
+      %tmp43_75 = arith.extf %tmp43_74 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc251)
+      %tmp45 = arith.mulf %tmp42, %tmp43_75 : tensor<8x64xf32> loc(#loc252)
+      %tmp48 = tt.broadcast %tmp32 : tensor<1x64xi1> -> tensor<8x64xi1> loc(#loc253)
+      %tmp48_76 = arith.select %tmp48, %tmp45, %cst_11 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc253)
+      %tmp49 = arith.select %tmp31, %tmp31_65, %tmp48_76 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc254)
+      %tmp57 = arith.mulf %tmp50_31, %tmp24 : tensor<8x64xf32> loc(#loc255)
+      %tmp60 = tt.broadcast %tmp58_34 : tensor<1x64xf32> -> tensor<8x64xf32> loc(#loc256)
+      %tmp60_77 = arith.mulf %tmp57, %tmp60 : tensor<8x64xf32> loc(#loc256)
+      %tmp64 = arith.mulf %tmp60_77, %tmp63_39 : tensor<8x64xf32> loc(#loc257)
+      %tmp67 = arith.mulf %tmp49, %tmp66_41 : tensor<8x64xf32> loc(#loc258)
+      %tmp68 = arith.addf %tmp64, %tmp67 : tensor<8x64xf32> loc(#loc259)
+      %tmp70 = arith.addi %tmp17, %cst_1 : tensor<1x64xi32> loc(#loc260)
+      %tmp70_78 = tt.broadcast %tmp70 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc261)
+      %tmp70_79 = arith.addi %tmp70_78, %tmp50_22 : tensor<8x64xi32> loc(#loc261)
+      %tmp70_80 = arith.addi %tmp70_79, %tmp50_25 : tensor<8x64xi32> loc(#loc262)
+      %tmp70_81 = tt.addptr %tmp50_27, %tmp70_80 : tensor<8x64x!tt.ptr<bf16>>, tensor<8x64xi32> loc(#loc263)
+      %tmp70_82 = tt.load %tmp70_81, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>> loc(#loc264)
+      %tmp70_83 = arith.extf %tmp70_82 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc265)
+      %tmp72 = arith.divf %tmp4_18, %cst_3 : tensor<8x1xf32> loc(#loc266)
+      %tmp73 = arith.addf %tmp72, %cst_2 : tensor<8x1xf32> loc(#loc267)
+      %tmp74 = tt.extern_elementwise %tmp73 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<8x1xf32>) -> tensor<8x1xf32> loc(#loc268)
+      %tmp75 = tt.broadcast %tmp74 : tensor<8x1xf32> -> tensor<8x64xf32> loc(#loc269)
+      %tmp75_84 = arith.mulf %tmp70_83, %tmp75 : tensor<8x64xf32> loc(#loc269)
+      %tmp76 = tt.addptr %tmp102, %tmp17_52 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc270)
+      %tmp76_85 = tt.broadcast %tmp76 : tensor<1x64x!tt.ptr<bf16>> -> tensor<8x64x!tt.ptr<bf16>> loc(#loc270)
+      %tmp76_86 = tt.load %tmp76_85, %tmp17_58, %cst_0 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>> loc(#loc271)
+      %tmp76_87 = arith.extf %tmp76_86 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc272)
+      %tmp78 = arith.mulf %tmp75_84, %tmp76_87 : tensor<8x64xf32> loc(#loc273)
+      %tmp80 = arith.subf %cst_11, %tmp78 : tensor<8x64xf32> loc(#loc274)
+      %tmp82 = arith.select %tmp31, %tmp80, %cst_11 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc275)
+      %tmp83 = arith.addi %tmp17, %cst_9 : tensor<1x64xi32> loc(#loc276)
+      %tmp83_88 = tt.broadcast %tmp83 : tensor<1x64xi32> -> tensor<8x64xi32> loc(#loc277)
+      %tmp83_89 = arith.addi %tmp83_88, %tmp50_22 : tensor<8x64xi32> loc(#loc277)
+      %tmp83_90 = arith.addi %tmp83_89, %tmp50_25 : tensor<8x64xi32> loc(#loc278)
+      %tmp83_91 = tt.addptr %tmp50_27, %tmp83_90 : tensor<8x64x!tt.ptr<bf16>>, tensor<8x64xi32> loc(#loc279)
+      %tmp83_92 = tt.load %tmp83_91, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>> loc(#loc280)
+      %tmp83_93 = arith.extf %tmp83_92 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc281)
+      %tmp88 = arith.mulf %tmp83_93, %tmp75 : tensor<8x64xf32> loc(#loc282)
+      %tmp89 = tt.addptr %tmp102, %tmp17 : tensor<1x64x!tt.ptr<bf16>>, tensor<1x64xi32> loc(#loc283)
+      %tmp89_94 = tt.broadcast %tmp89 : tensor<1x64x!tt.ptr<bf16>> -> tensor<8x64x!tt.ptr<bf16>> loc(#loc283)
+      %tmp89_95 = tt.load %tmp89_94, %tmp35_70, %cst_0 evictionPolicy = evict_last : tensor<8x64x!tt.ptr<bf16>> loc(#loc284)
+      %tmp89_96 = arith.extf %tmp89_95 : tensor<8x64xbf16> to tensor<8x64xf32> loc(#loc285)
+      %tmp91 = arith.mulf %tmp88, %tmp89_96 : tensor<8x64xf32> loc(#loc286)
+      %tmp94 = arith.select %tmp48, %tmp91, %cst_11 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc287)
+      %tmp95 = arith.select %tmp31, %tmp82, %tmp94 : tensor<8x64xi1>, tensor<8x64xf32> loc(#loc288)
+      %tmp101 = arith.mulf %tmp96_47, %tmp75 : tensor<8x64xf32> loc(#loc289)
+      %tmp104 = tt.broadcast %tmp102_50 : tensor<1x64xf32> -> tensor<8x64xf32> loc(#loc290)
+      %tmp104_97 = arith.mulf %tmp101, %tmp104 : tensor<8x64xf32> loc(#loc290)
+      %tmp107 = arith.mulf %tmp104_97, %tmp63_39 : tensor<8x64xf32> loc(#loc291)
+      %tmp109 = arith.mulf %tmp95, %tmp66_41 : tensor<8x64xf32> loc(#loc292)
+      %tmp110 = arith.addf %tmp107, %tmp109 : tensor<8x64xf32> loc(#loc293)
+      %0 = arith.muli %xindex_16, %cst_8 : tensor<8x1xi32> loc(#loc142)
+      %1 = tt.broadcast %0 : tensor<8x1xi32> -> tensor<8x64xi32> loc(#loc143)
+      %2 = arith.addi %tmp50_21, %1 : tensor<8x64xi32> loc(#loc143)
+      %3 = tt.splat %in_out_ptr0 : !tt.ptr<bf16> -> tensor<8x64x!tt.ptr<bf16>> loc(#loc144)
+      %4 = tt.addptr %3, %2 : tensor<8x64x!tt.ptr<bf16>>, tensor<8x64xi32> loc(#loc144)
+      %5 = arith.truncf %tmp68 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc145)
+      tt.store %4, %5, %tmp50_29 : tensor<8x64x!tt.ptr<bf16>> loc(#loc145)
+      %6 = tt.splat %in_out_ptr1 : !tt.ptr<bf16> -> tensor<8x64x!tt.ptr<bf16>> loc(#loc146)
+      %7 = tt.addptr %6, %2 : tensor<8x64x!tt.ptr<bf16>>, tensor<8x64xi32> loc(#loc146)
+      %8 = arith.truncf %tmp110 : tensor<8x64xf32> to tensor<8x64xbf16> loc(#loc147)
+      tt.store %7, %8, %tmp50_29 : tensor<8x64x!tt.ptr<bf16>> loc(#loc147)
+    } loc(#loc40)
+    tt.return loc(#loc148)
+  } loc(#loc)
+} loc(#loc)
+#loc2 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:28)
+#loc3 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":23:33)
+#loc4 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:36)
+#loc5 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:44)
+#loc6 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":24:23)
+#loc7 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:27)
+#loc8 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":26:37)
+#loc9 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":28:19)
+#loc10 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":29:19)
+#loc11 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":33:43)
+#loc12 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":34:31)
+#loc13 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":35:29)
+#loc14 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:41)
+#loc15 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:52)
+#loc16 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:48)
+#loc17 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:63)
+#loc18 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:57)
+#loc19 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:34)
+#loc20 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:68)
+#loc21 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":39:121)
+#loc22 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:41)
+#loc23 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:50)
+#loc24 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:34)
+#loc25 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:61)
+#loc26 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":40:114)
+#loc27 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":42:22)
+#loc28 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":44:23)
+#loc29 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":45:40)
+#loc30 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":47:22)
+#loc31 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":49:25)
+#loc32 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:42)
+#loc33 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":50:8)
+#loc34 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":293:36)
+#loc36 = loc("/usr/local/lib/python3.12/dist-packages/triton/language/standard.py":263:15)
+#loc37 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":51:28)
+#loc39 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":52:30)
+#loc40 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:43)
+#loc41 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":54:31)
+#loc42 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":55:29)
+#loc43 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":58:27)
+#loc44 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":59:27)
+#loc45 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:46)
+#loc46 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:42)
+#loc47 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:57)
+#loc48 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:51)
+#loc49 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:35)
+#loc50 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:62)
+#loc51 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":61:115)
+#loc52 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:35)
+#loc53 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:42)
+#loc54 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":62:95)
+#loc55 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:46)
+#loc56 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:42)
+#loc57 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:35)
+#loc58 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":63:51)
+#loc59 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:35)
+#loc60 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":64:51)
+#loc61 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:42)
+#loc62 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:49)
+#loc63 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:58)
+#loc64 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:35)
+#loc65 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:69)
+#loc66 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":65:123)
+#loc67 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:36)
+#loc68 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:43)
+#loc69 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":66:96)
+#loc70 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":71:24)
+#loc71 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:41)
+#loc72 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:39)
+#loc73 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:48)
+#loc74 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:57)
+#loc75 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:35)
+#loc76 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:78)
+#loc77 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:68)
+#loc78 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":72:129)
+#loc79 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":75:25)
+#loc80 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":77:24)
+#loc81 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":78:32)
+#loc82 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":79:24)
+#loc83 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:35)
+#loc84 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:85)
+#loc85 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":80:146)
+#loc86 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":82:24)
+#loc87 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":84:17)
+#loc88 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":86:39)
+#loc89 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":87:25)
+#loc90 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:44)
+#loc91 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:53)
+#loc92 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:35)
+#loc93 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:74)
+#loc94 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:64)
+#loc95 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":90:125)
+#loc96 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":97:24)
+#loc97 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:35)
+#loc98 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:81)
+#loc99 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":98:142)
+#loc100 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":100:24)
+#loc101 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":103:39)
+#loc102 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":104:39)
+#loc103 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":111:24)
+#loc104 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":113:24)
+#loc105 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":116:24)
+#loc106 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":118:24)
+#loc107 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":119:24)
+#loc108 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:42)
+#loc109 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:51)
+#loc110 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:60)
+#loc111 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:35)
+#loc112 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:71)
+#loc113 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":121:132)
+#loc114 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":123:24)
+#loc115 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":124:24)
+#loc116 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":125:32)
+#loc117 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":126:24)
+#loc118 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:35)
+#loc119 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:85)
+#loc120 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":127:146)
+#loc121 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":129:24)
+#loc122 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":131:17)
+#loc123 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":133:39)
+#loc124 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:42)
+#loc125 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:51)
+#loc126 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:60)
+#loc127 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:35)
+#loc128 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:71)
+#loc129 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":134:132)
+#loc130 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":139:24)
+#loc131 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:35)
+#loc132 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:81)
+#loc133 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":140:142)
+#loc134 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":142:24)
+#loc135 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":145:39)
+#loc136 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":146:39)
+#loc137 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":151:25)
+#loc138 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":153:26)
+#loc139 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":156:26)
+#loc140 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":158:26)
+#loc141 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":159:26)
+#loc142 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:43)
+#loc143 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:39)
+#loc144 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:32)
+#loc145 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":161:55)
+#loc146 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:32)
+#loc147 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":162:56)
+#loc148 = loc("/app/tensorrt_llm/visual_gen/compiled_cache/flux2_klein_9b_NVIDIA_GeForce_RTX_4090_sm89_torch2.10.0a0_b4e4ee81d3.nv25.12_cuda13_1/torchinductor/bv/cbvqhjtyg7fvxzwtbtt4vrdkbnb6n32fnrijjpl3vv4cfqd4mznr.py":53:4)
+#loc158 = loc("xoffset"(#loc2))
+#loc159 = loc("xoffset"(#loc3))
+#loc160 = loc("xindex"(#loc4))
+#loc161 = loc("xindex"(#loc5))
+#loc162 = loc("xindex"(#loc6))
+#loc163 = loc("r0_base"(#loc7))
+#loc164 = loc("r0_base"(#loc8))
+#loc165 = loc("x0"(#loc9))
+#loc166 = loc("x1"(#loc10))
+#loc167 = loc("_tmp4"(#loc11))
+#loc168 = loc("r0_index"(#loc12))
+#loc169 = loc("r0_mask"(#loc13))
+#loc170 = loc("tmp0"(#loc14))
+#loc171 = loc("tmp0"(#loc15))
+#loc172 = loc("tmp0"(#loc16))
+#loc173 = loc("tmp0"(#loc17))
+#loc174 = loc("tmp0"(#loc18))
+#loc175 = loc("tmp0"(#loc19))
+#loc176 = loc("tmp0"(#loc20))
+#loc177 = loc("tmp0"(#loc21))
+#loc178 = loc("tmp6"(#loc22))
+#loc179 = loc("tmp6"(#loc23))
+#loc180 = loc("tmp6"(#loc24))
+#loc181 = loc("tmp6"(#loc25))
+#loc182 = loc("tmp6"(#loc26))
+#loc183 = loc("tmp2"(#loc27))
+#loc184 = loc("tmp5"(#loc28))
+#loc185 = loc("_tmp4"(#loc29))
+#loc186 = loc("tmp8"(#loc30))
+#loc187 = loc("tmp11"(#loc31))
+#loc188 = loc("_tmp10"(#loc32))
+#loc190 = loc("tmp4"(#loc37))
+#loc192 = loc("tmp10"(#loc39))
+#loc193 = loc("r0_index"(#loc41))
+#loc194 = loc("r0_mask"(#loc42))
+#loc195 = loc("r0_3"(#loc43))
+#loc196 = loc("r0_4"(#loc44))
+#loc197 = loc("tmp50"(#loc45))
+#loc198 = loc("tmp50"(#loc46))
+#loc199 = loc("tmp50"(#loc47))
+#loc200 = loc("tmp50"(#loc48))
+#loc201 = loc("tmp50"(#loc49))
+#loc202 = loc("tmp50"(#loc50))
+#loc203 = loc("tmp50"(#loc51))
+#loc204 = loc("tmp58"(#loc52))
+#loc205 = loc("tmp58"(#loc53))
+#loc206 = loc("tmp58"(#loc54))
+#loc207 = loc("tmp63"(#loc55))
+#loc208 = loc("tmp63"(#loc56))
+#loc209 = loc("tmp63"(#loc57))
+#loc210 = loc("tmp63"(#loc58))
+#loc211 = loc("tmp66"(#loc59))
+#loc212 = loc("tmp66"(#loc60))
+#loc213 = loc("tmp96"(#loc61))
+#loc214 = loc("tmp96"(#loc62))
+#loc215 = loc("tmp96"(#loc63))
+#loc216 = loc("tmp96"(#loc64))
+#loc217 = loc("tmp96"(#loc65))
+#loc218 = loc("tmp96"(#loc66))
+#loc219 = loc("tmp102"(#loc67))
+#loc220 = loc("tmp102"(#loc68))
+#loc221 = loc("tmp102"(#loc69))
+#loc222 = loc("tmp16"(#loc70))
+#loc223 = loc("tmp17"(#loc71))
+#loc224 = loc("tmp17"(#loc72))
+#loc225 = loc("tmp17"(#loc73))
+#loc226 = loc("tmp17"(#loc74))
+#loc227 = loc("tmp17"(#loc75))
+#loc228 = loc("tmp17"(#loc76))
+#loc229 = loc("tmp17"(#loc77))
+#loc230 = loc("tmp17"(#loc78))
+#loc231 = loc("tmp20"(#loc79))
+#loc232 = loc("tmp22"(#loc80))
+#loc233 = loc("tmp23"(#loc81))
+#loc234 = loc("tmp24"(#loc82))
+#loc235 = loc("tmp25"(#loc83))
+#loc236 = loc("tmp25"(#loc84))
+#loc237 = loc("tmp25"(#loc85))
+#loc238 = loc("tmp27"(#loc86))
+#loc239 = loc("tmp29"(#loc87))
+#loc240 = loc("tmp31"(#loc88))
+#loc241 = loc("tmp32"(#loc89))
+#loc242 = loc("tmp35"(#loc90))
+#loc243 = loc("tmp35"(#loc91))
+#loc244 = loc("tmp35"(#loc92))
+#loc245 = loc("tmp35"(#loc93))
+#loc246 = loc("tmp35"(#loc94))
+#loc247 = loc("tmp35"(#loc95))
+#loc248 = loc("tmp42"(#loc96))
+#loc249 = loc("tmp43"(#loc97))
+#loc250 = loc("tmp43"(#loc98))
+#loc251 = loc("tmp43"(#loc99))
+#loc252 = loc("tmp45"(#loc100))
+#loc253 = loc("tmp48"(#loc101))
+#loc254 = loc("tmp49"(#loc102))
+#loc255 = loc("tmp57"(#loc103))
+#loc256 = loc("tmp60"(#loc104))
+#loc257 = loc("tmp64"(#loc105))
+#loc258 = loc("tmp67"(#loc106))
+#loc259 = loc("tmp68"(#loc107))
+#loc260 = loc("tmp70"(#loc108))
+#loc261 = loc("tmp70"(#loc109))
+#loc262 = loc("tmp70"(#loc110))
+#loc263 = loc("tmp70"(#loc111))
+#loc264 = loc("tmp70"(#loc112))
+#loc265 = loc("tmp70"(#loc113))
+#loc266 = loc("tmp72"(#loc114))
+#loc267 = loc("tmp73"(#loc115))
+#loc268 = loc("tmp74"(#loc116))
+#loc269 = loc("tmp75"(#loc117))
+#loc270 = loc("tmp76"(#loc118))
+#loc271 = loc("tmp76"(#loc119))
+#loc272 = loc("tmp76"(#loc120))
+#loc273 = loc("tmp78"(#loc121))
+#loc274 = loc("tmp80"(#loc122))
+#loc275 = loc("tmp82"(#loc123))
+#loc276 = loc("tmp83"(#loc124))
+#loc277 = loc("tmp83"(#loc125))
+#loc278 = loc("tmp83"(#loc126))
+#loc279 = loc("tmp83"(#loc127))
+#loc280 = loc("tmp83"(#loc128))
+#loc281 = loc("tmp83"(#loc129))
+#loc282 = loc("tmp88"(#loc130))
+#loc283 = loc("tmp89"(#loc131))
+#loc284 = loc("tmp89"(#loc132))
+#loc285 = loc("tmp89"(#loc133))
+#loc286 = loc("tmp91"(#loc134))
+#loc287 = loc("tmp94"(#loc135))
+#loc288 = loc("tmp95"(#loc136))
+#loc289 = loc("tmp101"(#loc137))
+#loc290 = loc("tmp104"(#loc138))
+#loc291 = loc("tmp107"(#loc139))
+#loc292 = loc("tmp109"(#loc140))
+#loc293 = loc("tmp110"(#loc141))
+#loc294 = loc("_tmp10"(#loc167))
+#loc295 = loc(callsite(#loc34 at #loc189))
+#loc297 = loc(callsite(#loc34 at #loc191))
+#loc299 = loc(callsite(#loc36 at #loc295))
+#loc300 = loc(callsite(#loc36 at #loc297))